summary refs log tree commit diff stats
path: root/sys/contrib/opensolaris
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/opensolaris')
-rw-r--r-- sys/contrib/opensolaris/OPENSOLARIS.LICENSE 384
-rw-r--r-- sys/contrib/opensolaris/common/atomic/i386/atomic.S 98
-rw-r--r-- sys/contrib/opensolaris/common/atomic/ia64/atomic.S 82
-rw-r--r-- sys/contrib/opensolaris/common/avl/avl.c 969
-rw-r--r-- sys/contrib/opensolaris/common/nvpair/nvpair.c 2953
-rw-r--r-- sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c 118
-rw-r--r-- sys/contrib/opensolaris/common/zfs/zfs_namecheck.c 287
-rw-r--r-- sys/contrib/opensolaris/common/zfs/zfs_namecheck.h 56
-rw-r--r-- sys/contrib/opensolaris/common/zfs/zfs_prop.c 657
-rw-r--r-- sys/contrib/opensolaris/common/zfs/zfs_prop.h 56
-rw-r--r-- sys/contrib/opensolaris/uts/common/Makefile.files 101
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/gfs.c 884
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/arc.c 2859
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c 312
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 2247
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c 1029
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c 160
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c 1037
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c 1009
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c 888
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c 992
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c 655
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c 1369
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c 623
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c 2035
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c 1215
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c 256
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c 501
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c 196
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c 145
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c 69
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c 129
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c 1023
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c 194
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c 131
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/spa.c 3301
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c 375
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c 440
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c 354
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c 1130
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c 501
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h 109
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h 89
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h 334
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h 587
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h 237
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h 125
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h 120
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h 134
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h 75
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h 267
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h 185
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h 143
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h 82
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h 77
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h 77
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h 69
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h 81
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h 103
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h 491
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h 168
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h 162
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h 120
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h 77
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h 50
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h 63
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h 56
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h 132
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h 52
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h 46
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h 298
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h 359
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h 204
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h 234
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h 115
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h 120
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h 71
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h 75
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h 71
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h 163
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h 89
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h 100
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h 298
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h 276
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h 111
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 366
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h 75
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h 82
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h 205
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h 68
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/txg.c 611
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c 63
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/unique.c 107
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c 1915
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c 394
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c 363
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c 225
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c 583
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c 1011
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c 495
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c 89
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c 323
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c 1237
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c 118
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zap.c 1071
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c 741
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c 857
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf 28
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c 1608
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c 99
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c 1119
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c 797
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c 335
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c 1826
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c 349
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c 430
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c 594
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c 1021
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 3623
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c 1072
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zil.c 1607
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zio.c 1861
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c 172
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c 148
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c 315
-rw-r--r-- sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c 801
-rw-r--r-- sys/contrib/opensolaris/uts/common/os/callb.c 363
-rw-r--r-- sys/contrib/opensolaris/uts/common/os/list.c 193
-rw-r--r-- sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c 63
-rw-r--r-- sys/contrib/opensolaris/uts/common/os/taskq.c 1020
-rw-r--r-- sys/contrib/opensolaris/uts/common/rpc/xdr.c 673
-rw-r--r-- sys/contrib/opensolaris/uts/common/rpc/xdr.h 605
-rw-r--r-- sys/contrib/opensolaris/uts/common/rpc/xdr_array.c 123
-rw-r--r-- sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c 209
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/asm_linkage.h 110
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/avl.h 298
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/avl_impl.h 164
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/bitmap.h 194
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/byteorder.h 137
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/callb.h 214
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/ccompile.h 127
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/compress.h 46
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/cred.h 154
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/debug.h 129
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/dkio.h 477
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/dklabel.h 268
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/errorq.h 83
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/feature_tests.h 397
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h 75
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/fm/protocol.h 301
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/fm/util.h 103
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/fs/zfs.h 437
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/gfs.h 139
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/isa_defs.h 485
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/list.h 63
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/list_impl.h 53
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/note.h 56
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/nvpair.h 260
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h 73
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/processor.h 146
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/procset.h 162
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/sdt.h 176
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/synch.h 161
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/sysevent.h 227
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/sysmacros.h 290
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/vmem.h 142
-rw-r--r-- sys/contrib/opensolaris/uts/common/sys/zmod.h 68
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/adler32.c 149
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/crc32.c 428
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/crc32.h 443
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/deflate.c 1742
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/deflate.h 331
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inffast.c 320
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inffast.h 13
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inffixed.h 96
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inflate.c 1395
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inflate.h 117
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inftrees.c 331
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/inftrees.h 57
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/trees.c 1219
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/zconf.h 117
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/zlib.h 1359
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/zmod.c 109
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c 84
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/zutil.c 324
-rw-r--r-- sys/contrib/opensolaris/uts/common/zmod/zutil.h 274
186 files changed, 0 insertions, 85891 deletions
diff --git a/sys/contrib/opensolaris/OPENSOLARIS.LICENSE b/sys/contrib/opensolaris/OPENSOLARIS.LICENSE
deleted file mode 100644
index da23621..0000000
--- a/sys/contrib/opensolaris/OPENSOLARIS.LICENSE
+++ /dev/null
@@ -1,384 +0,0 @@
-Unless otherwise noted, all files in this distribution are released
-under the Common Development and Distribution License (CDDL).
-Exceptions are noted within the associated source files.
-
---------------------------------------------------------------------
-
-
-COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0
-
-1. Definitions.
-
- 1.1. "Contributor" means each individual or entity that creates
- or contributes to the creation of Modifications.
-
- 1.2. "Contributor Version" means the combination of the Original
- Software, prior Modifications used by a Contributor (if any),
- and the Modifications made by that particular Contributor.
-
- 1.3. "Covered Software" means (a) the Original Software, or (b)
- Modifications, or (c) the combination of files containing
- Original Software with files containing Modifications, in
- each case including portions thereof.
-
- 1.4. "Executable" means the Covered Software in any form other
- than Source Code.
-
- 1.5. "Initial Developer" means the individual or entity that first
- makes Original Software available under this License.
-
- 1.6. "Larger Work" means a work which combines Covered Software or
- portions thereof with code not governed by the terms of this
- License.
-
- 1.7. "License" means this document.
-
- 1.8. "Licensable" means having the right to grant, to the maximum
- extent possible, whether at the time of the initial grant or
- subsequently acquired, any and all of the rights conveyed
- herein.
-
- 1.9. "Modifications" means the Source Code and Executable form of
- any of the following:
-
- A. Any file that results from an addition to, deletion from or
- modification of the contents of a file containing Original
- Software or previous Modifications;
-
- B. Any new file that contains any part of the Original
- Software or previous Modifications; or
-
- C. Any new file that is contributed or otherwise made
- available under the terms of this License.
-
- 1.10. "Original Software" means the Source Code and Executable
- form of computer software code that is originally released
- under this License.
-
- 1.11. "Patent Claims" means any patent claim(s), now owned or
- hereafter acquired, including without limitation, method,
- process, and apparatus claims, in any patent Licensable by
- grantor.
-
- 1.12. "Source Code" means (a) the common form of computer software
- code in which modifications are made and (b) associated
- documentation included in or with such code.
-
- 1.13. "You" (or "Your") means an individual or a legal entity
- exercising rights under, and complying with all of the terms
- of, this License. For legal entities, "You" includes any
- entity which controls, is controlled by, or is under common
- control with You. For purposes of this definition,
- "control" means (a) the power, direct or indirect, to cause
- the direction or management of such entity, whether by
- contract or otherwise, or (b) ownership of more than fifty
- percent (50%) of the outstanding shares or beneficial
- ownership of such entity.
-
-2. License Grants.
-
- 2.1. The Initial Developer Grant.
-
- Conditioned upon Your compliance with Section 3.1 below and
- subject to third party intellectual property claims, the Initial
- Developer hereby grants You a world-wide, royalty-free,
- non-exclusive license:
-
- (a) under intellectual property rights (other than patent or
- trademark) Licensable by Initial Developer, to use,
- reproduce, modify, display, perform, sublicense and
- distribute the Original Software (or portions thereof),
- with or without Modifications, and/or as part of a Larger
- Work; and
-
- (b) under Patent Claims infringed by the making, using or
- selling of Original Software, to make, have made, use,
- practice, sell, and offer for sale, and/or otherwise
- dispose of the Original Software (or portions thereof).
-
- (c) The licenses granted in Sections 2.1(a) and (b) are
- effective on the date Initial Developer first distributes
- or otherwise makes the Original Software available to a
- third party under the terms of this License.
-
- (d) Notwithstanding Section 2.1(b) above, no patent license is
- granted: (1) for code that You delete from the Original
- Software, or (2) for infringements caused by: (i) the
- modification of the Original Software, or (ii) the
- combination of the Original Software with other software
- or devices.
-
- 2.2. Contributor Grant.
-
- Conditioned upon Your compliance with Section 3.1 below and
- subject to third party intellectual property claims, each
- Contributor hereby grants You a world-wide, royalty-free,
- non-exclusive license:
-
- (a) under intellectual property rights (other than patent or
- trademark) Licensable by Contributor to use, reproduce,
- modify, display, perform, sublicense and distribute the
- Modifications created by such Contributor (or portions
- thereof), either on an unmodified basis, with other
- Modifications, as Covered Software and/or as part of a
- Larger Work; and
-
- (b) under Patent Claims infringed by the making, using, or
- selling of Modifications made by that Contributor either
- alone and/or in combination with its Contributor Version
- (or portions of such combination), to make, use, sell,
- offer for sale, have made, and/or otherwise dispose of:
- (1) Modifications made by that Contributor (or portions
- thereof); and (2) the combination of Modifications made by
- that Contributor with its Contributor Version (or portions
- of such combination).
-
- (c) The licenses granted in Sections 2.2(a) and 2.2(b) are
- effective on the date Contributor first distributes or
- otherwise makes the Modifications available to a third
- party.
-
- (d) Notwithstanding Section 2.2(b) above, no patent license is
- granted: (1) for any code that Contributor has deleted
- from the Contributor Version; (2) for infringements caused
- by: (i) third party modifications of Contributor Version,
- or (ii) the combination of Modifications made by that
- Contributor with other software (except as part of the
- Contributor Version) or other devices; or (3) under Patent
- Claims infringed by Covered Software in the absence of
- Modifications made by that Contributor.
-
-3. Distribution Obligations.
-
- 3.1. Availability of Source Code.
-
- Any Covered Software that You distribute or otherwise make
- available in Executable form must also be made available in Source
- Code form and that Source Code form must be distributed only under
- the terms of this License. You must include a copy of this
- License with every copy of the Source Code form of the Covered
- Software You distribute or otherwise make available. You must
- inform recipients of any such Covered Software in Executable form
- as to how they can obtain such Covered Software in Source Code
- form in a reasonable manner on or through a medium customarily
- used for software exchange.
-
- 3.2. Modifications.
-
- The Modifications that You create or to which You contribute are
- governed by the terms of this License. You represent that You
- believe Your Modifications are Your original creation(s) and/or
- You have sufficient rights to grant the rights conveyed by this
- License.
-
- 3.3. Required Notices.
-
- You must include a notice in each of Your Modifications that
- identifies You as the Contributor of the Modification. You may
- not remove or alter any copyright, patent or trademark notices
- contained within the Covered Software, or any notices of licensing
- or any descriptive text giving attribution to any Contributor or
- the Initial Developer.
-
- 3.4. Application of Additional Terms.
-
- You may not offer or impose any terms on any Covered Software in
- Source Code form that alters or restricts the applicable version
- of this License or the recipients' rights hereunder. You may
- choose to offer, and to charge a fee for, warranty, support,
- indemnity or liability obligations to one or more recipients of
- Covered Software. However, you may do so only on Your own behalf,
- and not on behalf of the Initial Developer or any Contributor.
- You must make it absolutely clear that any such warranty, support,
- indemnity or liability obligation is offered by You alone, and You
- hereby agree to indemnify the Initial Developer and every
- Contributor for any liability incurred by the Initial Developer or
- such Contributor as a result of warranty, support, indemnity or
- liability terms You offer.
-
- 3.5. Distribution of Executable Versions.
-
- You may distribute the Executable form of the Covered Software
- under the terms of this License or under the terms of a license of
- Your choice, which may contain terms different from this License,
- provided that You are in compliance with the terms of this License
- and that the license for the Executable form does not attempt to
- limit or alter the recipient's rights in the Source Code form from
- the rights set forth in this License. If You distribute the
- Covered Software in Executable form under a different license, You
- must make it absolutely clear that any terms which differ from
- this License are offered by You alone, not by the Initial
- Developer or Contributor. You hereby agree to indemnify the
- Initial Developer and every Contributor for any liability incurred
- by the Initial Developer or such Contributor as a result of any
- such terms You offer.
-
- 3.6. Larger Works.
-
- You may create a Larger Work by combining Covered Software with
- other code not governed by the terms of this License and
- distribute the Larger Work as a single product. In such a case,
- You must make sure the requirements of this License are fulfilled
- for the Covered Software.
-
-4. Versions of the License.
-
- 4.1. New Versions.
-
- Sun Microsystems, Inc. is the initial license steward and may
- publish revised and/or new versions of this License from time to
- time. Each version will be given a distinguishing version number.
- Except as provided in Section 4.3, no one other than the license
- steward has the right to modify this License.
-
- 4.2. Effect of New Versions.
-
- You may always continue to use, distribute or otherwise make the
- Covered Software available under the terms of the version of the
- License under which You originally received the Covered Software.
- If the Initial Developer includes a notice in the Original
- Software prohibiting it from being distributed or otherwise made
- available under any subsequent version of the License, You must
- distribute and make the Covered Software available under the terms
- of the version of the License under which You originally received
- the Covered Software. Otherwise, You may also choose to use,
- distribute or otherwise make the Covered Software available under
- the terms of any subsequent version of the License published by
- the license steward.
-
- 4.3. Modified Versions.
-
- When You are an Initial Developer and You want to create a new
- license for Your Original Software, You may create and use a
- modified version of this License if You: (a) rename the license
- and remove any references to the name of the license steward
- (except to note that the license differs from this License); and
- (b) otherwise make it clear that the license contains terms which
- differ from this License.
-
-5. DISCLAIMER OF WARRANTY.
-
- COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS"
- BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
- INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED
- SOFTWARE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR
- PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND
- PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY
- COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE
- INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY
- NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF
- WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF
- ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS
- DISCLAIMER.
-
-6. TERMINATION.
-
- 6.1. This License and the rights granted hereunder will terminate
- automatically if You fail to comply with terms herein and fail to
- cure such breach within 30 days of becoming aware of the breach.
- Provisions which, by their nature, must remain in effect beyond
- the termination of this License shall survive.
-
- 6.2. If You assert a patent infringement claim (excluding
- declaratory judgment actions) against Initial Developer or a
- Contributor (the Initial Developer or Contributor against whom You
- assert such claim is referred to as "Participant") alleging that
- the Participant Software (meaning the Contributor Version where
- the Participant is a Contributor or the Original Software where
- the Participant is the Initial Developer) directly or indirectly
- infringes any patent, then any and all rights granted directly or
- indirectly to You by such Participant, the Initial Developer (if
- the Initial Developer is not the Participant) and all Contributors
- under Sections 2.1 and/or 2.2 of this License shall, upon 60 days
- notice from Participant terminate prospectively and automatically
- at the expiration of such 60 day notice period, unless if within
- such 60 day period You withdraw Your claim with respect to the
- Participant Software against such Participant either unilaterally
- or pursuant to a written agreement with Participant.
-
- 6.3. In the event of termination under Sections 6.1 or 6.2 above,
- all end user licenses that have been validly granted by You or any
- distributor hereunder prior to termination (excluding licenses
- granted to You by any distributor) shall survive termination.
-
-7. LIMITATION OF LIABILITY.
-
- UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT
- (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE
- INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF
- COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE
- LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR
- CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT
- LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK
- STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER
- COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN
- INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF
- LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL
- INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT
- APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO
- NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR
- CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT
- APPLY TO YOU.
-
-8. U.S. GOVERNMENT END USERS.
-
- The Covered Software is a "commercial item," as that term is
- defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial
- computer software" (as that term is defined at 48
- C.F.R. 252.227-7014(a)(1)) and "commercial computer software
- documentation" as such terms are used in 48 C.F.R. 12.212
- (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48
- C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all
- U.S. Government End Users acquire Covered Software with only those
- rights set forth herein. This U.S. Government Rights clause is in
- lieu of, and supersedes, any other FAR, DFAR, or other clause or
- provision that addresses Government rights in computer software
- under this License.
-
-9. MISCELLANEOUS.
-
- This License represents the complete agreement concerning subject
- matter hereof. If any provision of this License is held to be
- unenforceable, such provision shall be reformed only to the extent
- necessary to make it enforceable. This License shall be governed
- by the law of the jurisdiction specified in a notice contained
- within the Original Software (except to the extent applicable law,
- if any, provides otherwise), excluding such jurisdiction's
- conflict-of-law provisions. Any litigation relating to this
- License shall be subject to the jurisdiction of the courts located
- in the jurisdiction and venue specified in a notice contained
- within the Original Software, with the losing party responsible
- for costs, including, without limitation, court costs and
- reasonable attorneys' fees and expenses. The application of the
- United Nations Convention on Contracts for the International Sale
- of Goods is expressly excluded. Any law or regulation which
- provides that the language of a contract shall be construed
- against the drafter shall not apply to this License. You agree
- that You alone are responsible for compliance with the United
- States export administration regulations (and the export control
- laws and regulation of any other countries) when You use,
- distribute or otherwise make available any Covered Software.
-
-10. RESPONSIBILITY FOR CLAIMS.
-
- As between Initial Developer and the Contributors, each party is
- responsible for claims and damages arising, directly or
- indirectly, out of its utilization of rights under this License
- and You agree to work with Initial Developer and Contributors to
- distribute such responsibility on an equitable basis. Nothing
- herein is intended or shall be deemed to constitute any admission
- of liability.
-
---------------------------------------------------------------------
-
-NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND
-DISTRIBUTION LICENSE (CDDL)
-
-For Covered Software in this distribution, this License shall
-be governed by the laws of the State of California (excluding
-conflict-of-law provisions).
-
-Any litigation relating to this License shall be subject to the
-jurisdiction of the Federal Courts of the Northern District of
-California and the state courts of the State of California, with
-venue lying in Santa Clara County, California.
diff --git a/sys/contrib/opensolaris/common/atomic/i386/atomic.S b/sys/contrib/opensolaris/common/atomic/i386/atomic.S
deleted file mode 100644
index bc7f22a..0000000
--- a/sys/contrib/opensolaris/common/atomic/i386/atomic.S
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
- .ident "%Z%%M% %I% %E% SMI"
-
- .file "%M%"
-
-#define _ASM
-#include <sys/asm_linkage.h>
-
- ENTRY(atomic_add_64)
- ALTENTRY(atomic_add_64_nv)
- pushl %edi
- pushl %ebx
- movl 12(%esp), %edi // %edi = target address
- movl (%edi), %eax
- movl 4(%edi), %edx // %edx:%eax = old value
-1:
- movl 16(%esp), %ebx
- movl 20(%esp), %ecx // %ecx:%ebx = delta
- addl %eax, %ebx
- adcl %edx, %ecx // %ecx:%ebx = new value
- lock
- cmpxchg8b (%edi) // try to stick it in
- jne 1b
- movl %ebx, %eax
- movl %ecx, %edx // return new value
- popl %ebx
- popl %edi
- ret
- SET_SIZE(atomic_add_64_nv)
- SET_SIZE(atomic_add_64)
-
- ENTRY(atomic_or_8_nv)
- movl 4(%esp), %edx // %edx = target address
- movb (%edx), %al // %al = old value
-1:
- movl 8(%esp), %ecx // %ecx = delta
- orb %al, %cl // %cl = new value
- lock
- cmpxchgb %cl, (%edx) // try to stick it in
- jne 1b
- movzbl %cl, %eax // return new value
- ret
- SET_SIZE(atomic_or_8_nv)
-
- ENTRY(atomic_cas_ptr)
- movl 4(%esp), %edx
- movl 8(%esp), %eax
- movl 12(%esp), %ecx
- lock
- cmpxchgl %ecx, (%edx)
- ret
- SET_SIZE(atomic_cas_ptr)
-
- ENTRY(atomic_cas_64)
- pushl %ebx
- pushl %esi
- movl 12(%esp), %esi
- movl 16(%esp), %eax
- movl 20(%esp), %edx
- movl 24(%esp), %ebx
- movl 28(%esp), %ecx
- lock
- cmpxchg8b (%esi)
- popl %esi
- popl %ebx
- ret
- SET_SIZE(atomic_cas_64)
-
- ENTRY(membar_producer)
- lock
- xorl $0, (%esp)
- ret
- SET_SIZE(membar_producer)
diff --git a/sys/contrib/opensolaris/common/atomic/ia64/atomic.S b/sys/contrib/opensolaris/common/atomic/ia64/atomic.S
deleted file mode 100644
index 409d759..0000000
--- a/sys/contrib/opensolaris/common/atomic/ia64/atomic.S
+++ /dev/null
@@ -1,82 +0,0 @@
-/*-
- * Copyright (c) 2007 Marcel Moolenaar
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#include <machine/asm.h>
-
- .text
-
-/*
- * uint64_t atomic_cas_64(volatile uint64_t *p, uint64_t cmp, uint64_t v)
- */
-ENTRY(atomic_cas_64, 3)
- mov ar.ccv = r33
- ;;
- cmpxchg8.acq r8 = [r32], r34, ar.ccv
- ;;
- br.ret.sptk rp
-END(atomic_cas_64)
-
-/*
- * uint64_t atomic_add_64_nv(volatile uint64_t *p, uint64_t v)
- */
-ENTRY(atomic_add_64_nv, 2)
-1:
- ld8 r16 = [r32]
- ;;
- mov ar.ccv = r16
- add r8 = r16, r33
- ;;
- cmpxchg8.acq r17 = [r32], r8, ar.ccv
- ;;
- cmp.eq p6, p7 = r16, r17
-(p6) br.ret.sptk rp
-(p7) br.cond.spnt 1b
-END(atomic_add_64_nv)
-
-/*
- * uint8_t atomic_or_8_nv(volatile uint8_t *p, uint8_t v)
- */
-ENTRY(atomic_or_8_nv, 2)
-1:
- ld8 r16 = [r32]
- ;;
- mov ar.ccv = r16
- or r8 = r16, r33
- ;;
- cmpxchg1.acq r17 = [r32], r8, ar.ccv
- ;;
- cmp.eq p6, p7 = r16, r17
-(p6) br.ret.sptk rp
-(p7) br.cond.spnt 1b
-END(atomic_or_8_nv)
-
-ENTRY(membar_producer, 0)
- mf.a
- ;;
- br.ret.sptk rp
-END(membar_producer)
diff --git a/sys/contrib/opensolaris/common/avl/avl.c b/sys/contrib/opensolaris/common/avl/avl.c
deleted file mode 100644
index 1fa2236..0000000
--- a/sys/contrib/opensolaris/common/avl/avl.c
+++ /dev/null
@@ -1,969 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
-/*
- * AVL - generic AVL tree implementation for kernel use
- *
- * A complete description of AVL trees can be found in many CS textbooks.
- *
- * Here is a very brief overview. An AVL tree is a binary search tree that is
- * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
- * any given node, the left and right subtrees are allowed to differ in height
- * by at most 1 level.
- *
- * This relaxation from a perfectly balanced binary tree allows doing
- * insertion and deletion relatively efficiently. Searching the tree is
- * still a fast operation, roughly O(log(N)).
- *
- * The key to insertion and deletion is a set of tree manipulations called
- * rotations, which bring unbalanced subtrees back into the semi-balanced state.
- *
- * This implementation of AVL trees has the following peculiarities:
- *
- * - The AVL specific data structures are physically embedded as fields
- * in the "using" data structures. To maintain generality the code
- * must constantly translate between "avl_node_t *" and containing
- * data structure "void *"s by adding/subtracting the avl_offset.
- *
- * - Since the AVL data is always embedded in other structures, there is
- * no locking or memory allocation in the AVL routines. This must be
- * provided for by the enclosing data structure's semantics. Typically,
- * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
- * exclusive write lock. Other operations require a read lock.
- *
- * - The implementation uses iteration instead of explicit recursion,
- * since it is intended to run on limited size kernel stacks. Since
- * there is no recursion stack present to move "up" in the tree,
- * there is an explicit "parent" link in the avl_node_t.
- *
- * - The left/right children pointers of a node are in an array.
- * In the code, variables (instead of constants) are used to represent
- * left and right indices. The implementation is written as if it only
- * dealt with left handed manipulations. By changing the value assigned
- * to "left", the code also works for right handed trees. The
- * following variables/terms are frequently used:
- *
- * int left; // 0 when dealing with left children,
- * // 1 for dealing with right children
- *
- * int left_heavy; // -1 when left subtree is taller at some node,
- * // +1 when right subtree is taller
- *
- * int right; // will be the opposite of left (0 or 1)
- * int right_heavy;// will be the opposite of left_heavy (-1 or 1)
- *
- * int direction; // 0 for "<" (ie. left child); 1 for ">" (right)
- *
- * Though it is a little more confusing to read the code, the approach
- * allows using half as much code (and hence cache footprint) for tree
- * manipulations and eliminates many conditional branches.
- *
- * - The avl_index_t is an opaque "cookie" used to find nodes at or
- * adjacent to where a new value would be inserted in the tree. The value
- * is a modified "avl_node_t *". The bottom bit (normally 0 for a
- * pointer) is set to indicate that the new node has a value greater
- * than the value of the indicated "avl_node_t *".
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/stdint.h>
-#include <sys/debug.h>
-#include <sys/avl.h>
-
-/*
- * Small arrays to translate between balance (or diff) values and child indices.
- *
- * Code that deals with binary tree data structures will randomly use
- * left and right children when examining a tree. C "if()" statements
- * which evaluate randomly suffer from very poor hardware branch prediction.
- * In this code we avoid some of the branch mispredictions by using the
- * following translation arrays. They replace random branches with an
- * additional memory reference. Since the translation arrays are both very
- * small the data should remain efficiently in cache.
- */
-static const int avl_child2balance[2] = {-1, 1};
-static const int avl_balance2child[] = {0, 0, 1};
-
-
-/*
- * Walk from one node to the previous valued node (ie. an infix walk
- * towards the left). At any given node we do one of 2 things:
- *
- * - If there is a left child, go to it, then to its rightmost descendant.
- *
- * - otherwise we return thru parent nodes until we've come from a right child.
- *
- * Return Value:
- * NULL - if at the end of the nodes
- * otherwise next node
- */
-void *
-avl_walk(avl_tree_t *tree, void *oldnode, int left)
-{
- size_t off = tree->avl_offset;
- avl_node_t *node = AVL_DATA2NODE(oldnode, off);
- int right = 1 - left;
- int was_child;
-
-
- /*
- * nowhere to walk to if tree is empty
- */
- if (node == NULL)
- return (NULL);
-
- /*
- * Visit the previous valued node. There are two possibilities:
- *
- * If this node has a left child, go down one left, then all
- * the way right.
- */
- if (node->avl_child[left] != NULL) {
- for (node = node->avl_child[left];
- node->avl_child[right] != NULL;
- node = node->avl_child[right])
- ;
- /*
- * Otherwise, return thru left children as far as we can.
- */
- } else {
- for (;;) {
- was_child = AVL_XCHILD(node);
- node = AVL_XPARENT(node);
- if (node == NULL)
- return (NULL);
- if (was_child == right)
- break;
- }
- }
-
- return (AVL_NODE2DATA(node, off));
-}
-
-/*
- * Return the lowest valued node in a tree or NULL.
- * (leftmost child from root of tree)
- */
-void *
-avl_first(avl_tree_t *tree)
-{
- avl_node_t *node;
- avl_node_t *prev = NULL;
- size_t off = tree->avl_offset;
-
- for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
- prev = node;
-
- if (prev != NULL)
- return (AVL_NODE2DATA(prev, off));
- return (NULL);
-}
-
-/*
- * Return the highest valued node in a tree or NULL.
- * (rightmost child from root of tree)
- */
-void *
-avl_last(avl_tree_t *tree)
-{
- avl_node_t *node;
- avl_node_t *prev = NULL;
- size_t off = tree->avl_offset;
-
- for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
- prev = node;
-
- if (prev != NULL)
- return (AVL_NODE2DATA(prev, off));
- return (NULL);
-}
-
-/*
- * Access the node immediately before or after an insertion point.
- *
- * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
- *
- * Return value:
- * NULL: no node in the given direction
- * "void *" of the found tree node
- */
-void *
-avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
-{
- int child = AVL_INDEX2CHILD(where);
- avl_node_t *node = AVL_INDEX2NODE(where);
- void *data;
- size_t off = tree->avl_offset;
-
- if (node == NULL) {
- ASSERT(tree->avl_root == NULL);
- return (NULL);
- }
- data = AVL_NODE2DATA(node, off);
- if (child != direction)
- return (data);
-
- return (avl_walk(tree, data, direction));
-}
-
-
-/*
- * Search for the node which contains "value". The algorithm is a
- * simple binary tree search.
- *
- * return value:
- * NULL: the value is not in the AVL tree
- * *where (if not NULL) is set to indicate the insertion point
- * "void *" of the found tree node
- */
-void *
-avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
-{
- avl_node_t *node;
- avl_node_t *prev = NULL;
- int child = 0;
- int diff;
- size_t off = tree->avl_offset;
-
- for (node = tree->avl_root; node != NULL;
- node = node->avl_child[child]) {
-
- prev = node;
-
- diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
- ASSERT(-1 <= diff && diff <= 1);
- if (diff == 0) {
-#ifdef DEBUG
- if (where != NULL)
- *where = 0;
-#endif
- return (AVL_NODE2DATA(node, off));
- }
- child = avl_balance2child[1 + diff];
-
- }
-
- if (where != NULL)
- *where = AVL_MKINDEX(prev, child);
-
- return (NULL);
-}
-
-
-/*
- * Perform a rotation to restore balance at the subtree rooted at "node".
- *
- * This routine is used by both insertion and deletion. The return value
- * indicates:
- * 0 : subtree did not change height
- * !0 : subtree was reduced in height
- *
- * The code is written as if handling left rotations, right rotations are
- * symmetric and handled by swapping values of variables right/left[_heavy]
- *
- * On input balance is the "new" balance at "node". This value is either
- * -2 or +2.
- */
-static int
-avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
-{
- int left = !(balance < 0); /* when balance = -2, left will be 0 */
- int right = 1 - left;
- int left_heavy = balance >> 1;
- int right_heavy = -left_heavy;
- avl_node_t *parent = AVL_XPARENT(node);
- avl_node_t *child = node->avl_child[left];
- avl_node_t *cright;
- avl_node_t *gchild;
- avl_node_t *gright;
- avl_node_t *gleft;
- int which_child = AVL_XCHILD(node);
- int child_bal = AVL_XBALANCE(child);
-
- /* BEGIN CSTYLED */
- /*
- * case 1 : node is overly left heavy, the left child is balanced or
- * also left heavy. This requires the following rotation.
- *
- * (node bal:-2)
- * / \
- * / \
- * (child bal:0 or -1)
- * / \
- * / \
- * cright
- *
- * becomes:
- *
- * (child bal:1 or 0)
- * / \
- * / \
- * (node bal:-1 or 0)
- * / \
- * / \
- * cright
- *
- * we detect this situation by noting that child's balance is not
- * right_heavy.
- */
- /* END CSTYLED */
- if (child_bal != right_heavy) {
-
- /*
- * compute new balance of nodes
- *
- * If child used to be left heavy (now balanced) we reduced
- * the height of this sub-tree -- used in "return...;" below
- */
- child_bal += right_heavy; /* adjust towards right */
-
- /*
- * move "cright" to be node's left child
- */
- cright = child->avl_child[right];
- node->avl_child[left] = cright;
- if (cright != NULL) {
- AVL_SETPARENT(cright, node);
- AVL_SETCHILD(cright, left);
- }
-
- /*
- * move node to be child's right child
- */
- child->avl_child[right] = node;
- AVL_SETBALANCE(node, -child_bal);
- AVL_SETCHILD(node, right);
- AVL_SETPARENT(node, child);
-
- /*
- * update the pointer into this subtree
- */
- AVL_SETBALANCE(child, child_bal);
- AVL_SETCHILD(child, which_child);
- AVL_SETPARENT(child, parent);
- if (parent != NULL)
- parent->avl_child[which_child] = child;
- else
- tree->avl_root = child;
-
- return (child_bal == 0);
- }
-
- /* BEGIN CSTYLED */
- /*
- * case 2 : When node is left heavy, but child is right heavy we use
- * a different rotation.
- *
- * (node b:-2)
- * / \
- * / \
- * / \
- * (child b:+1)
- * / \
- * / \
- * (gchild b: != 0)
- * / \
- * / \
- * gleft gright
- *
- * becomes:
- *
- * (gchild b:0)
- * / \
- * / \
- * / \
- * (child b:?) (node b:?)
- * / \ / \
- * / \ / \
- * gleft gright
- *
- * computing the new balances is more complicated. As an example:
- * if gchild was right_heavy, then child is now left heavy
- * else it is balanced
- */
- /* END CSTYLED */
- gchild = child->avl_child[right];
- gleft = gchild->avl_child[left];
- gright = gchild->avl_child[right];
-
- /*
- * move gright to left child of node and
- *
- * move gleft to right child of child
- */
- node->avl_child[left] = gright;
- if (gright != NULL) {
- AVL_SETPARENT(gright, node);
- AVL_SETCHILD(gright, left);
- }
-
- child->avl_child[right] = gleft;
- if (gleft != NULL) {
- AVL_SETPARENT(gleft, child);
- AVL_SETCHILD(gleft, right);
- }
-
- /*
- * move child to left child of gchild and
- *
- * move node to right child of gchild and
- *
- * fixup parent of all this to point to gchild
- */
- balance = AVL_XBALANCE(gchild);
- gchild->avl_child[left] = child;
- AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
- AVL_SETPARENT(child, gchild);
- AVL_SETCHILD(child, left);
-
- gchild->avl_child[right] = node;
- AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
- AVL_SETPARENT(node, gchild);
- AVL_SETCHILD(node, right);
-
- AVL_SETBALANCE(gchild, 0);
- AVL_SETPARENT(gchild, parent);
- AVL_SETCHILD(gchild, which_child);
- if (parent != NULL)
- parent->avl_child[which_child] = gchild;
- else
- tree->avl_root = gchild;
-
- return (1); /* the new tree is always shorter */
-}
-
-
-/*
- * Insert a new node into an AVL tree at the specified (from avl_find()) place.
- *
- * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
- * searches out to the leaf positions. The avl_index_t indicates the node
- * which will be the parent of the new node.
- *
- * After the node is inserted, a single rotation further up the tree may
- * be necessary to maintain an acceptable AVL balance.
- */
-void
-avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
-{
- avl_node_t *node;
- avl_node_t *parent = AVL_INDEX2NODE(where);
- int old_balance;
- int new_balance;
- int which_child = AVL_INDEX2CHILD(where);
- size_t off = tree->avl_offset;
-
- ASSERT(tree);
-#ifdef _LP64
- ASSERT(((uintptr_t)new_data & 0x7) == 0);
-#endif
-
- node = AVL_DATA2NODE(new_data, off);
-
- /*
- * First, add the node to the tree at the indicated position.
- */
- ++tree->avl_numnodes;
-
- node->avl_child[0] = NULL;
- node->avl_child[1] = NULL;
-
- AVL_SETCHILD(node, which_child);
- AVL_SETBALANCE(node, 0);
- AVL_SETPARENT(node, parent);
- if (parent != NULL) {
- ASSERT(parent->avl_child[which_child] == NULL);
- parent->avl_child[which_child] = node;
- } else {
- ASSERT(tree->avl_root == NULL);
- tree->avl_root = node;
- }
- /*
- * Now, back up the tree modifying the balance of all nodes above the
- * insertion point. If we get to a highly unbalanced ancestor, we
- * need to do a rotation. If we back out of the tree we are done.
- * If we brought any subtree into perfect balance (0), we are also done.
- */
- for (;;) {
- node = parent;
- if (node == NULL)
- return;
-
- /*
- * Compute the new balance
- */
- old_balance = AVL_XBALANCE(node);
- new_balance = old_balance + avl_child2balance[which_child];
-
- /*
- * If we introduced equal balance, then we are done immediately
- */
- if (new_balance == 0) {
- AVL_SETBALANCE(node, 0);
- return;
- }
-
- /*
- * If both old and new are not zero we went
- * from -1 to -2 (or +1 to +2) balance, do a rotation.
- */
- if (old_balance != 0)
- break;
-
- AVL_SETBALANCE(node, new_balance);
- parent = AVL_XPARENT(node);
- which_child = AVL_XCHILD(node);
- }
-
- /*
- * perform a rotation to fix the tree and return
- */
- (void) avl_rotation(tree, node, new_balance);
-}
-
-/*
- * Insert "new_data" in "tree" in the given "direction" either after or
- * before (AVL_AFTER, AVL_BEFORE) the data "here".
- *
- * Insertions can only be done at empty leaf points in the tree, therefore
- * if the given child of the node is already present we move to either
- * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
- * every other node in the tree is a leaf, this always works.
- *
- * To help developers using this interface, we assert that the new node
- * is correctly ordered at every step of the way in DEBUG kernels.
- */
-void
-avl_insert_here(
- avl_tree_t *tree,
- void *new_data,
- void *here,
- int direction)
-{
- avl_node_t *node;
- int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
-#ifdef DEBUG
- int diff;
-#endif
-
- ASSERT(tree != NULL);
- ASSERT(new_data != NULL);
- ASSERT(here != NULL);
- ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
-
- /*
- * If corresponding child of node is not NULL, go to the neighboring
- * node and reverse the insertion direction.
- */
- node = AVL_DATA2NODE(here, tree->avl_offset);
-
-#ifdef DEBUG
- diff = tree->avl_compar(new_data, here);
- ASSERT(-1 <= diff && diff <= 1);
- ASSERT(diff != 0);
- ASSERT(diff > 0 ? child == 1 : child == 0);
-#endif
-
- if (node->avl_child[child] != NULL) {
- node = node->avl_child[child];
- child = 1 - child;
- while (node->avl_child[child] != NULL) {
-#ifdef DEBUG
- diff = tree->avl_compar(new_data,
- AVL_NODE2DATA(node, tree->avl_offset));
- ASSERT(-1 <= diff && diff <= 1);
- ASSERT(diff != 0);
- ASSERT(diff > 0 ? child == 1 : child == 0);
-#endif
- node = node->avl_child[child];
- }
-#ifdef DEBUG
- diff = tree->avl_compar(new_data,
- AVL_NODE2DATA(node, tree->avl_offset));
- ASSERT(-1 <= diff && diff <= 1);
- ASSERT(diff != 0);
- ASSERT(diff > 0 ? child == 1 : child == 0);
-#endif
- }
- ASSERT(node->avl_child[child] == NULL);
-
- avl_insert(tree, new_data, AVL_MKINDEX(node, child));
-}
-
-/*
- * Add a new node to an AVL tree.
- */
-void
-avl_add(avl_tree_t *tree, void *new_node)
-{
- avl_index_t where;
-
- /*
- * This is unfortunate. We want to call panic() here, even for
- * non-DEBUG kernels. In userland, however, we can't depend on anything
- * in libc or else the rtld build process gets confused. So, all we can
- * do in userland is resort to a normal ASSERT().
- */
- if (avl_find(tree, new_node, &where) != NULL)
-#ifdef _KERNEL
- panic("avl_find() succeeded inside avl_add()");
-#else
- ASSERT(0);
-#endif
- avl_insert(tree, new_node, where);
-}
-
-/*
- * Delete a node from the AVL tree. Deletion is similar to insertion, but
- * with 2 complications.
- *
- * First, we may be deleting an interior node. Consider the following subtree:
- *
- * d c c
- * / \ / \ / \
- * b e b e b e
- * / \ / \ /
- * a c a a
- *
- * When we are deleting node (d), we find and bring up an adjacent valued leaf
- * node, say (c), to take the interior node's place. In the code this is
- * handled by temporarily swapping (d) and (c) in the tree and then using
- * common code to delete (d) from the leaf position.
- *
- * Secondly, an interior deletion from a deep tree may require more than one
- * rotation to fix the balance. This is handled by moving up the tree through
- * parents and applying rotations as needed. The return value from
- * avl_rotation() is used to detect when a subtree did not change overall
- * height due to a rotation.
- */
-void
-avl_remove(avl_tree_t *tree, void *data)
-{
- avl_node_t *delete;
- avl_node_t *parent;
- avl_node_t *node;
- avl_node_t tmp;
- int old_balance;
- int new_balance;
- int left;
- int right;
- int which_child;
- size_t off = tree->avl_offset;
-
- ASSERT(tree);
-
- delete = AVL_DATA2NODE(data, off);
-
- /*
- * Deletion is easiest with a node that has at most 1 child.
- * We swap a node with 2 children with a sequentially valued
- * neighbor node. That node will have at most 1 child. Note this
- * has no effect on the ordering of the remaining nodes.
- *
- * As an optimization, we choose the greater neighbor if the tree
- * is right heavy, otherwise the left neighbor. This reduces the
- * number of rotations needed.
- */
- if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
-
- /*
- * choose node to swap from whichever side is taller
- */
- old_balance = AVL_XBALANCE(delete);
- left = avl_balance2child[old_balance + 1];
- right = 1 - left;
-
- /*
- * get to the previous valued node
- * (down 1 left, as far as possible right)
- */
- for (node = delete->avl_child[left];
- node->avl_child[right] != NULL;
- node = node->avl_child[right])
- ;
-
- /*
- * create a temp placeholder for 'node'
- * move 'node' to delete's spot in the tree
- */
- tmp = *node;
-
- *node = *delete;
- if (node->avl_child[left] == node)
- node->avl_child[left] = &tmp;
-
- parent = AVL_XPARENT(node);
- if (parent != NULL)
- parent->avl_child[AVL_XCHILD(node)] = node;
- else
- tree->avl_root = node;
- AVL_SETPARENT(node->avl_child[left], node);
- AVL_SETPARENT(node->avl_child[right], node);
-
- /*
- * Put tmp where node used to be (just temporary).
- * It always has a parent and at most 1 child.
- */
- delete = &tmp;
- parent = AVL_XPARENT(delete);
- parent->avl_child[AVL_XCHILD(delete)] = delete;
- which_child = (delete->avl_child[1] != 0);
- if (delete->avl_child[which_child] != NULL)
- AVL_SETPARENT(delete->avl_child[which_child], delete);
- }
-
-
- /*
- * Here we know "delete" is at least partially a leaf node. It can
- * be easily removed from the tree.
- */
- ASSERT(tree->avl_numnodes > 0);
- --tree->avl_numnodes;
- parent = AVL_XPARENT(delete);
- which_child = AVL_XCHILD(delete);
- if (delete->avl_child[0] != NULL)
- node = delete->avl_child[0];
- else
- node = delete->avl_child[1];
-
- /*
- * Connect parent directly to node (leaving out delete).
- */
- if (node != NULL) {
- AVL_SETPARENT(node, parent);
- AVL_SETCHILD(node, which_child);
- }
- if (parent == NULL) {
- tree->avl_root = node;
- return;
- }
- parent->avl_child[which_child] = node;
-
-
- /*
- * Since the subtree is now shorter, begin adjusting parent balances
- * and performing any needed rotations.
- */
- do {
-
- /*
- * Move up the tree and adjust the balance
- *
- * Capture the parent and which_child values for the next
- * iteration before any rotations occur.
- */
- node = parent;
- old_balance = AVL_XBALANCE(node);
- new_balance = old_balance - avl_child2balance[which_child];
- parent = AVL_XPARENT(node);
- which_child = AVL_XCHILD(node);
-
- /*
- * If a node was in perfect balance but isn't anymore then
- * we can stop, since the height didn't change above this point
- * due to a deletion.
- */
- if (old_balance == 0) {
- AVL_SETBALANCE(node, new_balance);
- break;
- }
-
- /*
- * If the new balance is zero, we don't need to rotate
- * else
- * need a rotation to fix the balance.
- * If the rotation doesn't change the height
- * of the sub-tree we have finished adjusting.
- */
- if (new_balance == 0)
- AVL_SETBALANCE(node, new_balance);
- else if (!avl_rotation(tree, node, new_balance))
- break;
- } while (parent != NULL);
-}
-
-/*
- * initialize a new AVL tree
- */
-void
-avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
- size_t size, size_t offset)
-{
- ASSERT(tree);
- ASSERT(compar);
- ASSERT(size > 0);
- ASSERT(size >= offset + sizeof (avl_node_t));
-#ifdef _LP64
- ASSERT((offset & 0x7) == 0);
-#endif
-
- tree->avl_compar = compar;
- tree->avl_root = NULL;
- tree->avl_numnodes = 0;
- tree->avl_size = size;
- tree->avl_offset = offset;
-}
-
-/*
- * Delete a tree.
- */
-/* ARGSUSED */
-void
-avl_destroy(avl_tree_t *tree)
-{
- ASSERT(tree);
- ASSERT(tree->avl_numnodes == 0);
- ASSERT(tree->avl_root == NULL);
-}
-
-
-/*
- * Return the number of nodes in an AVL tree.
- */
-ulong_t
-avl_numnodes(avl_tree_t *tree)
-{
- ASSERT(tree);
- return (tree->avl_numnodes);
-}
-
-
-#define CHILDBIT (1L)
-
-/*
- * Post-order tree walk used to visit all tree nodes and destroy the tree
- * in post order. This is used for destroying a tree w/o paying any cost
- * for rebalancing it.
- *
- * example:
- *
- * void *cookie = NULL;
- * my_data_t *node;
- *
- * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
- * free(node);
- * avl_destroy(tree);
- *
- * The cookie is really an avl_node_t to the current node's parent and
- * an indication of which child you looked at last.
- *
- * On input, a cookie value of CHILDBIT indicates the tree is done.
- */
-void *
-avl_destroy_nodes(avl_tree_t *tree, void **cookie)
-{
- avl_node_t *node;
- avl_node_t *parent;
- int child;
- void *first;
- size_t off = tree->avl_offset;
-
- /*
- * Initial calls go to the first node or its right descendant.
- */
- if (*cookie == NULL) {
- first = avl_first(tree);
-
- /*
- * deal with an empty tree
- */
- if (first == NULL) {
- *cookie = (void *)CHILDBIT;
- return (NULL);
- }
-
- node = AVL_DATA2NODE(first, off);
- parent = AVL_XPARENT(node);
- goto check_right_side;
- }
-
- /*
- * If there is no parent to return to we are done.
- */
- parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
- if (parent == NULL) {
- if (tree->avl_root != NULL) {
- ASSERT(tree->avl_numnodes == 1);
- tree->avl_root = NULL;
- tree->avl_numnodes = 0;
- }
- return (NULL);
- }
-
- /*
- * Remove the child pointer we just visited from the parent and tree.
- */
- child = (uintptr_t)(*cookie) & CHILDBIT;
- parent->avl_child[child] = NULL;
- ASSERT(tree->avl_numnodes > 1);
- --tree->avl_numnodes;
-
- /*
- * If we just did a right child or there isn't one, go up to parent.
- */
- if (child == 1 || parent->avl_child[1] == NULL) {
- node = parent;
- parent = AVL_XPARENT(parent);
- goto done;
- }
-
- /*
- * Do parent's right child, then leftmost descendent.
- */
- node = parent->avl_child[1];
- while (node->avl_child[0] != NULL) {
- parent = node;
- node = node->avl_child[0];
- }
-
- /*
- * If here, we moved to a left child. It may have one
- * child on the right (when balance == +1).
- */
-check_right_side:
- if (node->avl_child[1] != NULL) {
- ASSERT(AVL_XBALANCE(node) == 1);
- parent = node;
- node = node->avl_child[1];
- ASSERT(node->avl_child[0] == NULL &&
- node->avl_child[1] == NULL);
- } else {
- ASSERT(AVL_XBALANCE(node) <= 0);
- }
-
-done:
- if (parent == NULL) {
- *cookie = (void *)CHILDBIT;
- ASSERT(node == tree->avl_root);
- } else {
- *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
- }
-
- return (AVL_NODE2DATA(node, off));
-}
diff --git a/sys/contrib/opensolaris/common/nvpair/nvpair.c b/sys/contrib/opensolaris/common/nvpair/nvpair.c
deleted file mode 100644
index d3d5bed..0000000
--- a/sys/contrib/opensolaris/common/nvpair/nvpair.c
+++ /dev/null
@@ -1,2953 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/debug.h>
-#include <sys/nvpair.h>
-#include <sys/nvpair_impl.h>
-#include <rpc/types.h>
-#include <rpc/xdr.h>
-
-#if defined(_KERNEL) && !defined(_BOOT)
-#include <sys/varargs.h>
-#else
-#include <stdarg.h>
-#include <strings.h>
-#endif
-
-#ifndef offsetof
-#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
-#endif
-
-
-/*
- * nvpair.c - Provides kernel & userland interfaces for manipulating
- * name-value pairs.
- *
- * Overview Diagram
- *
- * +--------------+
- * | nvlist_t |
- * |--------------|
- * | nvl_version |
- * | nvl_nvflag |
- * | nvl_priv -+-+
- * | nvl_flag | |
- * | nvl_pad | |
- * +--------------+ |
- * V
- * +--------------+ last i_nvp in list
- * | nvpriv_t | +--------------------->
- * |--------------| |
- * +--+- nvp_list | | +------------+
- * | | nvp_last -+--+ + nv_alloc_t |
- * | | nvp_curr | |------------|
- * | | nvp_nva -+----> | nva_ops |
- * | | nvp_stat | | nva_arg |
- * | +--------------+ +------------+
- * |
- * +-------+
- * V
- * +---------------------+ +-------------------+
- * | i_nvp_t | +-->| i_nvp_t | +-->
- * |---------------------| | |-------------------| |
- * | nvi_next -+--+ | nvi_next -+--+
- * | nvi_prev (NULL) | <----+ nvi_prev |
- * | . . . . . . . . . . | | . . . . . . . . . |
- * | nvp (nvpair_t) | | nvp (nvpair_t) |
- * | - nvp_size | | - nvp_size |
- * | - nvp_name_sz | | - nvp_name_sz |
- * | - nvp_value_elem | | - nvp_value_elem |
- * | - nvp_type | | - nvp_type |
- * | - data ... | | - data ... |
- * +---------------------+ +-------------------+
- *
- *
- *
- * +---------------------+ +---------------------+
- * | i_nvp_t | +--> +-->| i_nvp_t (last) |
- * |---------------------| | | |---------------------|
- * | nvi_next -+--+ ... --+ | nvi_next (NULL) |
- * <-+- nvi_prev |<-- ... <----+ nvi_prev |
- * | . . . . . . . . . | | . . . . . . . . . |
- * | nvp (nvpair_t) | | nvp (nvpair_t) |
- * | - nvp_size | | - nvp_size |
- * | - nvp_name_sz | | - nvp_name_sz |
- * | - nvp_value_elem | | - nvp_value_elem |
- * | - DATA_TYPE_NVLIST | | - nvp_type |
- * | - data (embedded) | | - data ... |
- * | nvlist name | +---------------------+
- * | +--------------+ |
- * | | nvlist_t | |
- * | |--------------| |
- * | | nvl_version | |
- * | | nvl_nvflag | |
- * | | nvl_priv --+---+---->
- * | | nvl_flag | |
- * | | nvl_pad | |
- * | +--------------+ |
- * +---------------------+
- *
- *
- * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
- * allow value to be aligned on 8 byte boundary
- *
- * name_len is the length of the name string including the null terminator
- * so it must be >= 1
- */
-#define NVP_SIZE_CALC(name_len, data_len) \
- (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
-
-static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
-static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
- uint_t nelem, const void *data);
-
-#define NV_STAT_EMBEDDED 0x1
-#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp))
-#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp))
-
-#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
-#define NVPAIR2I_NVP(nvp) \
- ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
-
-
-int
-nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
-{
- va_list valist;
- int err = 0;
-
- nva->nva_ops = nvo;
- nva->nva_arg = NULL;
-
- va_start(valist, nvo);
- if (nva->nva_ops->nv_ao_init != NULL)
- err = nva->nva_ops->nv_ao_init(nva, valist);
- va_end(valist);
-
- return (err);
-}
-
-void
-nv_alloc_reset(nv_alloc_t *nva)
-{
- if (nva->nva_ops->nv_ao_reset != NULL)
- nva->nva_ops->nv_ao_reset(nva);
-}
-
-void
-nv_alloc_fini(nv_alloc_t *nva)
-{
- if (nva->nva_ops->nv_ao_fini != NULL)
- nva->nva_ops->nv_ao_fini(nva);
-}
-
-nv_alloc_t *
-nvlist_lookup_nv_alloc(nvlist_t *nvl)
-{
- nvpriv_t *priv;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (NULL);
-
- return (priv->nvp_nva);
-}
-
-static void *
-nv_mem_zalloc(nvpriv_t *nvp, size_t size)
-{
- nv_alloc_t *nva = nvp->nvp_nva;
- void *buf;
-
- if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
- bzero(buf, size);
-
- return (buf);
-}
-
-static void
-nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
-{
- nv_alloc_t *nva = nvp->nvp_nva;
-
- nva->nva_ops->nv_ao_free(nva, buf, size);
-}
-
-static void
-nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
-{
- bzero(priv, sizeof (priv));
-
- priv->nvp_nva = nva;
- priv->nvp_stat = stat;
-}
-
-static nvpriv_t *
-nv_priv_alloc(nv_alloc_t *nva)
-{
- nvpriv_t *priv;
-
- /*
- * nv_mem_alloc() cannot called here because it needs the priv
- * argument.
- */
- if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
- return (NULL);
-
- nv_priv_init(priv, nva, 0);
-
- return (priv);
-}
-
-/*
- * Embedded lists need their own nvpriv_t's. We create a new
- * nvpriv_t using the parameters and allocator from the parent
- * list's nvpriv_t.
- */
-static nvpriv_t *
-nv_priv_alloc_embedded(nvpriv_t *priv)
-{
- nvpriv_t *emb_priv;
-
- if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
- return (NULL);
-
- nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);
-
- return (emb_priv);
-}
-
-static void
-nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
-{
- nvl->nvl_version = NV_VERSION;
- nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
- nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
- nvl->nvl_flag = 0;
- nvl->nvl_pad = 0;
-}
-
-/*
- * nvlist_alloc - Allocate nvlist.
- */
-/*ARGSUSED1*/
-int
-nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xalloc(nvlp, nvflag,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
-{
- nvpriv_t *priv;
-
- if (nvlp == NULL || nva == NULL)
- return (EINVAL);
-
- if ((priv = nv_priv_alloc(nva)) == NULL)
- return (ENOMEM);
-
- if ((*nvlp = nv_mem_zalloc(priv,
- NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
- nv_mem_free(priv, priv, sizeof (nvpriv_t));
- return (ENOMEM);
- }
-
- nvlist_init(*nvlp, nvflag, priv);
-
- return (0);
-}
-
-/*
- * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
- */
-static nvpair_t *
-nvp_buf_alloc(nvlist_t *nvl, size_t len)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *buf;
- nvpair_t *nvp;
- size_t nvsize;
-
- /*
- * Allocate the buffer
- */
- nvsize = len + offsetof(i_nvp_t, nvi_nvp);
-
- if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
- return (NULL);
-
- nvp = &buf->nvi_nvp;
- nvp->nvp_size = len;
-
- return (nvp);
-}
-
-/*
- * nvp_buf_free - de-Allocate an i_nvp_t.
- */
-static void
-nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
-
- nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
-}
-
-/*
- * nvp_buf_link - link a new nv pair into the nvlist.
- */
-static void
-nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr = NVPAIR2I_NVP(nvp);
-
- /* Put element at end of nvlist */
- if (priv->nvp_list == NULL) {
- priv->nvp_list = priv->nvp_last = curr;
- } else {
- curr->nvi_prev = priv->nvp_last;
- priv->nvp_last->nvi_next = curr;
- priv->nvp_last = curr;
- }
-}
-
-/*
- * nvp_buf_unlink - unlink an removed nvpair out of the nvlist.
- */
-static void
-nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr = NVPAIR2I_NVP(nvp);
-
- /*
- * protect nvlist_next_nvpair() against walking on freed memory.
- */
- if (priv->nvp_curr == curr)
- priv->nvp_curr = curr->nvi_next;
-
- if (curr == priv->nvp_list)
- priv->nvp_list = curr->nvi_next;
- else
- curr->nvi_prev->nvi_next = curr->nvi_next;
-
- if (curr == priv->nvp_last)
- priv->nvp_last = curr->nvi_prev;
- else
- curr->nvi_next->nvi_prev = curr->nvi_prev;
-}
-
-/*
- * take a nvpair type and number of elements and make sure the are valid
- */
-static int
-i_validate_type_nelem(data_type_t type, uint_t nelem)
-{
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- if (nelem != 0)
- return (EINVAL);
- break;
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_STRING:
- case DATA_TYPE_HRTIME:
- case DATA_TYPE_NVLIST:
- if (nelem != 1)
- return (EINVAL);
- break;
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- case DATA_TYPE_NVLIST_ARRAY:
- /* we allow arrays with 0 elements */
- break;
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-/*
- * Verify nvp_name_sz and check the name string length.
- */
-static int
-i_validate_nvpair_name(nvpair_t *nvp)
-{
- if ((nvp->nvp_name_sz <= 0) ||
- (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
- return (EFAULT);
-
- /* verify the name string, make sure its terminated */
- if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
- return (EFAULT);
-
- return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
-}
-
-static int
-i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
-{
- switch (type) {
- case DATA_TYPE_BOOLEAN_VALUE:
- if (*(boolean_t *)data != B_TRUE &&
- *(boolean_t *)data != B_FALSE)
- return (EINVAL);
- break;
- case DATA_TYPE_BOOLEAN_ARRAY: {
- int i;
-
- for (i = 0; i < nelem; i++)
- if (((boolean_t *)data)[i] != B_TRUE &&
- ((boolean_t *)data)[i] != B_FALSE)
- return (EINVAL);
- break;
- }
- default:
- break;
- }
-
- return (0);
-}
-
-/*
- * This function takes a pointer to what should be a nvpair and it's size
- * and then verifies that all the nvpair fields make sense and can be
- * trusted. This function is used when decoding packed nvpairs.
- */
-static int
-i_validate_nvpair(nvpair_t *nvp)
-{
- data_type_t type = NVP_TYPE(nvp);
- int size1, size2;
-
- /* verify nvp_name_sz, check the name string length */
- if (i_validate_nvpair_name(nvp) != 0)
- return (EFAULT);
-
- if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
- return (EFAULT);
-
- /*
- * verify nvp_type, nvp_value_elem, and also possibly
- * verify string values and get the value size.
- */
- size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
- size1 = nvp->nvp_size - NVP_VALOFF(nvp);
- if (size2 < 0 || size1 != NV_ALIGN(size2))
- return (EFAULT);
-
- return (0);
-}
-
-static int
-nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
- return (EINVAL);
-
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- nvpair_t *nvp = &curr->nvi_nvp;
- int err;
-
- if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
- NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
- return (err);
- }
-
- return (0);
-}
-
-/*
- * Frees all memory allocated for an nvpair (like embedded lists) with
- * the exception of the nvpair buffer itself.
- */
-static void
-nvpair_free(nvpair_t *nvp)
-{
- switch (NVP_TYPE(nvp)) {
- case DATA_TYPE_NVLIST:
- nvlist_free(EMBEDDED_NVL(nvp));
- break;
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
- int i;
-
- for (i = 0; i < NVP_NELEM(nvp); i++)
- if (nvlp[i] != NULL)
- nvlist_free(nvlp[i]);
- break;
- }
- default:
- break;
- }
-}
-
-/*
- * nvlist_free - free an unpacked nvlist
- */
-void
-nvlist_free(nvlist_t *nvl)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return;
-
- /*
- * Unpacked nvlist are linked through i_nvp_t
- */
- curr = priv->nvp_list;
- while (curr != NULL) {
- nvpair_t *nvp = &curr->nvi_nvp;
- curr = curr->nvi_next;
-
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- }
-
- if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
- nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
- else
- nvl->nvl_priv = 0;
-
- nv_mem_free(priv, priv, sizeof (nvpriv_t));
-}
-
-static int
-nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr;
-
- if (nvp == NULL)
- return (0);
-
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
- if (&curr->nvi_nvp == nvp)
- return (1);
-
- return (0);
-}
-
-/*
- * Make a copy of nvlist
- */
-/*ARGSUSED1*/
-int
-nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xdup(nvl, nvlp,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
-{
- int err;
- nvlist_t *ret;
-
- if (nvl == NULL || nvlp == NULL)
- return (EINVAL);
-
- if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
- return (err);
-
- if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
- nvlist_free(ret);
- else
- *nvlp = ret;
-
- return (err);
-}
-
-/*
- * Remove all with matching name
- */
-int
-nvlist_remove_all(nvlist_t *nvl, const char *name)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
- int error = ENOENT;
-
- if (nvl == NULL || name == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (EINVAL);
-
- curr = priv->nvp_list;
- while (curr != NULL) {
- nvpair_t *nvp = &curr->nvi_nvp;
-
- curr = curr->nvi_next;
- if (strcmp(name, NVP_NAME(nvp)) != 0)
- continue;
-
- nvp_buf_unlink(nvl, nvp);
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
-
- error = 0;
- }
-
- return (error);
-}
-
-/*
- * Remove first one with matching name and type
- */
-int
-nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL || name == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (EINVAL);
-
- curr = priv->nvp_list;
- while (curr != NULL) {
- nvpair_t *nvp = &curr->nvi_nvp;
-
- if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) {
- nvp_buf_unlink(nvl, nvp);
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
-
- return (0);
- }
- curr = curr->nvi_next;
- }
-
- return (ENOENT);
-}
-
-/*
- * This function calculates the size of an nvpair value.
- *
- * The data argument controls the behavior in case of the data types
- * DATA_TYPE_STRING and
- * DATA_TYPE_STRING_ARRAY
- * Is data == NULL then the size of the string(s) is excluded.
- */
-static int
-i_get_value_size(data_type_t type, const void *data, uint_t nelem)
-{
- uint64_t value_sz;
-
- if (i_validate_type_nelem(type, nelem) != 0)
- return (-1);
-
- /* Calculate required size for holding value */
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- value_sz = 0;
- break;
- case DATA_TYPE_BOOLEAN_VALUE:
- value_sz = sizeof (boolean_t);
- break;
- case DATA_TYPE_BYTE:
- value_sz = sizeof (uchar_t);
- break;
- case DATA_TYPE_INT8:
- value_sz = sizeof (int8_t);
- break;
- case DATA_TYPE_UINT8:
- value_sz = sizeof (uint8_t);
- break;
- case DATA_TYPE_INT16:
- value_sz = sizeof (int16_t);
- break;
- case DATA_TYPE_UINT16:
- value_sz = sizeof (uint16_t);
- break;
- case DATA_TYPE_INT32:
- value_sz = sizeof (int32_t);
- break;
- case DATA_TYPE_UINT32:
- value_sz = sizeof (uint32_t);
- break;
- case DATA_TYPE_INT64:
- value_sz = sizeof (int64_t);
- break;
- case DATA_TYPE_UINT64:
- value_sz = sizeof (uint64_t);
- break;
- case DATA_TYPE_STRING:
- if (data == NULL)
- value_sz = 0;
- else
- value_sz = strlen(data) + 1;
- break;
- case DATA_TYPE_BOOLEAN_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (boolean_t);
- break;
- case DATA_TYPE_BYTE_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uchar_t);
- break;
- case DATA_TYPE_INT8_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int8_t);
- break;
- case DATA_TYPE_UINT8_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint8_t);
- break;
- case DATA_TYPE_INT16_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int16_t);
- break;
- case DATA_TYPE_UINT16_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint16_t);
- break;
- case DATA_TYPE_INT32_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int32_t);
- break;
- case DATA_TYPE_UINT32_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint32_t);
- break;
- case DATA_TYPE_INT64_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (int64_t);
- break;
- case DATA_TYPE_UINT64_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint64_t);
- break;
- case DATA_TYPE_STRING_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint64_t);
-
- if (data != NULL) {
- char *const *strs = data;
- uint_t i;
-
- /* no alignment requirement for strings */
- for (i = 0; i < nelem; i++) {
- if (strs[i] == NULL)
- return (-1);
- value_sz += strlen(strs[i]) + 1;
- }
- }
- break;
- case DATA_TYPE_HRTIME:
- value_sz = sizeof (hrtime_t);
- break;
- case DATA_TYPE_NVLIST:
- value_sz = NV_ALIGN(sizeof (nvlist_t));
- break;
- case DATA_TYPE_NVLIST_ARRAY:
- value_sz = (uint64_t)nelem * sizeof (uint64_t) +
- (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
- break;
- default:
- return (-1);
- }
-
- return (value_sz > INT32_MAX ? -1 : (int)value_sz);
-}
-
-static int
-nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
-{
- nvpriv_t *priv;
- int err;
-
- if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
- nvl->nvl_priv)) == NULL)
- return (ENOMEM);
-
- nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);
-
- if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
- nvlist_free(emb_nvl);
- emb_nvl->nvl_priv = 0;
- }
-
- return (err);
-}
-
-/*
- * nvlist_add_common - Add new <name,value> pair to nvlist
- */
-static int
-nvlist_add_common(nvlist_t *nvl, const char *name,
- data_type_t type, uint_t nelem, const void *data)
-{
- nvpair_t *nvp;
- uint_t i;
-
- int nvp_sz, name_sz, value_sz;
- int err = 0;
-
- if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
- return (EINVAL);
-
- if (nelem != 0 && data == NULL)
- return (EINVAL);
-
- /*
- * Verify type and nelem and get the value size.
- * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
- * is the size of the string(s) included.
- */
- if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
- return (EINVAL);
-
- if (i_validate_nvpair_value(type, nelem, data) != 0)
- return (EINVAL);
-
- /*
- * If we're adding an nvlist or nvlist array, ensure that we are not
- * adding the input nvlist to itself, which would cause recursion,
- * and ensure that no NULL nvlist pointers are present.
- */
- switch (type) {
- case DATA_TYPE_NVLIST:
- if (data == nvl || data == NULL)
- return (EINVAL);
- break;
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **onvlp = (nvlist_t **)data;
- for (i = 0; i < nelem; i++) {
- if (onvlp[i] == nvl || onvlp[i] == NULL)
- return (EINVAL);
- }
- break;
- }
- default:
- break;
- }
-
- /* calculate sizes of the nvpair elements and the nvpair itself */
- name_sz = strlen(name) + 1;
-
- nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
-
- if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
- return (ENOMEM);
-
- ASSERT(nvp->nvp_size == nvp_sz);
- nvp->nvp_name_sz = name_sz;
- nvp->nvp_value_elem = nelem;
- nvp->nvp_type = type;
- bcopy(name, NVP_NAME(nvp), name_sz);
-
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- break;
- case DATA_TYPE_STRING_ARRAY: {
- char *const *strs = data;
- char *buf = NVP_VALUE(nvp);
- char **cstrs = (void *)buf;
-
- /* skip pre-allocated space for pointer array */
- buf += nelem * sizeof (uint64_t);
- for (i = 0; i < nelem; i++) {
- int slen = strlen(strs[i]) + 1;
- bcopy(strs[i], buf, slen);
- cstrs[i] = buf;
- buf += slen;
- }
- break;
- }
- case DATA_TYPE_NVLIST: {
- nvlist_t *nnvl = EMBEDDED_NVL(nvp);
- nvlist_t *onvl = (nvlist_t *)data;
-
- if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
- nvp_buf_free(nvl, nvp);
- return (err);
- }
- break;
- }
- case DATA_TYPE_NVLIST_ARRAY: {
- nvlist_t **onvlp = (nvlist_t **)data;
- nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
- nvlist_t *embedded = (nvlist_t *)
- ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
-
- for (i = 0; i < nelem; i++) {
- if ((err = nvlist_copy_embedded(nvl,
- onvlp[i], embedded)) != 0) {
- /*
- * Free any successfully created lists
- */
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (err);
- }
-
- nvlp[i] = embedded++;
- }
- break;
- }
- default:
- bcopy(data, NVP_VALUE(nvp), value_sz);
- }
-
- /* if unique name, remove before add */
- if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
- (void) nvlist_remove_all(nvl, name);
- else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
- (void) nvlist_remove(nvl, name, type);
-
- nvp_buf_link(nvl, nvp);
-
- return (0);
-}
-
-int
-nvlist_add_boolean(nvlist_t *nvl, const char *name)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
-}
-
-int
-nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
-}
-
-int
-nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
-}
-
-int
-nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
-}
-
-int
-nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
-}
-
-int
-nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
-}
-
-int
-nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
-}
-
-int
-nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
-}
-
-int
-nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
-}
-
-int
-nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
-}
-
-int
-nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
-}
-
-int
-nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
-}
-
-int
-nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
- boolean_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
-}
-
-int
-nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
-}
-
-int
-nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
-}
-
-int
-nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
-}
-
-int
-nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
-}
-
-int
-nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
-}
-
-int
-nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
-}
-
-int
-nvlist_add_string_array(nvlist_t *nvl, const char *name,
- char *const *a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
-}
-
-int
-nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
-}
-
-int
-nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
-}
-
-int
-nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n)
-{
- return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
-}
-
-/* reading name-value pairs */
-nvpair_t *
-nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (NULL);
-
- curr = NVPAIR2I_NVP(nvp);
-
- /*
- * Ensure that nvp is an valid pointer.
- */
- if (nvp == NULL)
- curr = priv->nvp_list;
- else if (priv->nvp_curr == curr)
- curr = curr->nvi_next;
- else if (nvlist_contains_nvp(nvl, nvp) == 0)
- curr = NULL;
-
- priv->nvp_curr = curr;
-
- return (curr != NULL ? &curr->nvi_nvp : NULL);
-}
-
-char *
-nvpair_name(nvpair_t *nvp)
-{
- return (NVP_NAME(nvp));
-}
-
-data_type_t
-nvpair_type(nvpair_t *nvp)
-{
- return (NVP_TYPE(nvp));
-}
-
-static int
-nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
-{
- if (nvp == NULL || nvpair_type(nvp) != type)
- return (EINVAL);
-
- /*
- * For non-array types, we copy the data.
- * For array types (including string), we set a pointer.
- */
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- if (nelem != NULL)
- *nelem = 0;
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_HRTIME:
- if (data == NULL)
- return (EINVAL);
- bcopy(NVP_VALUE(nvp), data,
- (size_t)i_get_value_size(type, NULL, 1));
- if (nelem != NULL)
- *nelem = 1;
- break;
-
- case DATA_TYPE_NVLIST:
- case DATA_TYPE_STRING:
- if (data == NULL)
- return (EINVAL);
- *(void **)data = (void *)NVP_VALUE(nvp);
- if (nelem != NULL)
- *nelem = 1;
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- case DATA_TYPE_NVLIST_ARRAY:
- if (nelem == NULL || data == NULL)
- return (EINVAL);
- if ((*nelem = NVP_NELEM(nvp)) != 0)
- *(void **)data = (void *)NVP_VALUE(nvp);
- else
- *(void **)data = NULL;
- break;
-
- default:
- return (ENOTSUP);
- }
-
- return (0);
-}
-
-static int
-nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
- uint_t *nelem, void *data)
-{
- nvpriv_t *priv;
- nvpair_t *nvp;
- i_nvp_t *curr;
-
- if (name == NULL || nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (EINVAL);
-
- if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
- return (ENOTSUP);
-
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- nvp = &curr->nvi_nvp;
-
- if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type)
- return (nvpair_value_common(nvp, type, nelem, data));
- }
-
- return (ENOENT);
-}
-
-int
-nvlist_lookup_boolean(nvlist_t *nvl, const char *name)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
-}
-
-int
-nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val)
-{
- return (nvlist_lookup_common(nvl, name,
- DATA_TYPE_BOOLEAN_VALUE, NULL, val));
-}
-
-int
-nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
-}
-
-int
-nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
-}
-
-int
-nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
-}
-
-int
-nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
-}
-
-int
-nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
-}
-
-int
-nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
-}
-
-int
-nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
-}
-
-int
-nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
-}
-
-int
-nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
-}
-
-int
-nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
-}
-
-int
-nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
-}
-
-int
-nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
- boolean_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name,
- DATA_TYPE_BOOLEAN_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
- uchar_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
- uint8_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
- int16_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
- uint16_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
- int32_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
- uint32_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
- int64_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
- uint64_t **a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
- char ***a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
- nvlist_t ***a, uint_t *n)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
-}
-
-int
-nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
-{
- return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
-}
-
-int
-nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
-{
- va_list ap;
- char *name;
- int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
- int ret = 0;
-
- va_start(ap, flag);
- while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
- data_type_t type;
- void *val;
- uint_t *nelem;
-
- switch (type = va_arg(ap, data_type_t)) {
- case DATA_TYPE_BOOLEAN:
- ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_HRTIME:
- case DATA_TYPE_STRING:
- case DATA_TYPE_NVLIST:
- val = va_arg(ap, void *);
- ret = nvlist_lookup_common(nvl, name, type, NULL, val);
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- case DATA_TYPE_STRING_ARRAY:
- case DATA_TYPE_NVLIST_ARRAY:
- val = va_arg(ap, void *);
- nelem = va_arg(ap, uint_t *);
- ret = nvlist_lookup_common(nvl, name, type, nelem, val);
- break;
-
- default:
- ret = EINVAL;
- }
-
- if (ret == ENOENT && noentok)
- ret = 0;
- }
- va_end(ap);
-
- return (ret);
-}
-
-int
-nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
-}
-
-int
-nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
-}
-
-int
-nvpair_value_int8(nvpair_t *nvp, int8_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
-}
-
-int
-nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
-}
-
-int
-nvpair_value_int16(nvpair_t *nvp, int16_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
-}
-
-int
-nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
-}
-
-int
-nvpair_value_int32(nvpair_t *nvp, int32_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
-}
-
-int
-nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
-}
-
-int
-nvpair_value_int64(nvpair_t *nvp, int64_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
-}
-
-int
-nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
-}
-
-int
-nvpair_value_string(nvpair_t *nvp, char **val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
-}
-
-int
-nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
-}
-
-int
-nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
-}
-
-int
-nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
-{
- return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
-}
-
-/*
- * Add specified pair to the list.
- */
-int
-nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
-{
- if (nvl == NULL || nvp == NULL)
- return (EINVAL);
-
- return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
- NVP_NELEM(nvp), NVP_VALUE(nvp)));
-}
-
-/*
- * Merge the supplied nvlists and put the result in dst.
- * The merged list will contain all names specified in both lists,
- * the values are taken from nvl in the case of duplicates.
- * Return 0 on success.
- */
-/*ARGSUSED*/
-int
-nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
-{
- if (nvl == NULL || dst == NULL)
- return (EINVAL);
-
- if (dst != nvl)
- return (nvlist_copy_pairs(nvl, dst));
-
- return (0);
-}
-
-/*
- * Encoding related routines
- */
-#define NVS_OP_ENCODE 0
-#define NVS_OP_DECODE 1
-#define NVS_OP_GETSIZE 2
-
-typedef struct nvs_ops nvs_ops_t;
-
-typedef struct {
- int nvs_op;
- const nvs_ops_t *nvs_ops;
- void *nvs_private;
- nvpriv_t *nvs_priv;
-} nvstream_t;
-
-/*
- * nvs operations are:
- * - nvs_nvlist
- * encoding / decoding of a nvlist header (nvlist_t)
- * calculates the size used for header and end detection
- *
- * - nvs_nvpair
- * responsible for the first part of encoding / decoding of an nvpair
- * calculates the decoded size of an nvpair
- *
- * - nvs_nvp_op
- * second part of encoding / decoding of an nvpair
- *
- * - nvs_nvp_size
- * calculates the encoding size of an nvpair
- *
- * - nvs_nvl_fini
- * encodes the end detection mark (zeros).
- */
-struct nvs_ops {
- int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
- int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
- int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
- int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
- int (*nvs_nvl_fini)(nvstream_t *);
-};
-
-typedef struct {
- char nvh_encoding; /* nvs encoding method */
- char nvh_endian; /* nvs endian */
- char nvh_reserved1; /* reserved for future use */
- char nvh_reserved2; /* reserved for future use */
-} nvs_header_t;
-
-static int
-nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr;
-
- /*
- * Walk nvpair in list and encode each nvpair
- */
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
- if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0)
- return (EFAULT);
-
- return (nvs->nvs_ops->nvs_nvl_fini(nvs));
-}
-
-static int
-nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
-{
- nvpair_t *nvp;
- size_t nvsize;
- int err;
-
- /*
- * Get decoded size of next pair in stream, alloc
- * memory for nvpair_t, then decode the nvpair
- */
- while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) {
- if (nvsize == 0) /* end of list */
- break;
-
- /* make sure len makes sense */
- if (nvsize < NVP_SIZE_CALC(1, 0))
- return (EFAULT);
-
- if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL)
- return (ENOMEM);
-
- if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) {
- nvp_buf_free(nvl, nvp);
- return (err);
- }
-
- if (i_validate_nvpair(nvp) != 0) {
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
- return (EFAULT);
- }
-
- nvp_buf_link(nvl, nvp);
- }
- return (err);
-}
-
-static int
-nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
-{
- nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
- i_nvp_t *curr;
- uint64_t nvsize = *buflen;
- size_t size;
-
- /*
- * Get encoded size of nvpairs in nvlist
- */
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0)
- return (EINVAL);
-
- if ((nvsize += size) > INT32_MAX)
- return (EINVAL);
- }
-
- *buflen = nvsize;
- return (0);
-}
-
-static int
-nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
-{
- int err;
-
- if (nvl->nvl_priv == 0)
- return (EFAULT);
-
- /*
- * Perform the operation, starting with header, then each nvpair
- */
- if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0)
- return (err);
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- err = nvs_encode_pairs(nvs, nvl);
- break;
-
- case NVS_OP_DECODE:
- err = nvs_decode_pairs(nvs, nvl);
- break;
-
- case NVS_OP_GETSIZE:
- err = nvs_getsize_pairs(nvs, nvl, buflen);
- break;
-
- default:
- err = EINVAL;
- }
-
- return (err);
-}
-
-static int
-nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- return (nvs_operation(nvs, embedded, NULL));
-
- case NVS_OP_DECODE: {
- nvpriv_t *priv;
- int err;
-
- if (embedded->nvl_version != NV_VERSION)
- return (ENOTSUP);
-
- if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
- return (ENOMEM);
-
- nvlist_init(embedded, embedded->nvl_nvflag, priv);
-
- if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
- nvlist_free(embedded);
- return (err);
- }
- default:
- break;
- }
-
- return (EINVAL);
-}
-
-static int
-nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- size_t nelem = NVP_NELEM(nvp);
- nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
- int i;
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- for (i = 0; i < nelem; i++)
- if (nvs_embedded(nvs, nvlp[i]) != 0)
- return (EFAULT);
- break;
-
- case NVS_OP_DECODE: {
- size_t len = nelem * sizeof (uint64_t);
- nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);
-
- bzero(nvlp, len); /* don't trust packed data */
- for (i = 0; i < nelem; i++) {
- if (nvs_embedded(nvs, embedded) != 0) {
- nvpair_free(nvp);
- return (EFAULT);
- }
-
- nvlp[i] = embedded++;
- }
- break;
- }
- case NVS_OP_GETSIZE: {
- uint64_t nvsize = 0;
-
- for (i = 0; i < nelem; i++) {
- size_t nvp_sz = 0;
-
- if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
- return (EINVAL);
-
- if ((nvsize += nvp_sz) > INT32_MAX)
- return (EINVAL);
- }
-
- *size = nvsize;
- break;
- }
- default:
- return (EINVAL);
- }
-
- return (0);
-}
-
-static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
-static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
-
-/*
- * Common routine for nvlist operations:
- * encode, decode, getsize (encoded size).
- */
-static int
-nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
- int nvs_op)
-{
- int err = 0;
- nvstream_t nvs;
- int nvl_endian;
-#if BYTE_ORDER == _LITTLE_ENDIAN
- int host_endian = 1;
-#else
- int host_endian = 0;
-#endif /* _LITTLE_ENDIAN */
- nvs_header_t *nvh = (void *)buf;
-
- if (buflen == NULL || nvl == NULL ||
- (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
- return (EINVAL);
-
- nvs.nvs_op = nvs_op;
-
- /*
- * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
- * a buffer is allocated. The first 4 bytes in the buffer are
- * used for encoding method and host endian.
- */
- switch (nvs_op) {
- case NVS_OP_ENCODE:
- if (buf == NULL || *buflen < sizeof (nvs_header_t))
- return (EINVAL);
-
- nvh->nvh_encoding = encoding;
- nvh->nvh_endian = nvl_endian = host_endian;
- nvh->nvh_reserved1 = 0;
- nvh->nvh_reserved2 = 0;
- break;
-
- case NVS_OP_DECODE:
- if (buf == NULL || *buflen < sizeof (nvs_header_t))
- return (EINVAL);
-
- /* get method of encoding from first byte */
- encoding = nvh->nvh_encoding;
- nvl_endian = nvh->nvh_endian;
- break;
-
- case NVS_OP_GETSIZE:
- nvl_endian = host_endian;
-
- /*
- * add the size for encoding
- */
- *buflen = sizeof (nvs_header_t);
- break;
-
- default:
- return (ENOTSUP);
- }
-
- /*
- * Create an nvstream with proper encoding method
- */
- switch (encoding) {
- case NV_ENCODE_NATIVE:
- /*
- * check endianness, in case we are unpacking
- * from a file
- */
- if (nvl_endian != host_endian)
- return (ENOTSUP);
- err = nvs_native(&nvs, nvl, buf, buflen);
- break;
- case NV_ENCODE_XDR:
- err = nvs_xdr(&nvs, nvl, buf, buflen);
- break;
- default:
- err = ENOTSUP;
- break;
- }
-
- return (err);
-}
-
-int
-nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
-{
- return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
-}
-
-/*
- * Pack nvlist into contiguous memory
- */
-/*ARGSUSED1*/
-int
-nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
- int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xpack(nvl, bufp, buflen, encoding,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
- nv_alloc_t *nva)
-{
- nvpriv_t nvpriv;
- size_t alloc_size;
- char *buf;
- int err;
-
- if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL)
- return (EINVAL);
-
- if (*bufp != NULL)
- return (nvlist_common(nvl, *bufp, buflen, encoding,
- NVS_OP_ENCODE));
-
- /*
- * Here is a difficult situation:
- * 1. The nvlist has fixed allocator properties.
- * All other nvlist routines (like nvlist_add_*, ...) use
- * these properties.
- * 2. When using nvlist_pack() the user can specify his own
- * allocator properties (e.g. by using KM_NOSLEEP).
- *
- * We use the user specified properties (2). A clearer solution
- * will be to remove the kmflag from nvlist_pack(), but we will
- * not change the interface.
- */
- nv_priv_init(&nvpriv, nva, 0);
-
- if (err = nvlist_size(nvl, &alloc_size, encoding))
- return (err);
-
- if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL)
- return (ENOMEM);
-
- if ((err = nvlist_common(nvl, buf, &alloc_size, encoding,
- NVS_OP_ENCODE)) != 0) {
- nv_mem_free(&nvpriv, buf, alloc_size);
- } else {
- *buflen = alloc_size;
- *bufp = buf;
- }
-
- return (err);
-}
-
-/*
- * Unpack buf into an nvlist_t
- */
-/*ARGSUSED1*/
-int
-nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
-{
-#if defined(_KERNEL) && !defined(_BOOT)
- return (nvlist_xunpack(buf, buflen, nvlp,
- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
-#else
- return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep));
-#endif
-}
-
-int
-nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
-{
- nvlist_t *nvl;
- int err;
-
- if (nvlp == NULL)
- return (EINVAL);
-
- if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0)
- return (err);
-
- if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0)
- nvlist_free(nvl);
- else
- *nvlp = nvl;
-
- return (err);
-}
-
-/*
- * Native encoding functions
- */
-typedef struct {
- /*
- * This structure is used when decoding a packed nvpair in
- * the native format. n_base points to a buffer containing the
- * packed nvpair. n_end is a pointer to the end of the buffer.
- * (n_end actually points to the first byte past the end of the
- * buffer.) n_curr is a pointer that lies between n_base and n_end.
- * It points to the current data that we are decoding.
- * The amount of data left in the buffer is equal to n_end - n_curr.
- * n_flag is used to recognize a packed embedded list.
- */
- caddr_t n_base;
- caddr_t n_end;
- caddr_t n_curr;
- uint_t n_flag;
-} nvs_native_t;
-
-static int
-nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
- size_t buflen)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- nvs->nvs_private = native;
- native->n_curr = native->n_base = buf;
- native->n_end = buf + buflen;
- native->n_flag = 0;
- return (0);
-
- case NVS_OP_GETSIZE:
- nvs->nvs_private = native;
- native->n_curr = native->n_base = native->n_end = NULL;
- native->n_flag = 0;
- return (0);
- default:
- return (EINVAL);
- }
-}
-
-/*ARGSUSED*/
-static void
-nvs_native_destroy(nvstream_t *nvs)
-{
-}
-
-static int
-native_cp(nvstream_t *nvs, void *buf, size_t size)
-{
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
-
- if (native->n_curr + size > native->n_end)
- return (EFAULT);
-
- /*
- * The bcopy() below eliminates alignment requirement
- * on the buffer (stream) and is preferred over direct access.
- */
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- bcopy(buf, native->n_curr, size);
- break;
- case NVS_OP_DECODE:
- bcopy(native->n_curr, buf, size);
- break;
- default:
- return (EINVAL);
- }
-
- native->n_curr += size;
- return (0);
-}
-
-/*
- * operate on nvlist_t header
- */
-static int
-nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
-{
- nvs_native_t *native = nvs->nvs_private;
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- if (native->n_flag)
- return (0); /* packed embedded list */
-
- native->n_flag = 1;
-
- /* copy version and nvflag of the nvlist_t */
- if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 ||
- native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
- return (EFAULT);
-
- return (0);
-
- case NVS_OP_GETSIZE:
- /*
- * if calculate for packed embedded list
- * 4 for end of the embedded list
- * else
- * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag
- * and 4 for end of the entire list
- */
- if (native->n_flag) {
- *size += 4;
- } else {
- native->n_flag = 1;
- *size += 2 * sizeof (int32_t) + 4;
- }
-
- return (0);
-
- default:
- return (EINVAL);
- }
-}
-
-static int
-nvs_native_nvl_fini(nvstream_t *nvs)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- /*
- * Add 4 zero bytes at end of nvlist. They are used
- * for end detection by the decode routine.
- */
- if (native->n_curr + sizeof (int) > native->n_end)
- return (EFAULT);
-
- bzero(native->n_curr, sizeof (int));
- native->n_curr += sizeof (int);
- }
-
- return (0);
-}
-
-static int
-nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- nvlist_t *packed = (void *)
- (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
- /*
- * Null out the pointer that is meaningless in the packed
- * structure. The address may not be aligned, so we have
- * to use bzero.
- */
- bzero(&packed->nvl_priv, sizeof (packed->nvl_priv));
- }
-
- return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
-}
-
-static int
-nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
- size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
- nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len);
- int i;
- /*
- * Null out pointers that are meaningless in the packed
- * structure. The addresses may not be aligned, so we have
- * to use bzero.
- */
- bzero(value, len);
-
- for (i = 0; i < NVP_NELEM(nvp); i++, packed++)
- /*
- * Null out the pointer that is meaningless in the
- * packed structure. The address may not be aligned,
- * so we have to use bzero.
- */
- bzero(&packed->nvl_priv, sizeof (packed->nvl_priv));
- }
-
- return (nvs_embedded_nvl_array(nvs, nvp, NULL));
-}
-
-static void
-nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE: {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- uint64_t *strp = (void *)
- (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
- /*
- * Null out pointers that are meaningless in the packed
- * structure. The addresses may not be aligned, so we have
- * to use bzero.
- */
- bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
- break;
- }
- case NVS_OP_DECODE: {
- char **strp = (void *)NVP_VALUE(nvp);
- char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
- int i;
-
- for (i = 0; i < NVP_NELEM(nvp); i++) {
- strp[i] = buf;
- buf += strlen(buf) + 1;
- }
- break;
- }
- }
-}
-
-static int
-nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
-{
- data_type_t type;
- int value_sz;
- int ret = 0;
-
- /*
- * We do the initial bcopy of the data before we look at
- * the nvpair type, because when we're decoding, we won't
- * have the correct values for the pair until we do the bcopy.
- */
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
- return (EFAULT);
- break;
- default:
- return (EINVAL);
- }
-
- /* verify nvp_name_sz, check the name string length */
- if (i_validate_nvpair_name(nvp) != 0)
- return (EFAULT);
-
- type = NVP_TYPE(nvp);
-
- /*
- * Verify type and nelem and get the value size.
- * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
- * is the size of the string(s) excluded.
- */
- if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
- return (EFAULT);
-
- if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
- return (EFAULT);
-
- switch (type) {
- case DATA_TYPE_NVLIST:
- ret = nvpair_native_embedded(nvs, nvp);
- break;
- case DATA_TYPE_NVLIST_ARRAY:
- ret = nvpair_native_embedded_array(nvs, nvp);
- break;
- case DATA_TYPE_STRING_ARRAY:
- nvpair_native_string_array(nvs, nvp);
- break;
- default:
- break;
- }
-
- return (ret);
-}
-
-static int
-nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- uint64_t nvp_sz = nvp->nvp_size;
-
- switch (NVP_TYPE(nvp)) {
- case DATA_TYPE_NVLIST: {
- size_t nvsize = 0;
-
- if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
- return (EINVAL);
-
- nvp_sz += nvsize;
- break;
- }
- case DATA_TYPE_NVLIST_ARRAY: {
- size_t nvsize;
-
- if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
- return (EINVAL);
-
- nvp_sz += nvsize;
- break;
- }
- default:
- break;
- }
-
- if (nvp_sz > INT32_MAX)
- return (EINVAL);
-
- *size = nvp_sz;
-
- return (0);
-}
-
-static int
-nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- return (nvs_native_nvp_op(nvs, nvp));
-
- case NVS_OP_DECODE: {
- nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
- int32_t decode_len;
-
- /* try to read the size value from the stream */
- if (native->n_curr + sizeof (int32_t) > native->n_end)
- return (EFAULT);
- bcopy(native->n_curr, &decode_len, sizeof (int32_t));
-
- /* sanity check the size value */
- if (decode_len < 0 ||
- decode_len > native->n_end - native->n_curr)
- return (EFAULT);
-
- *size = decode_len;
-
- /*
- * If at the end of the stream then move the cursor
- * forward, otherwise nvpair_native_op() will read
- * the entire nvpair at the same cursor position.
- */
- if (*size == 0)
- native->n_curr += sizeof (int32_t);
- break;
- }
-
- default:
- return (EINVAL);
- }
-
- return (0);
-}
-
-static const nvs_ops_t nvs_native_ops = {
- nvs_native_nvlist,
- nvs_native_nvpair,
- nvs_native_nvp_op,
- nvs_native_nvp_size,
- nvs_native_nvl_fini
-};
-
-static int
-nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
-{
- nvs_native_t native;
- int err;
-
- nvs->nvs_ops = &nvs_native_ops;
-
- if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
- *buflen - sizeof (nvs_header_t))) != 0)
- return (err);
-
- err = nvs_operation(nvs, nvl, buflen);
-
- nvs_native_destroy(nvs);
-
- return (err);
-}
-
-/*
- * XDR encoding functions
- *
- * An xdr packed nvlist is encoded as:
- *
- * - encoding methode and host endian (4 bytes)
- * - nvl_version (4 bytes)
- * - nvl_nvflag (4 bytes)
- *
- * - encoded nvpairs, the format of one xdr encoded nvpair is:
- * - encoded size of the nvpair (4 bytes)
- * - decoded size of the nvpair (4 bytes)
- * - name string, (4 + sizeof(NV_ALIGN4(string))
- * a string is coded as size (4 bytes) and data
- * - data type (4 bytes)
- * - number of elements in the nvpair (4 bytes)
- * - data
- *
- * - 2 zero's for end of the entire list (8 bytes)
- */
-static int
-nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
-{
- /* xdr data must be 4 byte aligned */
- if ((ulong_t)buf % 4 != 0)
- return (EFAULT);
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
- nvs->nvs_private = xdr;
- return (0);
- case NVS_OP_DECODE:
- xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
- nvs->nvs_private = xdr;
- return (0);
- case NVS_OP_GETSIZE:
- nvs->nvs_private = NULL;
- return (0);
- default:
- return (EINVAL);
- }
-}
-
-static void
-nvs_xdr_destroy(nvstream_t *nvs)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE:
- xdr_destroy((XDR *)nvs->nvs_private);
- break;
- default:
- break;
- }
-}
-
-static int
-nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
-{
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- case NVS_OP_DECODE: {
- XDR *xdr = nvs->nvs_private;
-
- if (!xdr_int(xdr, &nvl->nvl_version) ||
- !xdr_u_int(xdr, &nvl->nvl_nvflag))
- return (EFAULT);
- break;
- }
- case NVS_OP_GETSIZE: {
- /*
- * 2 * 4 for nvl_version + nvl_nvflag
- * and 8 for end of the entire list
- */
- *size += 2 * 4 + 8;
- break;
- }
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-static int
-nvs_xdr_nvl_fini(nvstream_t *nvs)
-{
- if (nvs->nvs_op == NVS_OP_ENCODE) {
- XDR *xdr = nvs->nvs_private;
- int zero = 0;
-
- if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
- return (EFAULT);
- }
-
- return (0);
-}
-
-/*
- * The format of xdr encoded nvpair is:
- * encode_size, decode_size, name string, data type, nelem, data
- */
-static int
-nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
-{
- data_type_t type;
- char *buf;
- char *buf_end = (char *)nvp + nvp->nvp_size;
- int value_sz;
- uint_t nelem, buflen;
- bool_t ret = FALSE;
- XDR *xdr = nvs->nvs_private;
-
- ASSERT(xdr != NULL && nvp != NULL);
-
- /* name string */
- if ((buf = NVP_NAME(nvp)) >= buf_end)
- return (EFAULT);
- buflen = buf_end - buf;
-
- if (!xdr_string(xdr, &buf, buflen - 1))
- return (EFAULT);
- nvp->nvp_name_sz = strlen(buf) + 1;
-
- /* type and nelem */
- if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
- !xdr_int(xdr, &nvp->nvp_value_elem))
- return (EFAULT);
-
- type = NVP_TYPE(nvp);
- nelem = nvp->nvp_value_elem;
-
- /*
- * Verify type and nelem and get the value size.
- * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
- * is the size of the string(s) excluded.
- */
- if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
- return (EFAULT);
-
- /* if there is no data to extract then return */
- if (nelem == 0)
- return (0);
-
- /* value */
- if ((buf = NVP_VALUE(nvp)) >= buf_end)
- return (EFAULT);
- buflen = buf_end - buf;
-
- if (buflen < value_sz)
- return (EFAULT);
-
- switch (type) {
- case DATA_TYPE_NVLIST:
- if (nvs_embedded(nvs, (void *)buf) == 0)
- return (0);
- break;
-
- case DATA_TYPE_NVLIST_ARRAY:
- if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
- return (0);
- break;
-
- case DATA_TYPE_BOOLEAN:
- ret = TRUE;
- break;
-
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- ret = xdr_char(xdr, buf);
- break;
-
- case DATA_TYPE_INT16:
- ret = xdr_short(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_UINT16:
- ret = xdr_u_short(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_INT32:
- ret = xdr_int(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_UINT32:
- ret = xdr_u_int(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_INT64:
- ret = xdr_longlong_t(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_UINT64:
- ret = xdr_u_longlong_t(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_HRTIME:
- /*
- * NOTE: must expose the definition of hrtime_t here
- */
- ret = xdr_longlong_t(xdr, (void *)buf);
- break;
-
- case DATA_TYPE_STRING:
- ret = xdr_string(xdr, &buf, buflen - 1);
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- ret = xdr_opaque(xdr, buf, nelem);
- break;
-
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
- (xdrproc_t)xdr_char);
- break;
-
- case DATA_TYPE_INT16_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
- sizeof (int16_t), (xdrproc_t)xdr_short);
- break;
-
- case DATA_TYPE_UINT16_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
- sizeof (uint16_t), (xdrproc_t)xdr_u_short);
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
- sizeof (int32_t), (xdrproc_t)xdr_int);
- break;
-
- case DATA_TYPE_UINT32_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
- sizeof (uint32_t), (xdrproc_t)xdr_u_int);
- break;
-
- case DATA_TYPE_INT64_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
- sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
- break;
-
- case DATA_TYPE_UINT64_ARRAY:
- ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
- sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
- break;
-
- case DATA_TYPE_STRING_ARRAY: {
- size_t len = nelem * sizeof (uint64_t);
- char **strp = (void *)buf;
- int i;
-
- if (nvs->nvs_op == NVS_OP_DECODE)
- bzero(buf, len); /* don't trust packed data */
-
- for (i = 0; i < nelem; i++) {
- if (buflen <= len)
- return (EFAULT);
-
- buf += len;
- buflen -= len;
-
- if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
- return (EFAULT);
-
- if (nvs->nvs_op == NVS_OP_DECODE)
- strp[i] = buf;
- len = strlen(buf) + 1;
- }
- ret = TRUE;
- break;
- }
- default:
- break;
- }
-
- return (ret == TRUE ? 0 : EFAULT);
-}
-
-static int
-nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- data_type_t type = NVP_TYPE(nvp);
- /*
- * encode_size + decode_size + name string size + data type + nelem
- * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
- */
- uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
-
- switch (type) {
- case DATA_TYPE_BOOLEAN:
- break;
-
- case DATA_TYPE_BOOLEAN_VALUE:
- case DATA_TYPE_BYTE:
- case DATA_TYPE_INT8:
- case DATA_TYPE_UINT8:
- case DATA_TYPE_INT16:
- case DATA_TYPE_UINT16:
- case DATA_TYPE_INT32:
- case DATA_TYPE_UINT32:
- nvp_sz += 4; /* 4 is the minimum xdr unit */
- break;
-
- case DATA_TYPE_INT64:
- case DATA_TYPE_UINT64:
- case DATA_TYPE_HRTIME:
- nvp_sz += 8;
- break;
-
- case DATA_TYPE_STRING:
- nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
- break;
-
- case DATA_TYPE_BYTE_ARRAY:
- nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
- break;
-
- case DATA_TYPE_BOOLEAN_ARRAY:
- case DATA_TYPE_INT8_ARRAY:
- case DATA_TYPE_UINT8_ARRAY:
- case DATA_TYPE_INT16_ARRAY:
- case DATA_TYPE_UINT16_ARRAY:
- case DATA_TYPE_INT32_ARRAY:
- case DATA_TYPE_UINT32_ARRAY:
- nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
- break;
-
- case DATA_TYPE_INT64_ARRAY:
- case DATA_TYPE_UINT64_ARRAY:
- nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
- break;
-
- case DATA_TYPE_STRING_ARRAY: {
- int i;
- char **strs = (void *)NVP_VALUE(nvp);
-
- for (i = 0; i < NVP_NELEM(nvp); i++)
- nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
-
- break;
- }
-
- case DATA_TYPE_NVLIST:
- case DATA_TYPE_NVLIST_ARRAY: {
- size_t nvsize = 0;
- int old_nvs_op = nvs->nvs_op;
- int err;
-
- nvs->nvs_op = NVS_OP_GETSIZE;
- if (type == DATA_TYPE_NVLIST)
- err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
- else
- err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
- nvs->nvs_op = old_nvs_op;
-
- if (err != 0)
- return (EINVAL);
-
- nvp_sz += nvsize;
- break;
- }
-
- default:
- return (EINVAL);
- }
-
- if (nvp_sz > INT32_MAX)
- return (EINVAL);
-
- *size = nvp_sz;
-
- return (0);
-}
-
-
-/*
- * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
- * the largest nvpair that could be encoded in the buffer.
- *
- * See comments above nvpair_xdr_op() for the format of xdr encoding.
- * The size of a xdr packed nvpair without any data is 5 words.
- *
- * Using the size of the data directly as an estimate would be ok
- * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY
- * then the actual nvpair has space for an array of pointers to index
- * the strings. These pointers are not encoded into the packed xdr buffer.
- *
- * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
- * of length 0, then each string is endcoded in xdr format as a single word.
- * Therefore when expanded to an nvpair there will be 2.25 word used for
- * each string. (a int64_t allocated for pointer usage, and a single char
- * for the null termination.)
- *
- * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
- */
-#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
-#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
- 0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
-#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \
- (NVS_XDR_DATA_LEN(x) * 2) + \
- NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
-
-static int
-nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
-{
- XDR *xdr = nvs->nvs_private;
- int32_t encode_len, decode_len;
-
- switch (nvs->nvs_op) {
- case NVS_OP_ENCODE: {
- size_t nvsize;
-
- if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
- return (EFAULT);
-
- decode_len = nvp->nvp_size;
- encode_len = nvsize;
- if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
- return (EFAULT);
-
- return (nvs_xdr_nvp_op(nvs, nvp));
- }
- case NVS_OP_DECODE: {
- struct xdr_bytesrec bytesrec;
-
- /* get the encode and decode size */
- if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
- return (EFAULT);
- *size = decode_len;
-
- /* are we at the end of the stream? */
- if (*size == 0)
- return (0);
-
- /* sanity check the size parameter */
- if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
- return (EFAULT);
-
- if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
- return (EFAULT);
- break;
- }
-
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-static const struct nvs_ops nvs_xdr_ops = {
- nvs_xdr_nvlist,
- nvs_xdr_nvpair,
- nvs_xdr_nvp_op,
- nvs_xdr_nvp_size,
- nvs_xdr_nvl_fini
-};
-
-static int
-nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
-{
- XDR xdr;
- int err;
-
- nvs->nvs_ops = &nvs_xdr_ops;
-
- if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
- *buflen - sizeof (nvs_header_t))) != 0)
- return (err);
-
- err = nvs_operation(nvs, nvl, buflen);
-
- nvs_xdr_destroy(nvs);
-
- return (err);
-}
diff --git a/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c b/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c
deleted file mode 100644
index 620171e..0000000
--- a/sys/contrib/opensolaris/common/nvpair/nvpair_alloc_fixed.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/nvpair.h>
-#include <sys/sysmacros.h>
-#if defined(_KERNEL) && !defined(_BOOT)
-#include <sys/varargs.h>
-#else
-#include <stdarg.h>
-#include <strings.h>
-#endif
-
-/*
- * This allocator is very simple.
- * - it uses a pre-allocated buffer for memory allocations.
- * - it does _not_ free memory in the pre-allocated buffer.
- *
- * The reason for the selected implemention is simplicity.
- * This allocator is designed for the usage in interrupt context when
- * the caller may not wait for free memory.
- */
-
-/* pre-allocated buffer for memory allocations */
-typedef struct nvbuf {
- uintptr_t nvb_buf; /* address of pre-allocated buffer */
- uintptr_t nvb_lim; /* limit address in the buffer */
- uintptr_t nvb_cur; /* current address in the buffer */
-} nvbuf_t;
-
-/*
- * Initialize the pre-allocated buffer allocator. The caller needs to supply
- *
- * buf address of pre-allocated buffer
- * bufsz size of pre-allocated buffer
- *
- * nv_fixed_init() calculates the remaining members of nvbuf_t.
- */
-static int
-nv_fixed_init(nv_alloc_t *nva, va_list valist)
-{
- uintptr_t base = va_arg(valist, uintptr_t);
- uintptr_t lim = base + va_arg(valist, size_t);
- nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t));
-
- if (base == 0 || (uintptr_t)&nvb[1] > lim)
- return (EINVAL);
-
- nvb->nvb_buf = (uintptr_t)&nvb[0];
- nvb->nvb_cur = (uintptr_t)&nvb[1];
- nvb->nvb_lim = lim;
- nva->nva_arg = nvb;
-
- return (0);
-}
-
-static void *
-nv_fixed_alloc(nv_alloc_t *nva, size_t size)
-{
- nvbuf_t *nvb = nva->nva_arg;
- uintptr_t new = nvb->nvb_cur;
-
- if (size == 0 || new + size > nvb->nvb_lim)
- return (NULL);
-
- nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t));
-
- return ((void *)new);
-}
-
-/*ARGSUSED*/
-static void
-nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size)
-{
- /* don't free memory in the pre-allocated buffer */
-}
-
-static void
-nv_fixed_reset(nv_alloc_t *nva)
-{
- nvbuf_t *nvb = nva->nva_arg;
-
- nvb->nvb_cur = (uintptr_t)&nvb[1];
-}
-
-const nv_alloc_ops_t nv_fixed_ops_def = {
- nv_fixed_init, /* nv_ao_init() */
- NULL, /* nv_ao_fini() */
- nv_fixed_alloc, /* nv_ao_alloc() */
- nv_fixed_free, /* nv_ao_free() */
- nv_fixed_reset /* nv_ao_reset() */
-};
-
-const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def;
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c
deleted file mode 100644
index 2004d86..0000000
--- a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.c
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Common name validation routines for ZFS. These routines are shared by the
- * userland code as well as the ioctl() layer to ensure that we don't
- * inadvertently expose a hole through direct ioctl()s that never gets tested.
- * In userland, however, we want significantly more information about _why_ the
- * name is invalid. In the kernel, we only care whether it's valid or not.
- * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
- * the name failed to validate.
- *
- * Each function returns 0 on success, -1 on error.
- */
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#else
-#include <string.h>
-#endif
-
-#include <sys/param.h>
-#include "zfs_namecheck.h"
-
-static int
-valid_char(char c)
-{
- return ((c >= 'a' && c <= 'z') ||
- (c >= 'A' && c <= 'Z') ||
- (c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':');
-}
-
-/*
- * Snapshot names must be made up of alphanumeric characters plus the following
- * characters:
- *
- * [-_.:]
- */
-int
-snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
-{
- const char *loc;
-
- if (strlen(path) >= MAXNAMELEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- if (path[0] == '\0') {
- if (why)
- *why = NAME_ERR_EMPTY_COMPONENT;
- return (-1);
- }
-
- for (loc = path; *loc; loc++) {
- if (!valid_char(*loc)) {
- if (why) {
- *why = NAME_ERR_INVALCHAR;
- *what = *loc;
- }
- return (-1);
- }
- }
- return (0);
-}
-
-/*
- * Dataset names must be of the following form:
- *
- * [component][/]*[component][@component]
- *
- * Where each component is made up of alphanumeric characters plus the following
- * characters:
- *
- * [-_.:]
- */
-int
-dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
-{
- const char *loc, *end;
- int found_snapshot;
-
- /*
- * Make sure the name is not too long.
- *
- * ZFS_MAXNAMELEN is the maximum dataset length used in the userland
- * which is the same as MAXNAMELEN used in the kernel.
- * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all
- * places using MAXNAMELEN.
- */
- if (strlen(path) >= MAXNAMELEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- /* Explicitly check for a leading slash. */
- if (path[0] == '/') {
- if (why)
- *why = NAME_ERR_LEADING_SLASH;
- return (-1);
- }
-
- if (path[0] == '\0') {
- if (why)
- *why = NAME_ERR_EMPTY_COMPONENT;
- return (-1);
- }
-
- loc = path;
- found_snapshot = 0;
- for (;;) {
- /* Find the end of this component */
- end = loc;
- while (*end != '/' && *end != '@' && *end != '\0')
- end++;
-
- if (*end == '\0' && end[-1] == '/') {
- /* trailing slashes are not allowed */
- if (why)
- *why = NAME_ERR_TRAILING_SLASH;
- return (-1);
- }
-
- /* Zero-length components are not allowed */
- if (loc == end) {
- if (why) {
- /*
- * Make sure this is really a zero-length
- * component and not a '@@'.
- */
- if (*end == '@' && found_snapshot) {
- *why = NAME_ERR_MULTIPLE_AT;
- } else {
- *why = NAME_ERR_EMPTY_COMPONENT;
- }
- }
-
- return (-1);
- }
-
- /* Validate the contents of this component */
- while (loc != end) {
- if (!valid_char(*loc)) {
- if (why) {
- *why = NAME_ERR_INVALCHAR;
- *what = *loc;
- }
- return (-1);
- }
- loc++;
- }
-
- /* If we've reached the end of the string, we're OK */
- if (*end == '\0')
- return (0);
-
- if (*end == '@') {
- /*
- * If we've found an @ symbol, indicate that we're in
- * the snapshot component, and report a second '@'
- * character as an error.
- */
- if (found_snapshot) {
- if (why)
- *why = NAME_ERR_MULTIPLE_AT;
- return (-1);
- }
-
- found_snapshot = 1;
- }
-
- /*
- * If there is a '/' in a snapshot name
- * then report an error
- */
- if (*end == '/' && found_snapshot) {
- if (why)
- *why = NAME_ERR_TRAILING_SLASH;
- return (-1);
- }
-
- /* Update to the next component */
- loc = end + 1;
- }
-}
-
-/*
- * For pool names, we have the same set of valid characters as described in
- * dataset names, with the additional restriction that the pool name must begin
- * with a letter. The pool names 'raidz' and 'mirror' are also reserved names
- * that cannot be used.
- */
-int
-pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
-{
- const char *c;
-
- /*
- * Make sure the name is not too long.
- *
- * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland
- * which is the same as MAXNAMELEN used in the kernel.
- * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all
- * places using MAXNAMELEN.
- */
- if (strlen(pool) >= MAXNAMELEN) {
- if (why)
- *why = NAME_ERR_TOOLONG;
- return (-1);
- }
-
- c = pool;
- while (*c != '\0') {
- if (!valid_char(*c)) {
- if (why) {
- *why = NAME_ERR_INVALCHAR;
- *what = *c;
- }
- return (-1);
- }
- c++;
- }
-
- if (!(*pool >= 'a' && *pool <= 'z') &&
- !(*pool >= 'A' && *pool <= 'Z')) {
- if (why)
- *why = NAME_ERR_NOLETTER;
- return (-1);
- }
-
- if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
- if (why)
- *why = NAME_ERR_RESERVED;
- return (-1);
- }
-
- if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
- if (why)
- *why = NAME_ERR_DISKLIKE;
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Check if the dataset name is private for internal usage.
- * '$' is reserved for internal dataset names. e.g. "$MOS"
- *
- * Return 1 if the given name is used internally.
- * Return 0 if it is not.
- */
-int
-dataset_name_hidden(const char *name)
-{
- if (strchr(name, '$') != NULL)
- return (1);
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h
deleted file mode 100644
index 7e0cda9..0000000
--- a/sys/contrib/opensolaris/common/zfs/zfs_namecheck.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZFS_NAMECHECK_H
-#define _ZFS_NAMECHECK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
- NAME_ERR_LEADING_SLASH, /* name begins with leading slash */
- NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */
- NAME_ERR_TRAILING_SLASH, /* name ends with a slash */
- NAME_ERR_INVALCHAR, /* invalid character found */
- NAME_ERR_MULTIPLE_AT, /* multiple '@' characters found */
- NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */
- NAME_ERR_RESERVED, /* entire name is reserved */
- NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */
- NAME_ERR_TOOLONG, /* name is too long */
-} namecheck_err_t;
-
-int pool_namecheck(const char *, namecheck_err_t *, char *);
-int dataset_namecheck(const char *, namecheck_err_t *, char *);
-int dataset_name_hidden(const char *);
-int snapshot_namecheck(const char *, namecheck_err_t *, char *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_NAMECHECK_H */
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/contrib/opensolaris/common/zfs/zfs_prop.c
deleted file mode 100644
index 7125619..0000000
--- a/sys/contrib/opensolaris/common/zfs/zfs_prop.c
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Master property table.
- *
- * This table keeps track of all the properties supported by ZFS, and their
- * various attributes. Not all of these are needed by the kernel, and several
- * are only used by a single libzfs client. But having them here centralizes
- * all property information in one location.
- *
- * name The human-readable string representing this property
- * proptype Basic type (string, boolean, number)
- * default Default value for the property. Sadly, C only allows
- * you to initialize the first member of a union, so we
- * have two default members for each property.
- * attr Attributes (readonly, inheritable) for the property
- * types Valid dataset types to which this applies
- * values String describing acceptable values for the property
- * colname The column header for 'zfs list'
- * colfmt The column formatting for 'zfs list'
- *
- * This table must match the order of property types in libzfs.h.
- */
-
-#include <sys/zio.h>
-#include <sys/spa.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-
-#include "zfs_prop.h"
-
-#if defined(_KERNEL)
-#include <sys/systm.h>
-#else
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#endif
-
-typedef enum {
- prop_default,
- prop_readonly,
- prop_inherit
-} prop_attr_t;
-
-typedef struct {
- const char *pd_name;
- zfs_proptype_t pd_proptype;
- uint64_t pd_numdefault;
- const char *pd_strdefault;
- prop_attr_t pd_attr;
- int pd_types;
- const char *pd_values;
- const char *pd_colname;
- boolean_t pd_rightalign;
- boolean_t pd_visible;
-} prop_desc_t;
-
-static prop_desc_t zfs_prop_table[] = {
- { "type", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", B_TRUE,
- B_TRUE },
- { "creation", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<date>", "CREATION", B_FALSE, B_TRUE },
- { "used", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<size>", "USED", B_TRUE, B_TRUE },
- { "available", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", B_TRUE,
- B_TRUE },
- { "referenced", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY,
- "<size>", "REFER", B_TRUE, B_TRUE },
- { "compressratio", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", B_TRUE,
- B_TRUE },
- { "mounted", prop_type_boolean, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", B_TRUE, B_TRUE },
- { "origin", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN",
- B_FALSE, B_TRUE },
- { "quota", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", B_TRUE, B_TRUE },
- { "reservation", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "<size> | none", "RESERV", B_TRUE, B_TRUE },
- { "volsize", prop_type_number, 0, NULL, prop_default,
- ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", B_TRUE, B_TRUE },
- { "volblocksize", prop_type_number, 8192, NULL, prop_readonly,
- ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_TRUE,
- B_TRUE },
- { "recordsize", prop_type_number, SPA_MAXBLOCKSIZE, NULL,
- prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "512 to 128k, power of 2", "RECSIZE", B_TRUE, B_TRUE },
- { "mountpoint", prop_type_string, 0, "/", prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "<path> | legacy | none", "MOUNTPOINT", B_FALSE, B_TRUE },
- { "sharenfs", prop_type_string, 0, "off", prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off | exports(5) options", "SHARENFS", B_FALSE, B_TRUE },
- { "checksum", prop_type_index, ZIO_CHECKSUM_DEFAULT, "on",
- prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", B_TRUE,
- B_TRUE },
- { "compression", prop_type_index, ZIO_COMPRESS_DEFAULT, "off",
- prop_inherit, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", B_TRUE, B_TRUE },
- { "atime", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "ATIME", B_TRUE, B_TRUE },
- { "devices", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "DEVICES", B_TRUE, B_TRUE },
- { "exec", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "EXEC", B_TRUE, B_TRUE },
- { "setuid", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
- B_TRUE, B_TRUE },
- { "readonly", prop_type_boolean, 0, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off", "RDONLY", B_TRUE, B_TRUE },
- { "jailed", prop_type_boolean, 0, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "JAILED", B_TRUE, B_TRUE },
- { "snapdir", prop_type_index, ZFS_SNAPDIR_HIDDEN, "hidden",
- prop_inherit,
- ZFS_TYPE_FILESYSTEM,
- "hidden | visible", "SNAPDIR", B_TRUE, B_TRUE },
- { "aclmode", prop_type_index, ZFS_ACL_GROUPMASK, "groupmask",
- prop_inherit, ZFS_TYPE_FILESYSTEM,
- "discard | groupmask | passthrough", "ACLMODE", B_TRUE, B_TRUE },
- { "aclinherit", prop_type_index, ZFS_ACL_SECURE, "secure",
- prop_inherit, ZFS_TYPE_FILESYSTEM,
- "discard | noallow | secure | passthrough", "ACLINHERIT", B_TRUE,
- B_TRUE },
- { "createtxg", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, NULL, NULL, B_FALSE, B_FALSE },
- { "name", prop_type_string, 0, NULL, prop_readonly,
- ZFS_TYPE_ANY, NULL, "NAME", B_FALSE, B_FALSE },
- { "canmount", prop_type_boolean, 1, NULL, prop_default,
- ZFS_TYPE_FILESYSTEM,
- "on | off", "CANMOUNT", B_TRUE, B_TRUE },
- { "shareiscsi", prop_type_string, 0, "off", prop_inherit,
- ZFS_TYPE_ANY,
- "on | off | type=<type>", "SHAREISCSI", B_FALSE, B_TRUE },
- { "iscsioptions", prop_type_string, 0, NULL, prop_inherit,
- ZFS_TYPE_VOLUME, NULL, "ISCSIOPTIONS", B_FALSE, B_FALSE },
- { "xattr", prop_type_boolean, 1, NULL, prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
- "on | off", "XATTR", B_TRUE, B_TRUE },
- { "numclones", prop_type_number, 0, NULL, prop_readonly,
- ZFS_TYPE_SNAPSHOT, NULL, NULL, B_FALSE, B_FALSE },
- { "copies", prop_type_index, 1, "1", prop_inherit,
- ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "1 | 2 | 3", "COPIES", B_TRUE, B_TRUE },
- { "bootfs", prop_type_string, 0, NULL, prop_default,
- ZFS_TYPE_POOL, "<filesystem>", "BOOTFS", B_FALSE, B_TRUE },
-};
-
-#define ZFS_PROP_COUNT ((sizeof (zfs_prop_table))/(sizeof (prop_desc_t)))
-
-/*
- * Returns TRUE if the property applies to the given dataset types.
- */
-int
-zfs_prop_valid_for_type(zfs_prop_t prop, int types)
-{
- return ((zfs_prop_table[prop].pd_types & types) != 0);
-}
-
-/*
- * Determine if the specified property is visible or not.
- */
-boolean_t
-zfs_prop_is_visible(zfs_prop_t prop)
-{
- if (prop < 0)
- return (B_FALSE);
-
- return (zfs_prop_table[prop].pd_visible);
-}
-
-/*
- * Iterate over all properties, calling back into the specified function
- * for each property. We will continue to iterate until we either
- * reach the end or the callback function something other than
- * ZFS_PROP_CONT.
- */
-zfs_prop_t
-zfs_prop_iter_common(zfs_prop_f func, void *cb, zfs_type_t type,
- boolean_t show_all)
-{
- int i;
-
- for (i = 0; i < ZFS_PROP_COUNT; i++) {
- if (zfs_prop_valid_for_type(i, type) &&
- (zfs_prop_is_visible(i) || show_all)) {
- if (func(i, cb) != ZFS_PROP_CONT)
- return (i);
- }
- }
- return (ZFS_PROP_CONT);
-}
-
-zfs_prop_t
-zfs_prop_iter(zfs_prop_f func, void *cb, boolean_t show_all)
-{
- return (zfs_prop_iter_common(func, cb, ZFS_TYPE_ANY, show_all));
-}
-
-zpool_prop_t
-zpool_prop_iter(zpool_prop_f func, void *cb, boolean_t show_all)
-{
- return (zfs_prop_iter_common(func, cb, ZFS_TYPE_POOL, show_all));
-}
-
-zfs_proptype_t
-zfs_prop_get_type(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_proptype);
-}
-
-static boolean_t
-propname_match(const char *p, zfs_prop_t prop, size_t len)
-{
- const char *propname = zfs_prop_table[prop].pd_name;
-#ifndef _KERNEL
- const char *colname = zfs_prop_table[prop].pd_colname;
- int c;
-#endif
-
-#ifndef _KERNEL
- if (colname == NULL)
- return (B_FALSE);
-#endif
-
- if (len == strlen(propname) &&
- strncmp(p, propname, len) == 0)
- return (B_TRUE);
-
-#ifndef _KERNEL
- if (len != strlen(colname))
- return (B_FALSE);
-
- for (c = 0; c < len; c++)
- if (p[c] != tolower(colname[c]))
- break;
-
- return (colname[c] == '\0');
-#else
- return (B_FALSE);
-#endif
-}
-
-zfs_prop_t
-zfs_name_to_prop_cb(zfs_prop_t prop, void *cb_data)
-{
- const char *propname = cb_data;
-
- if (propname_match(propname, prop, strlen(propname)))
- return (prop);
-
- return (ZFS_PROP_CONT);
-}
-
-/*
- * Given a property name and its type, returns the corresponding property ID.
- */
-zfs_prop_t
-zfs_name_to_prop_common(const char *propname, zfs_type_t type)
-{
- zfs_prop_t prop;
-
- prop = zfs_prop_iter_common(zfs_name_to_prop_cb, (void *)propname,
- type, B_TRUE);
- return (prop == ZFS_PROP_CONT ? ZFS_PROP_INVAL : prop);
-}
-
-/*
- * Given a zfs dataset property name, returns the corresponding property ID.
- */
-zfs_prop_t
-zfs_name_to_prop(const char *propname)
-{
- return (zfs_name_to_prop_common(propname, ZFS_TYPE_ANY));
-}
-
-/*
- * Given a pool property name, returns the corresponding property ID.
- */
-zpool_prop_t
-zpool_name_to_prop(const char *propname)
-{
- return (zfs_name_to_prop_common(propname, ZFS_TYPE_POOL));
-}
-
-/*
- * For user property names, we allow all lowercase alphanumeric characters, plus
- * a few useful punctuation characters.
- */
-static int
-valid_char(char c)
-{
- return ((c >= 'a' && c <= 'z') ||
- (c >= '0' && c <= '9') ||
- c == '-' || c == '_' || c == '.' || c == ':');
-}
-
-/*
- * Returns true if this is a valid user-defined property (one with a ':').
- */
-boolean_t
-zfs_prop_user(const char *name)
-{
- int i;
- char c;
- boolean_t foundsep = B_FALSE;
-
- for (i = 0; i < strlen(name); i++) {
- c = name[i];
- if (!valid_char(c))
- return (B_FALSE);
- if (c == ':')
- foundsep = B_TRUE;
- }
-
- if (!foundsep)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Return the default value for the given property.
- */
-const char *
-zfs_prop_default_string(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_strdefault);
-}
-
-uint64_t
-zfs_prop_default_numeric(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_numdefault);
-}
-
-/*
- * Returns TRUE if the property is readonly.
- */
-int
-zfs_prop_readonly(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_attr == prop_readonly);
-}
-
-/*
- * Given a dataset property ID, returns the corresponding name.
- * Assuming the zfs dataset propety ID is valid.
- */
-const char *
-zfs_prop_to_name(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_name);
-}
-
-/*
- * Given a pool property ID, returns the corresponding name.
- * Assuming the pool propety ID is valid.
- */
-const char *
-zpool_prop_to_name(zpool_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_name);
-}
-
-/*
- * Returns TRUE if the property is inheritable.
- */
-int
-zfs_prop_inheritable(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_attr == prop_inherit);
-}
-
-typedef struct zfs_index {
- const char *name;
- uint64_t index;
-} zfs_index_t;
-
-static zfs_index_t checksum_table[] = {
- { "on", ZIO_CHECKSUM_ON },
- { "off", ZIO_CHECKSUM_OFF },
- { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
- { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
- { "sha256", ZIO_CHECKSUM_SHA256 },
- { NULL }
-};
-
-static zfs_index_t compress_table[] = {
- { "on", ZIO_COMPRESS_ON },
- { "off", ZIO_COMPRESS_OFF },
- { "lzjb", ZIO_COMPRESS_LZJB },
- { "gzip", ZIO_COMPRESS_GZIP_6 }, /* the default gzip level */
- { "gzip-1", ZIO_COMPRESS_GZIP_1 },
- { "gzip-2", ZIO_COMPRESS_GZIP_2 },
- { "gzip-3", ZIO_COMPRESS_GZIP_3 },
- { "gzip-4", ZIO_COMPRESS_GZIP_4 },
- { "gzip-5", ZIO_COMPRESS_GZIP_5 },
- { "gzip-6", ZIO_COMPRESS_GZIP_6 },
- { "gzip-7", ZIO_COMPRESS_GZIP_7 },
- { "gzip-8", ZIO_COMPRESS_GZIP_8 },
- { "gzip-9", ZIO_COMPRESS_GZIP_9 },
- { NULL }
-};
-
-static zfs_index_t snapdir_table[] = {
- { "hidden", ZFS_SNAPDIR_HIDDEN },
- { "visible", ZFS_SNAPDIR_VISIBLE },
- { NULL }
-};
-
-static zfs_index_t acl_mode_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "groupmask", ZFS_ACL_GROUPMASK },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
-};
-
-static zfs_index_t acl_inherit_table[] = {
- { "discard", ZFS_ACL_DISCARD },
- { "noallow", ZFS_ACL_NOALLOW },
- { "secure", ZFS_ACL_SECURE },
- { "passthrough", ZFS_ACL_PASSTHROUGH },
- { NULL }
-};
-
-static zfs_index_t copies_table[] = {
- { "1", 1 },
- { "2", 2 },
- { "3", 3 },
- { NULL }
-};
-
-static zfs_index_t *
-zfs_prop_index_table(zfs_prop_t prop)
-{
- switch (prop) {
- case ZFS_PROP_CHECKSUM:
- return (checksum_table);
- case ZFS_PROP_COMPRESSION:
- return (compress_table);
- case ZFS_PROP_SNAPDIR:
- return (snapdir_table);
- case ZFS_PROP_ACLMODE:
- return (acl_mode_table);
- case ZFS_PROP_ACLINHERIT:
- return (acl_inherit_table);
- case ZFS_PROP_COPIES:
- return (copies_table);
- default:
- return (NULL);
- }
-}
-
-
-/*
- * Tables of index types, plus functions to convert between the user view
- * (strings) and internal representation (uint64_t).
- */
-int
-zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
-{
- zfs_index_t *table;
- int i;
-
- if ((table = zfs_prop_index_table(prop)) == NULL)
- return (-1);
-
- for (i = 0; table[i].name != NULL; i++) {
- if (strcmp(string, table[i].name) == 0) {
- *index = table[i].index;
- return (0);
- }
- }
-
- return (-1);
-}
-
-int
-zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
-{
- zfs_index_t *table;
- int i;
-
- if ((table = zfs_prop_index_table(prop)) == NULL)
- return (-1);
-
- for (i = 0; table[i].name != NULL; i++) {
- if (table[i].index == index) {
- *string = table[i].name;
- return (0);
- }
- }
-
- return (-1);
-}
-
-#ifndef _KERNEL
-
-/*
- * Returns a string describing the set of acceptable values for the given
- * zfs property, or NULL if it cannot be set.
- */
-const char *
-zfs_prop_values(zfs_prop_t prop)
-{
- if (zfs_prop_table[prop].pd_types == ZFS_TYPE_POOL)
- return (NULL);
-
- return (zfs_prop_table[prop].pd_values);
-}
-
-/*
- * Returns a string describing the set of acceptable values for the given
- * zpool property, or NULL if it cannot be set.
- */
-const char *
-zpool_prop_values(zfs_prop_t prop)
-{
- if (zfs_prop_table[prop].pd_types != ZFS_TYPE_POOL)
- return (NULL);
-
- return (zfs_prop_table[prop].pd_values);
-}
-
-/*
- * Returns TRUE if this property is a string type. Note that index types
- * (compression, checksum) are treated as strings in userland, even though they
- * are stored numerically on disk.
- */
-int
-zfs_prop_is_string(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_proptype == prop_type_string ||
- zfs_prop_table[prop].pd_proptype == prop_type_index);
-}
-
-/*
- * Returns the column header for the given property. Used only in
- * 'zfs list -o', but centralized here with the other property information.
- */
-const char *
-zfs_prop_column_name(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_colname);
-}
-
-/*
- * Returns whether the given property should be displayed right-justified for
- * 'zfs list'.
- */
-boolean_t
-zfs_prop_align_right(zfs_prop_t prop)
-{
- return (zfs_prop_table[prop].pd_rightalign);
-}
-
-/*
- * Determines the minimum width for the column, and indicates whether it's fixed
- * or not. Only string columns are non-fixed.
- */
-size_t
-zfs_prop_width(zfs_prop_t prop, boolean_t *fixed)
-{
- prop_desc_t *pd = &zfs_prop_table[prop];
- zfs_index_t *idx;
- size_t ret;
- int i;
-
- *fixed = B_TRUE;
-
- /*
- * Start with the width of the column name.
- */
- ret = strlen(pd->pd_colname);
-
- /*
- * For fixed-width values, make sure the width is large enough to hold
- * any possible value.
- */
- switch (pd->pd_proptype) {
- case prop_type_number:
- /*
- * The maximum length of a human-readable number is 5 characters
- * ("20.4M", for example).
- */
- if (ret < 5)
- ret = 5;
- /*
- * 'creation' is handled specially because it's a number
- * internally, but displayed as a date string.
- */
- if (prop == ZFS_PROP_CREATION)
- *fixed = B_FALSE;
- break;
- case prop_type_boolean:
- /*
- * The maximum length of a boolean value is 3 characters, for
- * "off".
- */
- if (ret < 3)
- ret = 3;
- break;
- case prop_type_index:
- idx = zfs_prop_index_table(prop);
- for (i = 0; idx[i].name != NULL; i++) {
- if (strlen(idx[i].name) > ret)
- ret = strlen(idx[i].name);
- }
- break;
-
- case prop_type_string:
- *fixed = B_FALSE;
- break;
- }
-
- return (ret);
-}
-
-#endif
diff --git a/sys/contrib/opensolaris/common/zfs/zfs_prop.h b/sys/contrib/opensolaris/common/zfs/zfs_prop.h
deleted file mode 100644
index 133e740..0000000
--- a/sys/contrib/opensolaris/common/zfs/zfs_prop.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZFS_PROP_H
-#define _ZFS_PROP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/fs/zfs.h>
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * For index types (e.g. compression and checksum), we want the numeric value
- * in the kernel, but the string value in userland.
- */
-typedef enum {
- prop_type_number, /* numeric value */
- prop_type_string, /* string value */
- prop_type_boolean, /* boolean value */
- prop_type_index /* numeric value indexed by string */
-} zfs_proptype_t;
-
-zfs_proptype_t zfs_prop_get_type(zfs_prop_t);
-size_t zfs_prop_width(zfs_prop_t, boolean_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_PROP_H */
diff --git a/sys/contrib/opensolaris/uts/common/Makefile.files b/sys/contrib/opensolaris/uts/common/Makefile.files
deleted file mode 100644
index 1800e79..0000000
--- a/sys/contrib/opensolaris/uts/common/Makefile.files
+++ /dev/null
@@ -1,101 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
-#
-# This Makefile defines all file modules for the directory uts/common
-# and its children. These are the source files which may be considered
-# common to all SunOS systems.
-
-ZFS_COMMON_OBJS += \
- arc.o \
- bplist.o \
- dbuf.o \
- dmu.o \
- dmu_send.o \
- dmu_object.o \
- dmu_objset.o \
- dmu_traverse.o \
- dmu_tx.o \
- dnode.o \
- dnode_sync.o \
- dsl_dir.o \
- dsl_dataset.o \
- dsl_pool.o \
- dsl_synctask.o \
- dmu_zfetch.o \
- dsl_prop.o \
- fletcher.o \
- gzip.o \
- lzjb.o \
- metaslab.o \
- refcount.o \
- sha256.o \
- spa.o \
- spa_config.o \
- spa_errlog.o \
- spa_history.o \
- spa_misc.o \
- space_map.o \
- txg.o \
- uberblock.o \
- unique.o \
- vdev.o \
- vdev_cache.o \
- vdev_label.o \
- vdev_mirror.o \
- vdev_missing.o \
- vdev_queue.o \
- vdev_raidz.o \
- vdev_root.o \
- zap.o \
- zap_leaf.o \
- zap_micro.o \
- zfs_byteswap.o \
- zfs_fm.o \
- zfs_znode.o \
- zil.o \
- zio.o \
- zio_checksum.o \
- zio_compress.o \
- zio_inject.o
-
-ZFS_SHARED_OBJS += \
- zfs_namecheck.o \
- zfs_prop.o
-
-ZFS_OBJS += \
- $(ZFS_COMMON_OBJS) \
- $(ZFS_SHARED_OBJS) \
- zfs_acl.o \
- zfs_ctldir.o \
- zfs_dir.o \
- zfs_ioctl.o \
- zfs_log.o \
- zfs_replay.o \
- zfs_rlock.o \
- zfs_vfsops.o \
- zfs_vnops.o \
- zvol.o
diff --git a/sys/contrib/opensolaris/uts/common/fs/gfs.c b/sys/contrib/opensolaris/uts/common/fs/gfs.c
deleted file mode 100644
index 738c9d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/gfs.c
+++ /dev/null
@@ -1,884 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/* Portions Copyright 2007 Shivakumar GN */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/dirent.h>
-#include <sys/kmem.h>
-#include <sys/mman.h>
-#include <sys/mutex.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/uio.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/cred.h>
-#include <sys/kdb.h>
-
-#include <sys/gfs.h>
-
-/*
- * Generic pseudo-filesystem routines.
- *
- * There are significant similarities between the implementation of certain file
- * system entry points across different filesystems. While one could attempt to
- * "choke up on the bat" and incorporate common functionality into a VOP
- * preamble or postamble, such an approach is limited in the benefit it can
- * provide. In this file we instead define a toolkit of routines which can be
- * called from a filesystem (with in-kernel pseudo-filesystems being the focus
- * of the exercise) in a more component-like fashion.
- *
- * There are three basic classes of routines:
- *
- * 1) Lowlevel support routines
- *
- * These routines are designed to play a support role for existing
- * pseudo-filesystems (such as procfs). They simplify common tasks,
- * without enforcing the filesystem to hand over management to GFS. The
- * routines covered are:
- *
- * gfs_readdir_init()
- * gfs_readdir_emit()
- * gfs_readdir_emitn()
- * gfs_readdir_pred()
- * gfs_readdir_fini()
- * gfs_lookup_dot()
- *
- * 2) Complete GFS management
- *
- * These routines take a more active role in management of the
- * pseudo-filesystem. They handle the relationship between vnode private
- * data and VFS data, as well as the relationship between vnodes in the
- * directory hierarchy.
- *
- * In order to use these interfaces, the first member of every private
- * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control
- * to GFS.
- *
- * gfs_file_create()
- * gfs_dir_create()
- * gfs_root_create()
- *
- * gfs_file_inactive()
- * gfs_dir_inactive()
- * gfs_dir_lookup()
- * gfs_dir_readdir()
- *
- * gfs_vop_inactive()
- * gfs_vop_lookup()
- * gfs_vop_readdir()
- * gfs_vop_map()
- *
- * 3) Single File pseudo-filesystems
- *
- * This routine creates a rooted file to be overlayed ontop of another
- * file in the physical filespace.
- *
- * Note that the parent is NULL (actually the vfs), but there is nothing
- * technically keeping such a file from utilizing the "Complete GFS
- * management" set of routines.
- *
- * gfs_root_create_file()
- */
-
-/*
- * Low level directory routines
- *
- * These routines provide some simple abstractions for reading directories.
- * They are designed to be used by existing pseudo filesystems (namely procfs)
- * that already have a complicated management infrastructure.
- */
-
-/*
- * gfs_readdir_init: initiate a generic readdir
- * st - a pointer to an uninitialized gfs_readdir_state_t structure
- * name_max - the directory's maximum file name length
- * ureclen - the exported file-space record length (1 for non-legacy FSs)
- * uiop - the uiop passed to readdir
- * parent - the parent directory's inode
- * self - this directory's inode
- *
- * Returns 0 or a non-zero errno.
- *
- * Typical VOP_READDIR usage of gfs_readdir_*:
- *
- * if ((error = gfs_readdir_init(...)) != 0)
- * return (error);
- * eof = 0;
- * while ((error = gfs_readdir_pred(..., &voffset)) != 0) {
- * if (!consumer_entry_at(voffset))
- * voffset = consumer_next_entry(voffset);
- * if (consumer_eof(voffset)) {
- * eof = 1
- * break;
- * }
- * if ((error = gfs_readdir_emit(..., voffset,
- * consumer_ino(voffset), consumer_name(voffset))) != 0)
- * break;
- * }
- * return (gfs_readdir_fini(..., error, eofp, eof));
- *
- * As you can see, a zero result from gfs_readdir_pred() or
- * gfs_readdir_emit() indicates that processing should continue,
- * whereas a non-zero result indicates that the loop should terminate.
- * Most consumers need do nothing more than let gfs_readdir_fini()
- * determine what the cause of failure was and return the appropriate
- * value.
- */
-int
-gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
- uio_t *uiop, ino64_t parent, ino64_t self)
-{
- if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
- (uiop->uio_loffset % ureclen) != 0)
- return (EINVAL);
-
- st->grd_ureclen = ureclen;
- st->grd_oresid = uiop->uio_resid;
- st->grd_namlen = name_max;
- st->grd_dirent = kmem_zalloc(DIRENT64_RECLEN(st->grd_namlen), KM_SLEEP);
- st->grd_parent = parent;
- st->grd_self = self;
-
- return (0);
-}
-
-/*
- * gfs_readdir_emit_int: internal routine to emit directory entry
- *
- * st - the current readdir state, which must have d_ino and d_name
- * set
- * uiop - caller-supplied uio pointer
- * next - the offset of the next entry
- */
-static int
-gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next,
- int *ncookies, u_long **cookies)
-{
- int reclen, namlen;
-
- namlen = strlen(st->grd_dirent->d_name);
- reclen = DIRENT64_RECLEN(namlen);
-
- if (reclen > uiop->uio_resid) {
- /*
- * Error if no entries were returned yet
- */
- if (uiop->uio_resid == st->grd_oresid)
- return (EINVAL);
- return (-1);
- }
-
- /* XXX: This can change in the future. */
- st->grd_dirent->d_type = DT_DIR;
- st->grd_dirent->d_reclen = (ushort_t)reclen;
- st->grd_dirent->d_namlen = namlen;
-
- if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
- return (EFAULT);
-
- uiop->uio_loffset = next;
- if (*cookies != NULL) {
- **cookies = next;
- (*cookies)++;
- (*ncookies)--;
- KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies));
- }
-
- return (0);
-}
-
-/*
- * gfs_readdir_emit: emit a directory entry
- * voff - the virtual offset (obtained from gfs_readdir_pred)
- * ino - the entry's inode
- * name - the entry's name
- *
- * Returns a 0 on success, a non-zero errno on failure, or -1 if the
- * readdir loop should terminate. A non-zero result (either errno or
- * -1) from this function is typically passed directly to
- * gfs_readdir_fini().
- */
-int
-gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
- ino64_t ino, const char *name, int *ncookies, u_long **cookies)
-{
- offset_t off = (voff + 2) * st->grd_ureclen;
-
- st->grd_dirent->d_ino = ino;
- (void) strncpy(st->grd_dirent->d_name, name, st->grd_namlen);
-
- /*
- * Inter-entry offsets are invalid, so we assume a record size of
- * grd_ureclen and explicitly set the offset appropriately.
- */
- return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies,
- cookies));
-}
-
-/*
- * gfs_readdir_pred: readdir loop predicate
- * voffp - a pointer in which the next virtual offset should be stored
- *
- * Returns a 0 on success, a non-zero errno on failure, or -1 if the
- * readdir loop should terminate. A non-zero result (either errno or
- * -1) from this function is typically passed directly to
- * gfs_readdir_fini().
- */
-int
-gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp,
- int *ncookies, u_long **cookies)
-{
- offset_t off, voff;
- int error;
-
-top:
- if (uiop->uio_resid <= 0)
- return (-1);
-
- off = uiop->uio_loffset / st->grd_ureclen;
- voff = off - 2;
- if (off == 0) {
- if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
- ".", ncookies, cookies)) == 0)
- goto top;
- } else if (off == 1) {
- if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
- "..", ncookies, cookies)) == 0)
- goto top;
- } else {
- *voffp = voff;
- return (0);
- }
-
- return (error);
-}
-
-/*
- * gfs_readdir_fini: generic readdir cleanup
- * error - if positive, an error to return
- * eofp - the eofp passed to readdir
- * eof - the eof value
- *
- * Returns a 0 on success, a non-zero errno on failure. This result
- * should be returned from readdir.
- */
-int
-gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
-{
- kmem_free(st->grd_dirent, DIRENT64_RECLEN(st->grd_namlen));
- if (error > 0)
- return (error);
- if (eofp)
- *eofp = eof;
- return (0);
-}
-
-/*
- * gfs_lookup_dot
- *
- * Performs a basic check for "." and ".." directory entries.
- */
-int
-gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
-{
- if (*nm == '\0' || strcmp(nm, ".") == 0) {
- VN_HOLD(dvp);
- *vpp = dvp;
- return (0);
- } else if (strcmp(nm, "..") == 0) {
- if (pvp == NULL) {
- ASSERT(dvp->v_flag & VROOT);
- VN_HOLD(dvp);
- *vpp = dvp;
- } else {
- VN_HOLD(pvp);
- *vpp = pvp;
- }
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- return (0);
- }
-
- return (-1);
-}
-
-/*
- * gfs_file_create(): create a new GFS file
- *
- * size - size of private data structure (v_data)
- * pvp - parent vnode (GFS directory)
- * ops - vnode operations vector
- *
- * In order to use this interface, the parent vnode must have been created by
- * gfs_dir_create(), and the private data stored in v_data must have a
- * 'gfs_file_t' as its first field.
- *
- * Given these constraints, this routine will automatically:
- *
- * - Allocate v_data for the vnode
- * - Initialize necessary fields in the vnode
- * - Hold the parent
- */
-vnode_t *
-gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops)
-{
- gfs_file_t *fp;
- vnode_t *vp;
- int error;
-
- /*
- * Allocate vnode and internal data structure
- */
- fp = kmem_zalloc(size, KM_SLEEP);
- error = getnewvnode("zfs", vfsp, ops, &vp);
- ASSERT(error == 0);
- vp->v_data = (caddr_t)fp;
-
- /*
- * Set up various pointers
- */
- fp->gfs_vnode = vp;
- fp->gfs_parent = pvp;
- fp->gfs_size = size;
- fp->gfs_type = GFS_FILE;
-
- error = insmntque(vp, vfsp);
- KASSERT(error == 0, ("insmntque() failed: error %d", error));
-
- /*
- * Initialize vnode and hold parent.
- */
- if (pvp)
- VN_HOLD(pvp);
-
- return (vp);
-}
-
-/*
- * gfs_dir_create: creates a new directory in the parent
- *
- * size - size of private data structure (v_data)
- * pvp - parent vnode (GFS directory)
- * ops - vnode operations vector
- * entries - NULL-terminated list of static entries (if any)
- * maxlen - maximum length of a directory entry
- * readdir_cb - readdir callback (see gfs_dir_readdir)
- * inode_cb - inode callback (see gfs_dir_readdir)
- * lookup_cb - lookup callback (see gfs_dir_lookup)
- *
- * In order to use this function, the first member of the private vnode
- * structure (v_data) must be a gfs_dir_t. For each directory, there are
- * static entries, defined when the structure is initialized, and dynamic
- * entries, retrieved through callbacks.
- *
- * If a directory has static entries, then it must supply a inode callback,
- * which will compute the inode number based on the parent and the index.
- * For a directory with dynamic entries, the caller must supply a readdir
- * callback and a lookup callback. If a static lookup fails, we fall back to
- * the supplied lookup callback, if any.
- *
- * This function also performs the same initialization as gfs_file_create().
- */
-vnode_t *
-gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops,
- gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
- gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
-{
- vnode_t *vp;
- gfs_dir_t *dp;
- gfs_dirent_t *de;
-
- vp = gfs_file_create(struct_size, pvp, vfsp, ops);
- vp->v_type = VDIR;
-
- dp = vp->v_data;
- dp->gfsd_file.gfs_type = GFS_DIR;
- dp->gfsd_maxlen = maxlen;
-
- if (entries != NULL) {
- for (de = entries; de->gfse_name != NULL; de++)
- dp->gfsd_nstatic++;
-
- dp->gfsd_static = kmem_alloc(
- dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
- bcopy(entries, dp->gfsd_static,
- dp->gfsd_nstatic * sizeof (gfs_dirent_t));
- }
-
- dp->gfsd_readdir = readdir_cb;
- dp->gfsd_lookup = lookup_cb;
- dp->gfsd_inode = inode_cb;
-
- mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
-
- return (vp);
-}
-
-/*
- * gfs_root_create(): create a root vnode for a GFS filesystem
- *
- * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The
- * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
- */
-vnode_t *
-gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
- gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
- gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
-{
- vnode_t *vp;
-
- VFS_HOLD(vfsp);
- vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb,
- maxlen, readdir_cb, lookup_cb);
- /* Manually set the inode */
- ((gfs_file_t *)vp->v_data)->gfs_ino = ino;
- vp->v_flag |= VROOT;
-
- return (vp);
-}
-
-/*
- * gfs_file_inactive()
- *
- * Called from the VOP_INACTIVE() routine. If necessary, this routine will
- * remove the given vnode from the parent directory and clean up any references
- * in the VFS layer.
- *
- * If the vnode was not removed (due to a race with vget), then NULL is
- * returned. Otherwise, a pointer to the private data is returned.
- */
-void *
-gfs_file_inactive(vnode_t *vp)
-{
- int i;
- gfs_dirent_t *ge = NULL;
- gfs_file_t *fp = vp->v_data;
- gfs_dir_t *dp = NULL;
- void *data;
-
- if (fp->gfs_parent == NULL)
- goto found;
-
- dp = fp->gfs_parent->v_data;
-
- /*
- * First, see if this vnode is cached in the parent.
- */
- gfs_dir_lock(dp);
-
- /*
- * Find it in the set of static entries.
- */
- for (i = 0; i < dp->gfsd_nstatic; i++) {
- ge = &dp->gfsd_static[i];
-
- if (ge->gfse_vnode == vp)
- goto found;
- }
-
- /*
- * If 'ge' is NULL, then it is a dynamic entry.
- */
- ge = NULL;
-
-found:
- VI_LOCK(vp);
- ASSERT(vp->v_count < 2);
- /*
- * Really remove this vnode
- */
- data = vp->v_data;
- if (ge != NULL) {
- /*
- * If this was a statically cached entry, simply set the
- * cached vnode to NULL.
- */
- ge->gfse_vnode = NULL;
- }
- if (vp->v_count == 1) {
- vp->v_usecount--;
- vdropl(vp);
- } else {
- VI_UNLOCK(vp);
- }
-
- /*
- * Free vnode and release parent
- */
- if (fp->gfs_parent) {
- gfs_dir_unlock(dp);
- VI_LOCK(fp->gfs_parent);
- fp->gfs_parent->v_usecount--;
- VI_UNLOCK(fp->gfs_parent);
- } else {
- ASSERT(vp->v_vfsp != NULL);
- VFS_RELE(vp->v_vfsp);
- }
-
- return (data);
-}
-
-/*
- * gfs_dir_inactive()
- *
- * Same as above, but for directories.
- */
-void *
-gfs_dir_inactive(vnode_t *vp)
-{
- gfs_dir_t *dp;
-
- ASSERT(vp->v_type == VDIR);
-
- if ((dp = gfs_file_inactive(vp)) != NULL) {
- mutex_destroy(&dp->gfsd_lock);
- if (dp->gfsd_nstatic)
- kmem_free(dp->gfsd_static,
- dp->gfsd_nstatic * sizeof (gfs_dirent_t));
- }
-
- return (dp);
-}
-
-/*
- * gfs_dir_lookup()
- *
- * Looks up the given name in the directory and returns the corresponding vnode,
- * if found.
- *
- * First, we search statically defined entries, if any. If a match is found,
- * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
- * existing vnode. Otherwise, we call the static entry's callback routine,
- * caching the result if necessary.
- *
- * If no static entry is found, we invoke the lookup callback, if any. The
- * arguments to this callback are:
- *
- * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp);
- *
- * pvp - parent vnode
- * nm - name of entry
- * vpp - pointer to resulting vnode
- *
- * Returns 0 on success, non-zero on error.
- */
-int
-gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp)
-{
- int i;
- gfs_dirent_t *ge;
- vnode_t *vp;
- gfs_dir_t *dp = dvp->v_data;
- int ret = 0;
-
- ASSERT(dvp->v_type == VDIR);
-
- if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
- return (0);
-
- gfs_dir_lock(dp);
-
- /*
- * Search static entries.
- */
- for (i = 0; i < dp->gfsd_nstatic; i++) {
- ge = &dp->gfsd_static[i];
-
- if (strcmp(ge->gfse_name, nm) == 0) {
- if (ge->gfse_vnode) {
- ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
- vp = ge->gfse_vnode;
- VN_HOLD(vp);
- goto out;
- }
-
- /*
- * We drop the directory lock, as the constructor will
- * need to do KM_SLEEP allocations. If we return from
- * the constructor only to find that a parallel
- * operation has completed, and GFS_CACHE_VNODE is set
- * for this entry, we discard the result in favor of the
- * cached vnode.
- */
- gfs_dir_unlock(dp);
- vp = ge->gfse_ctor(dvp);
- gfs_dir_lock(dp);
-
- ((gfs_file_t *)vp->v_data)->gfs_index = i;
-
- /* Set the inode according to the callback. */
- ((gfs_file_t *)vp->v_data)->gfs_ino =
- dp->gfsd_inode(dvp, i);
-
- if (ge->gfse_flags & GFS_CACHE_VNODE) {
- if (ge->gfse_vnode == NULL) {
- ge->gfse_vnode = vp;
- } else {
- /*
- * A parallel constructor beat us to it;
- * return existing vnode. We have to be
- * careful because we can't release the
- * current vnode while holding the
- * directory lock; its inactive routine
- * will try to lock this directory.
- */
- vnode_t *oldvp = vp;
- vp = ge->gfse_vnode;
- VN_HOLD(vp);
-
- gfs_dir_unlock(dp);
- VN_RELE(oldvp);
- gfs_dir_lock(dp);
- }
- }
-
- goto out;
- }
- }
-
- /*
- * See if there is a dynamic constructor.
- */
- if (dp->gfsd_lookup) {
- ino64_t ino;
- gfs_file_t *fp;
-
- /*
- * Once again, drop the directory lock, as the lookup routine
- * will need to allocate memory, or otherwise deadlock on this
- * directory.
- */
- gfs_dir_unlock(dp);
- ret = dp->gfsd_lookup(dvp, nm, &vp, &ino);
- gfs_dir_lock(dp);
- if (ret != 0)
- goto out;
-
- fp = (gfs_file_t *)vp->v_data;
- fp->gfs_index = -1;
- fp->gfs_ino = ino;
- } else {
- /*
- * No static entry found, and there is no lookup callback, so
- * return ENOENT.
- */
- ret = ENOENT;
- }
-
-out:
- gfs_dir_unlock(dp);
-
- if (ret == 0)
- *vpp = vp;
- else
- *vpp = NULL;
-
- return (ret);
-}
-
-/*
- * gfs_dir_readdir: does a readdir() on the given directory
- *
- * dvp - directory vnode
- * uiop - uio structure
- * eofp - eof pointer
- * data - arbitrary data passed to readdir callback
- *
- * This routine does all the readdir() dirty work. Even so, the caller must
- * supply two callbacks in order to get full compatibility.
- *
- * If the directory contains static entries, an inode callback must be
- * specified. This avoids having to create every vnode and call VOP_GETATTR()
- * when reading the directory. This function has the following arguments:
- *
- * ino_t gfs_inode_cb(vnode_t *vp, int index);
- *
- * vp - vnode for the directory
- * index - index in original gfs_dirent_t array
- *
- * Returns the inode number for the given entry.
- *
- * For directories with dynamic entries, a readdir callback must be provided.
- * This is significantly more complex, thanks to the particulars of
- * VOP_READDIR().
- *
- * int gfs_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
- * offset_t *off, offset_t *nextoff, void *data)
- *
- * vp - directory vnode
- * dp - directory entry, sized according to maxlen given to
- * gfs_dir_create(). callback must fill in d_name and
- * d_ino.
- * eofp - callback must set to 1 when EOF has been reached
- * off - on entry, the last offset read from the directory. Callback
- * must set to the offset of the current entry, typically left
- * untouched.
- * nextoff - callback must set to offset of next entry. Typically
- * (off + 1)
- * data - caller-supplied data
- *
- * Return 0 on success, or error on failure.
- */
-int
-gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies,
- u_long **cookies, void *data)
-{
- gfs_readdir_state_t gstate;
- int error, eof = 0;
- ino64_t ino, pino;
- offset_t off, next;
- gfs_dir_t *dp = dvp->v_data;
-
- ino = dp->gfsd_file.gfs_ino;
-
- if (dp->gfsd_file.gfs_parent == NULL)
- pino = ino; /* root of filesystem */
- else
- pino = ((gfs_file_t *)
- (dp->gfsd_file.gfs_parent->v_data))->gfs_ino;
-
- if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
- pino, ino)) != 0)
- return (error);
-
- while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies,
- cookies)) == 0 && !eof) {
-
- if (off >= 0 && off < dp->gfsd_nstatic) {
- ino = dp->gfsd_inode(dvp, off);
-
- if ((error = gfs_readdir_emit(&gstate, uiop,
- off, ino, dp->gfsd_static[off].gfse_name, ncookies,
- cookies)) != 0)
- break;
-
- } else if (dp->gfsd_readdir) {
- off -= dp->gfsd_nstatic;
-
- if ((error = dp->gfsd_readdir(dvp,
- gstate.grd_dirent, &eof, &off, &next,
- data)) != 0 || eof)
- break;
-
- off += dp->gfsd_nstatic + 2;
- next += dp->gfsd_nstatic + 2;
-
- if ((error = gfs_readdir_emit_int(&gstate, uiop,
- next, ncookies, cookies)) != 0)
- break;
- } else {
- /*
- * Offset is beyond the end of the static entries, and
- * we have no dynamic entries. Set EOF.
- */
- eof = 1;
- }
- }
-
- return (gfs_readdir_fini(&gstate, error, eofp, eof));
-}
-
-/*
- * gfs_vop_readdir: VOP_READDIR() entry point
- *
- * For use directly in vnode ops table. Given a GFS directory, calls
- * gfs_dir_readdir() as necessary.
- */
-/* ARGSUSED */
-int
-gfs_vop_readdir(ap)
- struct vop_readdir_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- int *a_eofflag;
- int *ncookies;
- u_long **a_cookies;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- uio_t *uiop = ap->a_uio;
- int *eofp = ap->a_eofflag;
- int ncookies = 0;
- u_long *cookies = NULL;
- int error;
-
- if (ap->a_ncookies) {
- /*
- * Minimum entry size is dirent size and 1 byte for a file name.
- */
- ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
- cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK);
- *ap->a_cookies = cookies;
- *ap->a_ncookies = ncookies;
- }
-
- error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL);
-
- if (error == 0) {
- /* Subtract unused cookies */
- if (ap->a_ncookies)
- *ap->a_ncookies -= ncookies;
- } else if (ap->a_ncookies) {
- free(*ap->a_cookies, M_TEMP);
- *ap->a_cookies = NULL;
- *ap->a_ncookies = 0;
- }
-
- return (error);
-}
-
-/*
- * gfs_vop_inactive: VOP_INACTIVE() entry point
- *
- * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
- * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
- */
-/* ARGSUSED */
-int
-gfs_vop_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- gfs_file_t *fp = vp->v_data;
- void *data;
-
- if (fp->gfs_type == GFS_DIR)
- data = gfs_dir_inactive(vp);
- else
- data = gfs_file_inactive(vp);
-
- if (data != NULL)
- kmem_free(data, fp->gfs_size);
- vp->v_data = NULL;
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c
deleted file mode 100644
index 420f802..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ /dev/null
@@ -1,2859 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * DVA-based Adjustable Replacement Cache
- *
- * While much of the theory of operation used here is
- * based on the self-tuning, low overhead replacement cache
- * presented by Megiddo and Modha at FAST 2003, there are some
- * significant differences:
- *
- * 1. The Megiddo and Modha model assumes any page is evictable.
- * Pages in its cache cannot be "locked" into memory. This makes
- * the eviction algorithm simple: evict the last page in the list.
- * This also makes the performance characteristics easy to reason
- * about. Our cache is not so simple. At any given moment, some
- * subset of the blocks in the cache are un-evictable because we
- * have handed out a reference to them. Blocks are only evictable
- * when there are no external references active. This makes
- * eviction far more problematic: we choose to evict the evictable
- * blocks that are the "lowest" in the list.
- *
- * There are times when it is not possible to evict the requested
- * space. In these circumstances we are unable to adjust the cache
- * size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slows the flow of new data
- * into the cache until we can make space available.
- *
- * 2. The Megiddo and Modha model assumes a fixed cache size.
- * Pages are evicted when the cache is full and there is a cache
- * miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory pressure from the
- * operating system: decreasing its size when system memory is
- * tight.
- *
- * 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefore exactly the same size. So
- * when adjusting the cache size following a cache miss, it's simply
- * a matter of choosing a single page to evict. In our model, we
- * have variable sized cache blocks (ranging from 512 bytes to
- * 128K bytes). We therefore choose a set of blocks to evict to make
- * space for a cache miss that approximates as closely as possible
- * the space used by the new block.
- *
- * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
- * by N. Megiddo & D. Modha, FAST 2003
- */
-
-/*
- * The locking model:
- *
- * A new reference to a cache buffer can be obtained in two
- * ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal arc algorithms for
- * adjusting the cache use method 2. We therefore provide two
- * types of locks: 1) the hash table lock array, and 2) the
- * arc list locks.
- *
- * Buffers do not have their own mutexes, rather they rely on the
- * hash table mutexes for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexes).
- *
- * buf_hash_find() returns the appropriate mutex (held) when it
- * locates the requested buffer in the hash table. It returns
- * NULL for the mutex if the buffer was not in the table.
- *
- * buf_hash_remove() expects the appropriate hash mutex to be
- * already held before it is invoked.
- *
- * Each arc state also has a mutex which is used to protect the
- * buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an arc list lock you
- * must use: mutex_tryenter() to avoid deadlock. Also note that
- * the active state mutex must be held before the ghost state mutex.
- *
- * Arc buffers may have an associated eviction callback function.
- * This function will be invoked prior to removing the buffer (e.g.
- * in arc_do_user_evicts()). Note however that the data associated
- * with the buffer may be evicted prior to the callback. The callback
- * must be made with *no locks held* (to prevent deadlock). Additionally,
- * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_buf_evict()
- * and arc_do_user_evicts().
- *
- * Note that the majority of the performance stats are manipulated
- * with atomic operations.
- */
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zfs_context.h>
-#include <sys/arc.h>
-#include <sys/refcount.h>
-#ifdef _KERNEL
-#include <sys/dnlc.h>
-#endif
-#include <sys/callb.h>
-#include <sys/kstat.h>
-#include <sys/sdt.h>
-
-static kmutex_t arc_reclaim_thr_lock;
-static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
-static uint8_t arc_thread_exit;
-
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
-
-typedef enum arc_reclaim_strategy {
- ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
- ARC_RECLAIM_CONS /* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
-
-/* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
-
-/*
- * minimum lifespan of a prefetch block in clock ticks
- * (initialized in arc_init())
- *
- * arc_evict() compares this against LBOLT - b_arc_access when deciding
- * whether a prefetch/indirect header may be considered yet.
- */
-static int arc_min_prefetch_lifespan;
-
-static int arc_dead; /* when non-zero, hdr_recl() no longer signals arc_reclaim_thr_cv */
-
-/*
- * These tunables are for performance analysis.
- *
- * Each one is registered twice below: as a loader tunable via
- * TUNABLE_ULONG ("vfs.zfs.arc_max" / "vfs.zfs.arc_min") and as a sysctl
- * node under _vfs_zfs created with CTLFLAG_RDTUN.
- */
-u_long zfs_arc_max;
-u_long zfs_arc_min;
-TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
-TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
- "Maximum ARC size");
-SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
- "Minimum ARC size");
-
-/*
- * Note that buffers can be in one of 5 states:
- * ARC_anon - anonymous (discussed below)
- * ARC_mru - recently used, currently cached
- * ARC_mru_ghost - recently used, no longer in cache
- * ARC_mfu - frequently used, currently cached
- * ARC_mfu_ghost - frequently used, no longer in cache
- * When there are no active references to the buffer, they
- * are linked onto one of the lists in arc. These are the
- * only buffers that can be evicted or deleted.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru list.
- */
-
-typedef struct arc_state {
- list_t arcs_list; /* linked list of evictable buffer in state */
- uint64_t arcs_lsize; /* total size of buffers in the linked list */
- uint64_t arcs_size; /* total size of all buffers in this state */
- kmutex_t arcs_mtx; /* taken around arcs_list insert/remove, e.g. in add_reference() */
-} arc_state_t;
-
-/*
- * The 5 states:
- * (presumably the objects that the lower-case arc_anon, arc_mru, ...
- * pointers refer to; that assignment is not visible in this part of the
- * file -- TODO confirm in arc_init())
- */
-static arc_state_t ARC_anon;
-static arc_state_t ARC_mru;
-static arc_state_t ARC_mru_ghost;
-static arc_state_t ARC_mfu;
-static arc_state_t ARC_mfu_ghost;
-
-typedef struct arc_stats { /* one kstat_named_t per exported ARC counter */
- kstat_named_t arcstat_hits;
- kstat_named_t arcstat_misses;
- kstat_named_t arcstat_demand_data_hits;
- kstat_named_t arcstat_demand_data_misses;
- kstat_named_t arcstat_demand_metadata_hits;
- kstat_named_t arcstat_demand_metadata_misses;
- kstat_named_t arcstat_prefetch_data_hits;
- kstat_named_t arcstat_prefetch_data_misses;
- kstat_named_t arcstat_prefetch_metadata_hits;
- kstat_named_t arcstat_prefetch_metadata_misses;
- kstat_named_t arcstat_mru_hits;
- kstat_named_t arcstat_mru_ghost_hits;
- kstat_named_t arcstat_mfu_hits;
- kstat_named_t arcstat_mfu_ghost_hits;
- kstat_named_t arcstat_deleted;
- kstat_named_t arcstat_recycle_miss;
- kstat_named_t arcstat_mutex_miss;
- kstat_named_t arcstat_evict_skip;
- kstat_named_t arcstat_hash_elements;
- kstat_named_t arcstat_hash_elements_max;
- kstat_named_t arcstat_hash_collisions;
- kstat_named_t arcstat_hash_chains;
- kstat_named_t arcstat_hash_chain_max;
- kstat_named_t arcstat_p;
- kstat_named_t arcstat_c;
- kstat_named_t arcstat_c_min;
- kstat_named_t arcstat_c_max;
- kstat_named_t arcstat_size;
-} arc_stats_t;
-
-static arc_stats_t arc_stats = { /* positional: entry order must match the member order of arc_stats_t */
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 },
- { "demand_data_hits", KSTAT_DATA_UINT64 },
- { "demand_data_misses", KSTAT_DATA_UINT64 },
- { "demand_metadata_hits", KSTAT_DATA_UINT64 },
- { "demand_metadata_misses", KSTAT_DATA_UINT64 },
- { "prefetch_data_hits", KSTAT_DATA_UINT64 },
- { "prefetch_data_misses", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
- { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
- { "mru_hits", KSTAT_DATA_UINT64 },
- { "mru_ghost_hits", KSTAT_DATA_UINT64 },
- { "mfu_hits", KSTAT_DATA_UINT64 },
- { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
- { "deleted", KSTAT_DATA_UINT64 },
- { "recycle_miss", KSTAT_DATA_UINT64 },
- { "mutex_miss", KSTAT_DATA_UINT64 },
- { "evict_skip", KSTAT_DATA_UINT64 },
- { "hash_elements", KSTAT_DATA_UINT64 },
- { "hash_elements_max", KSTAT_DATA_UINT64 },
- { "hash_collisions", KSTAT_DATA_UINT64 },
- { "hash_chains", KSTAT_DATA_UINT64 },
- { "hash_chain_max", KSTAT_DATA_UINT64 },
- { "p", KSTAT_DATA_UINT64 },
- { "c", KSTAT_DATA_UINT64 },
- { "c_min", KSTAT_DATA_UINT64 },
- { "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 }
-};
-
-#define ARCSTAT(stat) (arc_stats.stat.value.ui64) /* the 64-bit value slot of one counter in arc_stats */
-
-#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val));
-
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
-#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
-
-#define ARCSTAT_MAX(stat, val) { \
- uint64_t m; \
- while ((val) > (m = arc_stats.stat.value.ui64) && \
- (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
- continue; \
-}
-
-#define ARCSTAT_MAXSTAT(stat) \
- ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
-
-/*
- * We define a macro to allow ARC hits/misses to be easily broken down by
- * two separate conditions, giving a total of four different subtypes for
- * each of hits and misses (so eight statistics total).
- *
- * The counter that gets bumped is named by token pasting:
- * arcstat_<stat1|notstat1>_<stat2|notstat2>_<stat>, choosing stat1 when
- * cond1 holds (notstat1 otherwise) and likewise for cond2.
- *
- * Notes on the helper macros above:
- * - ARCSTAT_INCR adds val to the counter with atomic_add_64();
- * ARCSTAT_BUMP / ARCSTAT_BUMPDOWN are the +1 / -1 forms.
- * - ARCSTAT_MAX raises the counter to val with a compare-and-swap loop
- * and leaves it alone when it is already >= val. val is evaluated more
- * than once, so it must be free of side effects.
- * - ARCSTAT_MAXSTAT(stat) feeds the current value of stat into stat_max.
- *
- * NOTE(review): ARCSTAT_INCR already ends in ';', ARCSTAT_MAX expands to a
- * bare { } block and ARCSTAT_CONDSTAT to a bare if/else, so none of them
- * is safe as the un-braced body of an if that has an else. Wrapping them
- * in do { } while (0) would be the usual remedy, but every call site in
- * the file would need checking first.
- */
-#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
- if (cond1) { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
- } \
- } else { \
- if (cond2) { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
- } else { \
- ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
- } \
- }
-
-kstat_t *arc_ksp; /* presumably the kstat handle that exports arc_stats; set outside this view */
-static arc_state_t *arc_anon; /* the five state pointers used by all code below; */
-static arc_state_t *arc_mru; /* they are assigned outside this view */
-static arc_state_t *arc_mru_ghost;
-static arc_state_t *arc_mfu;
-static arc_state_t *arc_mfu_ghost;
-
-/*
- * There are several ARC variables that are critical to export as kstats --
- * but we don't want to have to grovel around in the kstat whenever we wish to
- * manipulate them. For these variables, we therefore define them to be in
- * terms of the statistic variable. This assures that we are not introducing
- * the possibility of inconsistency by having shadow copies of the variables,
- * while still allowing the code to be readable.
- *
- * Each name below expands to an lvalue inside arc_stats, which is why its
- * address can be handed to atomic_add_64() (arc_buf_destroy() does this
- * with arc_size).
- */
-#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
-#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
-#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
-#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
-#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-
-static int arc_no_grow; /* Don't try to grow cache size */
-static uint64_t arc_tempreserve;
-
-typedef struct arc_callback arc_callback_t;
-
-struct arc_callback { /* arc_buf_hdr.b_acb points at one of these */
- void *acb_private;
- arc_done_func_t *acb_done;
- arc_byteswap_func_t *acb_byteswap;
- arc_buf_t *acb_buf;
- zio_t *acb_zio_dummy;
- arc_callback_t *acb_next; /* link to another arc_callback (presumably the next waiter; users are outside this view) */
-};
-
-typedef struct arc_write_callback arc_write_callback_t;
-
-struct arc_write_callback { /* write-side counterpart; users are outside this view */
- void *awcb_private;
- arc_done_func_t *awcb_ready;
- arc_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
-};
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva; /* b_dva, b_birth and b_spa form the hash key (see HDR_LOCK) */
- uint64_t b_birth;
- uint64_t b_cksum0;
-
- kmutex_t b_freeze_lock; /* guards b_freeze_cksum */
- zio_cksum_t *b_freeze_cksum; /* debug checksum set by arc_cksum_compute(), NULL when not frozen */
-
- arc_buf_hdr_t *b_hash_next; /* next header in the same hash bucket */
- arc_buf_t *b_buf; /* head of this header's arc_buf_t list (linked by b_next) */
- uint32_t b_flags; /* public ARC_* flags plus the private ones defined below */
- uint32_t b_datacnt; /* number of bufs on b_buf that currently hold data */
-
- arc_callback_t *b_acb;
- kcondvar_t b_cv;
-
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- spa_t *b_spa;
-
- /* protected by arc state mutex */
- arc_state_t *b_state;
- list_node_t b_arc_node; /* linkage on b_state->arcs_list while evictable */
-
- /* updated atomically */
- clock_t b_arc_access;
-
- /* self protecting */
- refcount_t b_refcnt;
-};
-
-static arc_buf_t *arc_eviction_list; /* bufs with an eviction callback queued by arc_hdr_destroy() */
-static kmutex_t arc_eviction_mtx; /* held while arc_eviction_list is modified */
-static arc_buf_hdr_t arc_eviction_hdr; /* placeholder b_hdr for bufs sitting on arc_eviction_list */
-static void arc_get_data_buf(arc_buf_t *buf);
-static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
-
-#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
-
-/*
- * Private ARC flags. These flags are private ARC only flags that will show up
- * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
- * be passed in as arc_flags in things like arc_read. However, these flags
- * should never be passed and should only be set by ARC code. When adding new
- * public flags, make sure not to smash the private ones.
- *
- * The private flags occupy bits 9 through 14. The HDR_*() macros below
- * return the masked flag bit itself (non-zero when set), not 0/1.
- */
-
-#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
-#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
-#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
-#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
-#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
-#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
-
-#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
-#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
-#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
-#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
-#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
-
-/*
- * Hash table routines
- *
- * The table is an array of singly linked bucket chains (ht_table) indexed
- * by buf_hash() & ht_mask.  Buckets share a fixed pool of BUF_LOCKS
- * mutexes: bucket idx is covered by lock (idx & (BUF_LOCKS - 1)), so
- * BUF_LOCKS must stay a power of two.
- */
-
-#define HT_LOCK_PAD 128
-
-/*
- * One hash lock.  In the kernel each entry is padded out to HT_LOCK_PAD
- * bytes (presumably to keep neighbouring locks on separate cache lines).
- */
-struct ht_lock {
-	kmutex_t ht_lock;
-#ifdef _KERNEL
-	unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-#define BUF_LOCKS 256
-typedef struct buf_hash_table {
-	uint64_t ht_mask;		/* number of buckets - 1 */
-	arc_buf_hdr_t **ht_table;	/* bucket heads, chained by b_hash_next */
-	struct ht_lock ht_locks[BUF_LOCKS];
-} buf_hash_table_t;
-
-static buf_hash_table_t buf_hash_table;
-
-/* Bucket index for a (spa, dva, birth) key. */
-#define BUF_HASH_INDEX(spa, dva, birth) \
-	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-/*
- * Lock entry / mutex covering bucket idx.  Macro arguments are
- * parenthesized so that expression arguments (e.g. "a | b" or "&hdrs[i]")
- * expand with the intended precedence.
- */
-#define BUF_HASH_LOCK_NTRY(idx) \
-	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
-#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-/* Mutex covering the bucket that header buf hashes to. */
-#define HDR_LOCK(buf) \
-	(BUF_HASH_LOCK(BUF_HASH_INDEX((buf)->b_spa, &(buf)->b_dva, \
-	    (buf)->b_birth)))
-
-uint64_t zfs_crc64_table[256];
-
-/*
- * Hash a (spa, dva, birth) key.  The bytes of the DVA are run through a
- * table-driven CRC using zfs_crc64_table (filled in by buf_init()), and
- * the result is then XORed with the spa pointer shifted right by 8 bits
- * and with birth.
- */
-static uint64_t
-buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
-{
-	const uint8_t *bytep = (uint8_t *)dva;
-	const uint8_t *const endp = bytep + sizeof (dva_t);
-	uint64_t hash = -1ULL;
-
-	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-	while (bytep != endp) {
-		hash = (hash >> 8) ^ zfs_crc64_table[(hash ^ *bytep) & 0xFF];
-		bytep++;
-	}
-
-	hash ^= ((uintptr_t)spa >> 8) ^ birth;
-
-	return (hash);
-}
-
-/*
- * BUF_EMPTY: true when the header carries no identity, i.e. both DVA
- * words and the birth value are zero.
- */
-#define BUF_EMPTY(buf) \
-	((buf)->b_dva.dva_word[0] == 0 && \
-	(buf)->b_dva.dva_word[1] == 0 && \
-	(buf)->b_birth == 0)
-
-/*
- * BUF_EQUAL: true when header buf has exactly the identity
- * (spa, dva, birth).  The whole expansion and every argument are
- * parenthesized so the macro can be negated or combined with other
- * operators without changing its meaning.
- */
-#define BUF_EQUAL(spa, dva, birth, buf) \
-	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
-	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
-	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
-
-/*
- * Look up the header with identity (spa, dva, birth).
- *
- * On a hit the header is returned and *lockp is set to its hash mutex,
- * which is still held; the caller is responsible for dropping it.  On a
- * miss the mutex is released, *lockp is set to NULL and NULL is returned.
- */
-static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
-{
-	uint64_t bucket = BUF_HASH_INDEX(spa, dva, birth);
-	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
-	arc_buf_hdr_t *hdr;
-
-	mutex_enter(bucket_lock);
-
-	hdr = buf_hash_table.ht_table[bucket];
-	while (hdr != NULL) {
-		if (BUF_EQUAL(spa, dva, birth, hdr)) {
-			*lockp = bucket_lock;
-			return (hdr);
-		}
-		hdr = hdr->b_hash_next;
-	}
-
-	mutex_exit(bucket_lock);
-	*lockp = NULL;
-	return (NULL);
-}
-
-/*
- * Add buf to the hash table unless a header with the same identity is
- * already there.
- *
- * Returns NULL when buf was inserted, or the pre-existing equal header
- * when one was found (buf is then left untouched).  In both cases *lockp
- * is set to the bucket's hash mutex and that mutex is held on return.
- */
-static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
-{
-	uint64_t bucket = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
-	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
-	arc_buf_hdr_t *cur;
-	uint32_t chain_len = 0;
-
-	ASSERT(!HDR_IN_HASH_TABLE(buf));
-	*lockp = bucket_lock;
-	mutex_enter(bucket_lock);
-
-	/* Walk the chain looking for a duplicate, counting entries as we go. */
-	for (cur = buf_hash_table.ht_table[bucket]; cur != NULL;
-	    cur = cur->b_hash_next) {
-		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, cur))
-			return (cur);
-		chain_len++;
-	}
-
-	/* No duplicate: push buf onto the front of the chain. */
-	buf->b_hash_next = buf_hash_table.ht_table[bucket];
-	buf_hash_table.ht_table[bucket] = buf;
-	buf->b_flags |= ARC_IN_HASH_TABLE;
-
-	/*
-	 * Hash table statistics.  chain_len is the bucket's length before
-	 * the insert, so 1 means this insert just turned it into a chain.
-	 */
-	if (chain_len > 0) {
-		ARCSTAT_BUMP(arcstat_hash_collisions);
-		if (chain_len == 1)
-			ARCSTAT_BUMP(arcstat_hash_chains);
-
-		ARCSTAT_MAX(arcstat_hash_chain_max, chain_len);
-	}
-
-	ARCSTAT_BUMP(arcstat_hash_elements);
-	ARCSTAT_MAXSTAT(arcstat_hash_elements);
-
-	return (NULL);
-}
-
-/*
- * Unlink buf from its hash bucket.  The bucket's hash mutex must already
- * be held and buf must currently be in the table (both are asserted).
- */
-static void
-buf_hash_remove(arc_buf_hdr_t *buf)
-{
-	uint64_t bucket = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
-	arc_buf_hdr_t **linkp;
-	arc_buf_hdr_t *head;
-
-	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(bucket)));
-	ASSERT(HDR_IN_HASH_TABLE(buf));
-
-	/* Find the link (bucket head or a b_hash_next) that points at buf. */
-	for (linkp = &buf_hash_table.ht_table[bucket]; *linkp != buf;
-	    linkp = &(*linkp)->b_hash_next)
-		ASSERT(*linkp != NULL);
-
-	*linkp = buf->b_hash_next;
-	buf->b_hash_next = NULL;
-	buf->b_flags &= ~ARC_IN_HASH_TABLE;
-
-	/* Hash table statistics. */
-	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
-
-	/* Exactly one entry left: the bucket is no longer a chain. */
-	head = buf_hash_table.ht_table[bucket];
-	if (head != NULL && head->b_hash_next == NULL)
-		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
-}
-
-/*
- * kmem caches backing arc_buf_hdr_t and arc_buf_t allocations; created in
- * buf_init() and torn down in buf_fini().
- */
-static kmem_cache_t *hdr_cache;
-static kmem_cache_t *buf_cache;
-
-/*
- * Release everything buf_init() set up: the bucket array, the hash
- * mutexes and both kmem caches.
- */
-static void
-buf_fini(void)
-{
-	struct ht_lock *lk;
-
-	kmem_free(buf_hash_table.ht_table,
-	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
-
-	for (lk = &buf_hash_table.ht_locks[0];
-	    lk < &buf_hash_table.ht_locks[BUF_LOCKS]; lk++)
-		mutex_destroy(&lk->ht_lock);
-
-	kmem_cache_destroy(hdr_cache);
-	kmem_cache_destroy(buf_cache);
-}
-
-/*
- * kmem cache constructor for arc_buf_hdr_t (registered in buf_init()).
- * Zeroes the whole header and then initializes the embedded reference
- * count and condition variable.  Always reports success (0).
- */
-/* ARGSUSED */
-static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
-{
-	arc_buf_hdr_t *hdr = vbuf;
-
-	bzero(hdr, sizeof (*hdr));
-	refcount_create(&hdr->b_refcnt);
-	cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
-
-	return (0);
-}
-
-/*
- * kmem cache destructor for arc_buf_hdr_t (registered in buf_init()).
- * Undoes hdr_cons(): tears down the reference count and the condition
- * variable embedded in the header.
- */
-/* ARGSUSED */
-static void
-hdr_dest(void *vbuf, void *unused)
-{
-	arc_buf_hdr_t *hdr = vbuf;
-
-	refcount_destroy(&hdr->b_refcnt);
-	cv_destroy(&hdr->b_cv);
-}
-
-/*
- * kmem cache reclaim callback for the header cache -- invoked when memory
- * is low.  Wakes the reclaim thread through arc_reclaim_thr_cv.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
-	dprintf("hdr_recl called\n");
-
-	/*
-	 * umem also runs the reclaim function while the buf cache is being
-	 * destroyed, which happens after arc_fini(); there is nobody left
-	 * to signal in that case.
-	 */
-	if (arc_dead)
-		return;
-
-	cv_signal(&arc_reclaim_thr_cv);
-}
-
-/*
- * Set up the buffer hash table: size and allocate the bucket array,
- * create the header and buf kmem caches, fill in zfs_crc64_table and
- * initialize the hash mutexes.
- */
-static void
-buf_init(void)
-{
-	uint64_t nbuckets = 1ULL << 12;
-	int i, bit;
-
-	/*
-	 * Size the table so that it could index all of physical memory at
-	 * an average block size of 64K.  That costs
-	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
-	 */
-	while (nbuckets * 65536 < (uint64_t)physmem * PAGESIZE)
-		nbuckets <<= 1;
-
-	/*
-	 * The allocation may not sleep; on failure halve the table and
-	 * try again.
-	 */
-	for (;;) {
-		buf_hash_table.ht_mask = nbuckets - 1;
-		buf_hash_table.ht_table =
-		    kmem_zalloc(nbuckets * sizeof (void*), KM_NOSLEEP);
-		if (buf_hash_table.ht_table != NULL)
-			break;
-		ASSERT(nbuckets > (1ULL << 8));
-		nbuckets >>= 1;
-	}
-
-	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
-	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
-	    0, NULL, NULL, NULL, NULL, NULL, 0);
-
-	/* Build the 256-entry CRC table, eight shift/xor steps per entry. */
-	for (i = 0; i < 256; i++) {
-		uint64_t *slot = &zfs_crc64_table[i];
-
-		*slot = i;
-		for (bit = 8; bit > 0; bit--)
-			*slot = (*slot >> 1) ^ (-(*slot & 1) & ZFS_CRC64_POLY);
-	}
-
-	for (i = 0; i < BUF_LOCKS; i++)
-		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
-		    NULL, MUTEX_DEFAULT, NULL);
-}
-
-#define ARC_MINTIME (hz>>4) /* 62 ms */
-
-/*
- * Debug check (active only with ZFS_DEBUG_MODIFY set in zfs_flags):
- * recompute the checksum of the buffer's data and panic if it no longer
- * matches the checksum saved when the buffer was frozen.  Buffers that
- * have no saved checksum, or whose header is flagged ARC_IO_ERROR, are
- * not checked.
- */
-static void
-arc_cksum_verify(arc_buf_t *buf)
-{
-	zio_cksum_t zc;
-
-	if ((zfs_flags & ZFS_DEBUG_MODIFY) == 0)
-		return;
-
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL &&
-	    !(buf->b_hdr->b_flags & ARC_IO_ERROR)) {
-		fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
-		if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
-			panic("buffer modified while frozen!");
-	}
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
-}
-
-/*
- * Debug helper (active only with ZFS_DEBUG_MODIFY set in zfs_flags):
- * if the header has no saved checksum yet, allocate one and fill it with
- * the checksum of the buffer's current data.  An existing checksum is
- * left untouched.
- */
-static void
-arc_cksum_compute(arc_buf_t *buf)
-{
-	if ((zfs_flags & ZFS_DEBUG_MODIFY) == 0)
-		return;
-
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum == NULL) {
-		buf->b_hdr->b_freeze_cksum =
-		    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
-		fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
-		    buf->b_hdr->b_freeze_cksum);
-	}
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
-}
-
-/*
- * Debug helper (active only with ZFS_DEBUG_MODIFY set in zfs_flags):
- * announce that buf is about to be modified.  Panics unless the buffer is
- * anonymous and has no I/O in progress, verifies the saved checksum one
- * last time and then discards it.
- */
-void
-arc_buf_thaw(arc_buf_t *buf)
-{
-	if ((zfs_flags & ZFS_DEBUG_MODIFY) == 0)
-		return;
-
-	if (buf->b_hdr->b_state != arc_anon)
-		panic("modifying non-anon buffer!");
-	if (HDR_IO_IN_PROGRESS(buf->b_hdr))
-		panic("modifying buffer while i/o in progress!");
-
-	arc_cksum_verify(buf);
-
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		buf->b_hdr->b_freeze_cksum = NULL;
-	}
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
-}
-
-/*
- * Debug helper (active only with ZFS_DEBUG_MODIFY set in zfs_flags):
- * record a checksum of buf's contents so that later modification can be
- * detected by arc_cksum_verify().
- */
-void
-arc_buf_freeze(arc_buf_t *buf)
-{
-	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
-		    buf->b_hdr->b_state == arc_anon);
-		arc_cksum_compute(buf);
-	}
-}
-
-/*
- * Take a reference on ab for tag; the header's hash lock must be held.
- *
- * When this is the first reference on a non-anonymous header, the header
- * stops being evictable: it is taken off its state's list, arcs_lsize is
- * reduced accordingly and any ARC_PREFETCH flag is cleared.
- */
-static void
-add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
-{
-	uint64_t delta;
-
-	ASSERT(MUTEX_HELD(hash_lock));
-
-	/* Nothing more to do unless this was the 0 -> 1 transition. */
-	if (refcount_add(&ab->b_refcnt, tag) != 1 || ab->b_state == arc_anon)
-		return;
-
-	delta = ab->b_size * ab->b_datacnt;
-
-	ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
-	mutex_enter(&ab->b_state->arcs_mtx);
-	ASSERT(list_link_active(&ab->b_arc_node));
-	list_remove(&ab->b_state->arcs_list, ab);
-	if (GHOST_STATE(ab->b_state)) {
-		/* ghost headers hold no data; they count as b_size */
-		ASSERT3U(ab->b_datacnt, ==, 0);
-		ASSERT3P(ab->b_buf, ==, NULL);
-		delta = ab->b_size;
-	}
-	ASSERT(delta > 0);
-	ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
-	atomic_add_64(&ab->b_state->arcs_lsize, -delta);
-	mutex_exit(&ab->b_state->arcs_mtx);
-
-	/* a referenced buffer is no longer a mere prefetch */
-	if (ab->b_flags & ARC_PREFETCH)
-		ab->b_flags &= ~ARC_PREFETCH;
-}
-
-/*
- * Drop tag's reference on ab and return the remaining reference count.
- * Unless the header is anonymous, its hash lock must be held.
- *
- * When the last reference on a non-anonymous header goes away the header
- * becomes evictable again: it is put at the head of its state's list and
- * arcs_lsize grows by the size of the data it holds.
- */
-static int
-remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
-{
-	arc_state_t *state = ab->b_state;
-	int remaining;
-
-	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
-	ASSERT(!GHOST_STATE(state));
-
-	remaining = refcount_remove(&ab->b_refcnt, tag);
-	if (remaining != 0 || state == arc_anon)
-		return (remaining);
-
-	ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-	mutex_enter(&state->arcs_mtx);
-	ASSERT(!list_link_active(&ab->b_arc_node));
-	list_insert_head(&state->arcs_list, ab);
-	ASSERT(ab->b_datacnt > 0);
-	atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
-	ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
-	mutex_exit(&state->arcs_mtx);
-
-	return (remaining);
-}
-
-/*
- * Move the supplied buffer to the indicated state. The mutex
- * for the buffer must be held by the caller.
- *
- * What happens, in order:
- * - If the header has no references it is on old_state's evictable list
- * (unless old_state is arc_anon): it is unlinked there and put at the
- * head of new_state's list (unless new_state is arc_anon), and the two
- * arcs_lsize totals are adjusted.
- * - If the header goes from a non-anonymous state to arc_anon it is
- * removed from the hash table.
- * - The arcs_size totals of both states are adjusted and b_state is
- * updated.
- *
- * The amount accounted is b_datacnt * b_size, except that a ghost header
- * without data counts as b_size.
- */
-static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
-{
- arc_state_t *old_state = ab->b_state;
- int64_t refcnt = refcount_count(&ab->b_refcnt);
- uint64_t from_delta, to_delta;
-
- ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
- ASSERT(refcnt == 0 || ab->b_datacnt > 0);
- ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
-
- from_delta = to_delta = ab->b_datacnt * ab->b_size;
-
- /*
- * If this buffer is evictable, transfer it from the
- * old state list to the new state list.
- */
- if (refcnt == 0) {
- if (old_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); /* only lock if the caller has not already */
-
- if (use_mutex)
- mutex_enter(&old_state->arcs_mtx);
-
- ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list, ab);
-
- /*
- * If prefetching out of the ghost cache,
- * we will have a non-null datacnt.
- */
- if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
- /* ghost elements have a ghost size */
- ASSERT(ab->b_buf == NULL);
- from_delta = ab->b_size;
- }
- ASSERT3U(old_state->arcs_lsize, >=, from_delta);
- atomic_add_64(&old_state->arcs_lsize, -from_delta);
-
- if (use_mutex)
- mutex_exit(&old_state->arcs_mtx);
- }
- if (new_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); /* same conditional locking as above */
-
- if (use_mutex)
- mutex_enter(&new_state->arcs_mtx);
-
- list_insert_head(&new_state->arcs_list, ab);
-
- /* ghost elements have a ghost size */
- if (GHOST_STATE(new_state)) {
- ASSERT(ab->b_datacnt == 0);
- ASSERT(ab->b_buf == NULL);
- to_delta = ab->b_size;
- }
- atomic_add_64(&new_state->arcs_lsize, to_delta);
- ASSERT3U(new_state->arcs_size + to_delta, >=,
- new_state->arcs_lsize);
-
- if (use_mutex)
- mutex_exit(&new_state->arcs_mtx);
- }
- }
-
- ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && old_state != arc_anon) {
- buf_hash_remove(ab);
- }
-
- /*
- * adjust state sizes
- *
- * Note that the ghost-size overrides of from_delta / to_delta above
- * only happen on the refcnt == 0 path; otherwise both are still
- * b_datacnt * b_size here.
- */
- if (to_delta)
- atomic_add_64(&new_state->arcs_size, to_delta);
- if (from_delta) {
- ASSERT3U(old_state->arcs_size, >=, from_delta);
- atomic_add_64(&old_state->arcs_size, -from_delta);
- }
- ab->b_state = new_state;
-}
-
-/*
- * Allocate a new anonymous buffer of the given size and contents type for
- * spa.  A fresh header and a single arc_buf_t are taken from their kmem
- * caches, a data block is attached via arc_get_data_buf(), and one
- * reference is taken on the header for tag.  Returns the new buf.
- */
-arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
-{
-	arc_buf_hdr_t *new_hdr;
-	arc_buf_t *new_buf;
-
-	ASSERT3U(size, >, 0);
-
-	/* Header first: identity-less, anonymous, never accessed. */
-	new_hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
-	ASSERT(BUF_EMPTY(new_hdr));
-	new_hdr->b_size = size;
-	new_hdr->b_type = type;
-	new_hdr->b_spa = spa;
-	new_hdr->b_state = arc_anon;
-	new_hdr->b_arc_access = 0;
-	mutex_init(&new_hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
-
-	/* Then the one buf that hangs off it. */
-	new_buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-	new_buf->b_hdr = new_hdr;
-	new_buf->b_data = NULL;
-	new_buf->b_efunc = NULL;
-	new_buf->b_private = NULL;
-	new_buf->b_next = NULL;
-	new_hdr->b_buf = new_buf;
-
-	arc_get_data_buf(new_buf);
-	new_hdr->b_datacnt = 1;
-	new_hdr->b_flags = 0;
-
-	ASSERT(refcount_is_zero(&new_hdr->b_refcnt));
-	(void) refcount_add(&new_hdr->b_refcnt, tag);
-
-	return (new_buf);
-}
-
-/*
- * Create another arc_buf_t on the same header as 'from', with its own
- * data block holding a copy of from's data.  The new buf is linked at the
- * front of the header's buf list and b_datacnt is incremented.
- */
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
-{
-	arc_buf_hdr_t *hdr = from->b_hdr;
-	uint64_t nbytes = hdr->b_size;
-	arc_buf_t *copy;
-
-	copy = kmem_cache_alloc(buf_cache, KM_SLEEP);
-	copy->b_hdr = hdr;
-	copy->b_data = NULL;
-	copy->b_efunc = NULL;
-	copy->b_private = NULL;
-
-	/* push onto the header's buf list */
-	copy->b_next = hdr->b_buf;
-	hdr->b_buf = copy;
-
-	arc_get_data_buf(copy);
-	bcopy(from->b_data, copy->b_data, nbytes);
-	hdr->b_datacnt += 1;
-
-	return (copy);
-}
-
-/*
- * Take a new reference, for tag, on a buffer the caller still has a
- * pointer to.  Does nothing if the buffer has lost its header or its
- * data in the meantime.  Otherwise the reference is added, the access is
- * recorded through arc_access() and the hit statistics are updated.
- */
-void
-arc_buf_add_ref(arc_buf_t *buf, void* tag)
-{
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
-
-	/*
-	 * b_hdr is sampled under arc_eviction_mtx: the buffer may be in the
-	 * middle of eviction via arc_do_user_evicts().
-	 */
-	mutex_enter(&arc_eviction_mtx);
-	hdr = buf->b_hdr;
-	if (hdr == NULL) {
-		mutex_exit(&arc_eviction_mtx);
-		return;
-	}
-	hash_lock = HDR_LOCK(hdr);
-	mutex_exit(&arc_eviction_mtx);
-
-	mutex_enter(hash_lock);
-	if (buf->b_data != NULL) {
-		ASSERT(buf->b_hdr == hdr);
-		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-		add_reference(hdr, hash_lock, tag);
-		arc_access(hdr, hash_lock);
-		mutex_exit(hash_lock);
-
-		ARCSTAT_BUMP(arcstat_hits);
-		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
-		    data, metadata, hits);
-		return;
-	}
-
-	/* The data is gone: this buffer has been evicted. */
-	mutex_exit(hash_lock);
-}
-
-/*
- * Release the data attached to buf and, if 'all' is set, the arc_buf_t
- * itself.
- *
- * recycle - the data block is not freed and arc_size is not reduced;
- *           the state accounting below is still performed.
- * all     - additionally unlink buf from its header's buf list and
- *           return it to buf_cache; buf must have no eviction callback.
- */
-static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
-{
-	arc_buf_t **linkp;
-
-	/* free up data associated with the buf */
-	if (buf->b_data != NULL) {
-		arc_state_t *state = buf->b_hdr->b_state;
-		uint64_t size = buf->b_hdr->b_size;
-		arc_buf_contents_t type = buf->b_hdr->b_type;
-
-		arc_cksum_verify(buf);
-		if (!recycle) {
-			switch (type) {
-			case ARC_BUFC_METADATA:
-				zio_buf_free(buf->b_data, size);
-				break;
-			default:
-				ASSERT(type == ARC_BUFC_DATA);
-				zio_data_buf_free(buf->b_data, size);
-				break;
-			}
-			atomic_add_64(&arc_size, -size);
-		}
-
-		/* evictable headers are also counted in arcs_lsize */
-		if (list_link_active(&buf->b_hdr->b_arc_node)) {
-			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
-			ASSERT(state != arc_anon);
-			ASSERT3U(state->arcs_lsize, >=, size);
-			atomic_add_64(&state->arcs_lsize, -size);
-		}
-		ASSERT3U(state->arcs_size, >=, size);
-		atomic_add_64(&state->arcs_size, -size);
-
-		buf->b_data = NULL;
-		ASSERT(buf->b_hdr->b_datacnt > 0);
-		buf->b_hdr->b_datacnt -= 1;
-	}
-
-	/* only remove the buf if requested */
-	if (!all)
-		return;
-
-	/* find the link that points at buf and splice buf out */
-	linkp = &buf->b_hdr->b_buf;
-	while (*linkp != buf)
-		linkp = &(*linkp)->b_next;
-	*linkp = buf->b_next;
-
-	ASSERT(buf->b_efunc == NULL);
-
-	/* clean up the buf */
-	buf->b_hdr = NULL;
-	kmem_cache_free(buf_cache, buf);
-}
-
-/*
- * Tear down an unreferenced, anonymous header with no I/O in progress
- * and return it to hdr_cache.
- *
- * Any remaining identity is wiped, every buf still attached is disposed
- * of, and the freeze checksum and lock are released.  Bufs that have an
- * eviction callback are not freed here: their data is released and they
- * are moved onto arc_eviction_list (with b_hdr pointing at
- * arc_eviction_hdr) under arc_eviction_mtx.
- */
-static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
-{
-	arc_buf_t *buf;
-
-	ASSERT(refcount_is_zero(&hdr->b_refcnt));
-	ASSERT3P(hdr->b_state, ==, arc_anon);
-	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-
-	if (!BUF_EMPTY(hdr)) {
-		ASSERT(!HDR_IN_HASH_TABLE(hdr));
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
-	}
-
-	while ((buf = hdr->b_buf) != NULL) {
-		if (buf->b_efunc == NULL) {
-			/* no callback: free data and buf in one go */
-			arc_buf_destroy(buf, FALSE, TRUE);
-			continue;
-		}
-
-		/* callback pending: free the data, queue the buf */
-		mutex_enter(&arc_eviction_mtx);
-		ASSERT(buf->b_hdr != NULL);
-		arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
-		hdr->b_buf = buf->b_next;
-		buf->b_hdr = &arc_eviction_hdr;
-		buf->b_next = arc_eviction_list;
-		arc_eviction_list = buf;
-		mutex_exit(&arc_eviction_mtx);
-	}
-
-	if (hdr->b_freeze_cksum != NULL) {
-		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		hdr->b_freeze_cksum = NULL;
-	}
-	mutex_destroy(&hdr->b_freeze_lock);
-
-	ASSERT(!list_link_active(&hdr->b_arc_node));
-	ASSERT3P(hdr->b_hash_next, ==, NULL);
-	ASSERT3P(hdr->b_acb, ==, NULL);
-	kmem_cache_free(hdr_cache, hdr);
-}
-
-/*
- * Drop tag's reference on buf and free what can be freed.  buf must hold
- * data and must not have an eviction callback.
- *
- * - Hashed (non-anonymous) header: under the hash lock, the buf is
- *   destroyed if the header holds more than one data copy; otherwise it
- *   stays attached and the header is flagged ARC_BUF_AVAILABLE.
- * - Anonymous header without I/O in progress: if references remain only
- *   the buf is destroyed, otherwise the whole header is.
- * - Anonymous header with I/O in progress (an async write): the
- *   reference is dropped under arc_eviction_mtx and the header is
- *   destroyed only if the I/O is seen to have finished in the meantime.
- */
-void
-arc_buf_free(arc_buf_t *buf, void *tag)
-{
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-	int hashed = hdr->b_state != arc_anon;
-	int destroy_hdr;
-
-	ASSERT(buf->b_efunc == NULL);
-	ASSERT(buf->b_data != NULL);
-
-	if (hashed) {
-		kmutex_t *hash_lock = HDR_LOCK(hdr);
-
-		mutex_enter(hash_lock);
-		(void) remove_reference(hdr, hash_lock, tag);
-		if (hdr->b_datacnt > 1)
-			arc_buf_destroy(buf, FALSE, TRUE);
-		else
-			hdr->b_flags |= ARC_BUF_AVAILABLE;
-		mutex_exit(hash_lock);
-		return;
-	}
-
-	if (!HDR_IO_IN_PROGRESS(hdr)) {
-		if (remove_reference(hdr, NULL, tag) > 0) {
-			ASSERT(HDR_IO_ERROR(hdr));
-			arc_buf_destroy(buf, FALSE, TRUE);
-		} else {
-			arc_hdr_destroy(hdr);
-		}
-		return;
-	}
-
-	/*
-	 * We are in the middle of an async write.  Don't destroy
-	 * this buffer unless the write completes before we finish
-	 * decrementing the reference count.
-	 */
-	mutex_enter(&arc_eviction_mtx);
-	(void) remove_reference(hdr, NULL, tag);
-	ASSERT(refcount_is_zero(&hdr->b_refcnt));
-	destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-	mutex_exit(&arc_eviction_mtx);
-	if (destroy_hdr)
-		arc_hdr_destroy(hdr);
-}
-
-/*
- * Drop tag's reference on buf.  Returns non-zero when buf had no
- * eviction callback (b_efunc == NULL) at the time of the call, zero
- * otherwise.
- *
- * Anonymous buffers are simply handed to arc_buf_free().  For hashed
- * buffers without a callback, the buf is destroyed if the header holds
- * more than one data copy, else the header is flagged ARC_BUF_AVAILABLE.
- * Bufs that do have a callback are left attached.
- */
-int
-arc_buf_remove_ref(arc_buf_t *buf, void* tag)
-{
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-	kmutex_t *hash_lock = HDR_LOCK(hdr);
-	int no_callback = (buf->b_efunc == NULL);
-
-	if (hdr->b_state == arc_anon) {
-		arc_buf_free(buf, tag);
-		return (no_callback);
-	}
-
-	mutex_enter(hash_lock);
-	ASSERT(hdr->b_state != arc_anon);
-	ASSERT(buf->b_data != NULL);
-
-	(void) remove_reference(hdr, hash_lock, tag);
-	if (no_callback) {
-		if (hdr->b_datacnt > 1) {
-			arc_buf_destroy(buf, FALSE, TRUE);
-		} else {
-			ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
-			hdr->b_flags |= ARC_BUF_AVAILABLE;
-		}
-	}
-	ASSERT(no_callback || hdr->b_datacnt > 1 ||
-	    refcount_is_zero(&hdr->b_refcnt));
-	mutex_exit(hash_lock);
-
-	return (no_callback);
-}
-
-/*
- * Return the size (b_size) recorded in the header that `buf' belongs to.
- */
-int
-arc_buf_size(arc_buf_t *buf)
-{
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-
-	return (hdr->b_size);
-}
-
-/*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
- *
- * state   - arc_mru or arc_mfu (asserted); evicted headers move to the
- *           matching ghost state.
- * bytes   - eviction target; a negative value disables the early exit so
- *           the whole list is walked.
- * recycle - when set, try to steal a data block of exactly `bytes' bytes
- *           and of contents type `type'.
- *
- * Returns the stolen data block, or NULL if none was recycled.
- * Headers that have I/O in progress, that are recent prefetch/indirect
- * buffers, or whose hash lock cannot be taken without blocking are left
- * alone and counted in the evict_skip / mutex_miss statistics.
- */
-static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
-{
-	arc_state_t *evicted_state;
-	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-	arc_buf_hdr_t *ab, *ab_prev = NULL;
-	kmutex_t *hash_lock;
-	boolean_t have_lock;
-	void *stolen = NULL;
-
-	ASSERT(state == arc_mru || state == arc_mfu);
-
-	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
-	mutex_enter(&state->arcs_mtx);
-	mutex_enter(&evicted_state->arcs_mtx);
-
-	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
-		/* fetch the predecessor first: `ab' changes lists below */
-		ab_prev = list_prev(&state->arcs_list, ab);
-		/* prefetch buffers have a minimum lifespan */
-		if (HDR_IO_IN_PROGRESS(ab) ||
-		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-		    LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
-			skipped++;
-			continue;
-		}
-		/* "lookahead" for better eviction candidate */
-		if (recycle && ab->b_size != bytes &&
-		    ab_prev && ab_prev->b_size == bytes)
-			continue;
-		hash_lock = HDR_LOCK(ab);
-		/* the caller may already own this hash lock */
-		have_lock = MUTEX_HELD(hash_lock);
-		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
-			ASSERT(ab->b_datacnt > 0);
-			while (ab->b_buf) {
-				arc_buf_t *buf = ab->b_buf;
-				if (buf->b_data) {
-					bytes_evicted += ab->b_size;
-					if (recycle && ab->b_type == type &&
-					    ab->b_size == bytes) {
-						stolen = buf->b_data;
-						recycle = FALSE;
-					}
-				}
-				if (buf->b_efunc) {
-					/*
-					 * Buffers with an eviction callback
-					 * are queued on arc_eviction_list
-					 * instead of being freed here.
-					 */
-					mutex_enter(&arc_eviction_mtx);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, FALSE);
-					ab->b_buf = buf->b_next;
-					buf->b_hdr = &arc_eviction_hdr;
-					buf->b_next = arc_eviction_list;
-					arc_eviction_list = buf;
-					mutex_exit(&arc_eviction_mtx);
-				} else {
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, TRUE);
-				}
-			}
-			ASSERT(ab->b_datacnt == 0);
-			arc_change_state(evicted_state, ab, hash_lock);
-			ASSERT(HDR_IN_HASH_TABLE(ab));
-			ab->b_flags = ARC_IN_HASH_TABLE;
-			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
-			if (!have_lock)
-				mutex_exit(hash_lock);
-			if (bytes >= 0 && bytes_evicted >= bytes)
-				break;
-		} else {
-			missed += 1;
-		}
-	}
-
-	mutex_exit(&evicted_state->arcs_mtx);
-	mutex_exit(&state->arcs_mtx);
-
-	/*
-	 * Only report a shortfall when a real target was given: a negative
-	 * `bytes' would otherwise convert to a huge unsigned value and make
-	 * the comparison always true.  The state is a pointer, so print it
-	 * with %p.
-	 */
-	if (bytes >= 0 && bytes_evicted < (uint64_t)bytes)
-		dprintf("only evicted %lld bytes from %p",
-		    (longlong_t)bytes_evicted, (void *)state);
-
-	if (skipped)
-		ARCSTAT_INCR(arcstat_evict_skip, skipped);
-
-	if (missed)
-		ARCSTAT_INCR(arcstat_mutex_miss, missed);
-
-	return (stolen);
-}
-
-/*
- * Remove buffers from list until we've removed the specified number of
- * bytes.  Destroy the buffers that are removed.
- *
- * state - a ghost state (asserted).
- * bytes - deletion target; a negative value means "delete everything":
- *         instead of skipping a header whose hash lock is busy, the list
- *         lock is dropped, the hash lock is waited for, and the walk
- *         restarts from the tail.
- */
-static void
-arc_evict_ghost(arc_state_t *state, int64_t bytes)
-{
-	arc_buf_hdr_t *ab, *ab_prev;
-	kmutex_t *hash_lock;
-	uint64_t bytes_deleted = 0;
-	uint64_t bufs_skipped = 0;
-
-	ASSERT(GHOST_STATE(state));
-top:
-	mutex_enter(&state->arcs_mtx);
-	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
-		ab_prev = list_prev(&state->arcs_list, ab);
-		hash_lock = HDR_LOCK(ab);
-		if (mutex_tryenter(hash_lock)) {
-			ASSERT(!HDR_IO_IN_PROGRESS(ab));
-			ASSERT(ab->b_buf == NULL);
-			arc_change_state(arc_anon, ab, hash_lock);
-			mutex_exit(hash_lock);
-			ARCSTAT_BUMP(arcstat_deleted);
-			bytes_deleted += ab->b_size;
-			/*
-			 * Fire the probe before the header is destroyed so
-			 * that a consumer never sees a freed header.
-			 */
-			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
-			arc_hdr_destroy(ab);
-			if (bytes >= 0 && bytes_deleted >= bytes)
-				break;
-		} else {
-			if (bytes < 0) {
-				/*
-				 * Wait for the current holder to finish,
-				 * then rescan from the tail.
-				 */
-				mutex_exit(&state->arcs_mtx);
-				mutex_enter(hash_lock);
-				mutex_exit(hash_lock);
-				goto top;
-			}
-			bufs_skipped += 1;
-		}
-	}
-	mutex_exit(&state->arcs_mtx);
-
-	if (bufs_skipped) {
-		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
-		ASSERT(bytes >= 0);
-	}
-
-	/*
-	 * Only a non-negative target can be missed; comparing a negative
-	 * `bytes' against the unsigned counter would always report one.
-	 */
-	if (bytes >= 0 && bytes_deleted < (uint64_t)bytes)
-		dprintf("only deleted %lld bytes from %p",
-		    (longlong_t)bytes_deleted, (void *)state);
-}
-
-/*
- * Bring the lists back toward their targets:
- *  - evict from arc_mru while anon + mru exceeds arc_p;
- *  - trim arc_mru_ghost while anon + mru + mru ghost exceeds arc_c;
- *  - if arc_size exceeds arc_c, evict from arc_mfu and then trim
- *    arc_mfu_ghost while arc_size plus both ghost lists exceeds 2 * arc_c.
- * Every step is limited to the evictable (arcs_lsize) bytes of its list.
- */
-static void
-arc_adjust(void)
-{
-	int64_t resident, mru_excess, cache_excess, ghost_bytes;
-	int64_t table_excess;
-
-	resident = arc_anon->arcs_size + arc_mru->arcs_size;
-
-	if (resident > arc_p && arc_mru->arcs_lsize > 0) {
-		int64_t victim_bytes =
-		    MIN(arc_mru->arcs_lsize, resident - arc_p);
-
-		(void) arc_evict(arc_mru, victim_bytes, FALSE, ARC_BUFC_UNDEF);
-		resident = arc_anon->arcs_size + arc_mru->arcs_size;
-	}
-
-	mru_excess = resident + arc_mru_ghost->arcs_size - arc_c;
-
-	if (mru_excess > 0 && arc_mru_ghost->arcs_lsize > 0) {
-		ghost_bytes = MIN(arc_mru_ghost->arcs_lsize, mru_excess);
-		arc_evict_ghost(arc_mru_ghost, ghost_bytes);
-	}
-
-	cache_excess = arc_size - arc_c;
-	if (cache_excess <= 0)
-		return;
-
-	if (arc_mfu->arcs_lsize > 0) {
-		int64_t victim_bytes = MIN(arc_mfu->arcs_lsize, cache_excess);
-
-		(void) arc_evict(arc_mfu, victim_bytes, FALSE,
-		    ARC_BUFC_UNDEF);
-	}
-
-	table_excess = arc_size + arc_mru_ghost->arcs_lsize +
-	    arc_mfu_ghost->arcs_lsize - arc_c*2;
-
-	if (table_excess > 0 && arc_mfu_ghost->arcs_lsize > 0) {
-		ghost_bytes = MIN(arc_mfu_ghost->arcs_lsize, table_excess);
-		arc_evict_ghost(arc_mfu_ghost, ghost_bytes);
-	}
-}
-
-/*
- * Drain arc_eviction_list.  Each buffer is unlinked and detached from its
- * header (b_hdr = NULL) under arc_eviction_mtx; the mutex is then released
- * while the buffer's eviction callback, if any, runs and the buffer is
- * returned to buf_cache.  The callback must return 0 (VERIFY).
- */
-static void
-arc_do_user_evicts(void)
-{
-	arc_buf_t *victim;
-
-	mutex_enter(&arc_eviction_mtx);
-	while ((victim = arc_eviction_list) != NULL) {
-		arc_eviction_list = victim->b_next;
-		victim->b_hdr = NULL;
-		mutex_exit(&arc_eviction_mtx);
-
-		if (victim->b_efunc != NULL)
-			VERIFY(victim->b_efunc(victim) == 0);
-
-		victim->b_private = NULL;
-		victim->b_efunc = NULL;
-		kmem_cache_free(buf_cache, victim);
-		mutex_enter(&arc_eviction_mtx);
-	}
-	mutex_exit(&arc_eviction_mtx);
-}
-
-/*
- * Flush all *evictable* data from the cache.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- *
- * The MRU and MFU lists are evicted until they are empty, both ghost
- * lists are then emptied, and finally any queued eviction callbacks are
- * run while holding arc_reclaim_thr_lock.
- */
-void
-arc_flush(void)
-{
-	while (list_head(&arc_mru->arcs_list) != NULL)
-		(void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
-
-	while (list_head(&arc_mfu->arcs_list) != NULL)
-		(void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
-
-	arc_evict_ghost(arc_mru_ghost, -1);
-	arc_evict_ghost(arc_mfu_ghost, -1);
-
-	mutex_enter(&arc_reclaim_thr_lock);
-	arc_do_user_evicts();
-	mutex_exit(&arc_reclaim_thr_lock);
-
-	ASSERT(arc_eviction_list == NULL);
-}
-
-/* log2(fraction of arc to reclaim) */
-int arc_shrink_shift = 5;
-
-/*
- * Lower the cache targets.  When arc_c is above arc_c_min, arc_c is
- * reduced by arc_c >> arc_shrink_shift (but not below arc_c_min) and
- * arc_p by the same fraction of itself; arc_c is then capped at
- * MAX(arc_size, arc_c_min) and arc_p is reset to arc_c / 2 if it ended up
- * above arc_c.  If the cache is larger than the target afterwards,
- * arc_adjust() is called to evict.
- */
-void
-arc_shrink(void)
-{
-	if (arc_c > arc_c_min) {
-		/* same amount in kernel and user builds */
-		uint64_t reduction = arc_c >> arc_shrink_shift;
-
-		if (arc_c > arc_c_min + reduction)
-			atomic_add_64(&arc_c, -reduction);
-		else
-			arc_c = arc_c_min;
-
-		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
-
-		if (arc_c > arc_size)
-			arc_c = MAX(arc_size, arc_c_min);
-		if (arc_p > arc_c)
-			arc_p = (arc_c >> 1);
-
-		ASSERT(arc_c >= arc_c_min);
-		ASSERT((int64_t)arc_p >= 0);
-	}
-
-	if (arc_size > arc_c)
-		arc_adjust();
-}
-
-static int zfs_needfree = 0;
-/*
- * Report whether the ARC should give memory back: returns 1 if so and 0
- * otherwise.
- *
- * Kernel builds return 1 when zfs_needfree is set or when kmem_used()
- * exceeds three quarters of kmem_size().  The swapfs, zio_arena and i386
- * heap checks below are compiled out by "#if 0".
- * Non-kernel builds return 1 whenever spa_get_random(100) yields 0.
- */
-static int
-arc_reclaim_needed(void)
-{
-#if 0
- uint64_t extra;
-#endif
-
-#ifdef _KERNEL
-
- if (zfs_needfree)
- return (1);
-
-#if 0
- /*
- * check to make sure that swapfs has enough space so that anon
- * reservations can still succeed. anon_resvmem() checks that the
- * availrmem is greater than swapfs_minfree, and the number of reserved
- * swap pages. We also add a bit of extra here just to prevent
- * circumstances from getting really dire.
- *
- * NOTE(review): `extra' is declared above but never assigned in the
- * visible code; it would need a value before this branch is enabled.
- */
- if (availrmem < swapfs_minfree + swapfs_reserve + extra)
- return (1);
-
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then check that the size of available vmem for this area remains
- * above 1/4th free. This needs to be done when the size of the
- * non-default segment is smaller than physical memory, so we could
- * conceivably run out of VA in that segment before running out of
- * physical memory.
- */
- if (zio_arena != NULL) {
- size_t arc_ziosize =
- btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
-
- if ((physmem > arc_ziosize) &&
- (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
- return (1);
- }
-
-#if defined(__i386)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- if (btop(vmem_size(heap_arena, VMEM_FREE)) <
- (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
- return (1);
-#endif
-#else
- if (kmem_used() > (kmem_size() * 3) / 4)
- return (1);
-#endif
-
-#else
- if (spa_get_random(100) == 0)
- return (1);
-#endif
- return (0);
-}
-
-/*
- * Hand unused memory back from the caches the ARC depends on.
- *
- * strat - ARC_RECLAIM_AGGR additionally shrinks the cache targets through
- *         arc_shrink(); any other value only reaps.
- *
- * Kernel builds first ask the DNLC to shrink (and, on i386, reap every
- * kmem cache).  When ZIO_USE_UMA is defined each distinct zio buffer
- * cache is reaped once; buf_cache and hdr_cache are always reaped.
- */
-static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
-{
-#ifdef ZIO_USE_UMA
-	extern kmem_cache_t *zio_buf_cache[];
-	extern kmem_cache_t *zio_data_buf_cache[];
-	kmem_cache_t *last_buf_cache = NULL;
-	kmem_cache_t *last_data_cache = NULL;
-	size_t slot;
-#endif
-
-#ifdef _KERNEL
-	/*
-	 * The DNLC may be holding too much memory, so trim it before
-	 * anything else.
-	 */
-	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
-
-#if defined(__i386)
-	/*
-	 * Reclaim unused memory from all kmem caches.
-	 */
-	kmem_reap();
-#endif
-#endif
-
-	/*
-	 * An aggressive reclamation will shrink the cache size as well as
-	 * reap free buffers from the arc kmem caches.
-	 */
-	if (strat == ARC_RECLAIM_AGGR)
-		arc_shrink();
-
-#ifdef ZIO_USE_UMA
-	/* neighbouring slots may share a cache; reap each one only once */
-	for (slot = 0; slot < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; slot++) {
-		if (zio_buf_cache[slot] != last_buf_cache) {
-			last_buf_cache = zio_buf_cache[slot];
-			kmem_cache_reap_now(last_buf_cache);
-		}
-		if (zio_data_buf_cache[slot] != last_data_cache) {
-			last_data_cache = zio_data_buf_cache[slot];
-			kmem_cache_reap_now(last_data_cache);
-		}
-	}
-#endif
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_cache);
-}
-
-/*
- * Body of the ARC reclaim thread.  Until arc_thread_exit becomes non-zero
- * it loops, holding arc_reclaim_thr_lock except while waiting:
- *  - when arc_reclaim_needed() is true, pick a strategy (alternating
- *    between conservative and aggressive while growth is already
- *    disabled), disable growth, push the growth deadline out by
- *    arc_grow_retry seconds and call arc_kmem_reap_now();
- *  - otherwise re-enable growth once the deadline has passed;
- *  - call arc_adjust() when zfs_needfree is set or the cache plus ghost
- *    lists exceed 2 * arc_c, and run pending eviction callbacks;
- *  - sleep on arc_reclaim_thr_cv for up to one second.
- * On exit it clears arc_thread_exit, broadcasts on the condition variable
- * and terminates the thread.
- */
-static void
-arc_reclaim_thread(void *dummy __unused)
-{
-	callb_cpr_t cpr;
-	arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
-	clock_t growtime = 0;
-
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
-
-	mutex_enter(&arc_reclaim_thr_lock);
-	while (arc_thread_exit == 0) {
-		if (arc_reclaim_needed()) {
-
-			if (!arc_no_grow) {
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-				membar_producer();
-			} else {
-				/* alternate strategies on successive passes */
-				last_reclaim =
-				    (last_reclaim == ARC_RECLAIM_CONS) ?
-				    ARC_RECLAIM_AGGR : ARC_RECLAIM_CONS;
-			}
-
-			/* every reclaim pushes the growth deadline out again */
-			growtime = LBOLT + (arc_grow_retry * hz);
-			ASSERT(growtime > 0);
-
-			if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
-				/*
-				 * zfs_needfree being set means our vm_lowmem
-				 * hook was called, so memory really has to be
-				 * freed: force aggressive mode.
-				 */
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-			}
-			arc_kmem_reap_now(last_reclaim);
-		} else if ((growtime > 0) && ((growtime - LBOLT) <= 0)) {
-			arc_no_grow = FALSE;
-		}
-
-		if (zfs_needfree ||
-		    (2 * arc_c < arc_size +
-		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
-			arc_adjust();
-
-		if (arc_eviction_list != NULL)
-			arc_do_user_evicts();
-
-		if (arc_reclaim_needed()) {
-			zfs_needfree = 0;
-#ifdef _KERNEL
-			wakeup(&zfs_needfree);
-#endif
-		}
-
-		/* wait for a signal, but never longer than one second */
-		CALLB_CPR_SAFE_BEGIN(&cpr);
-		(void) cv_timedwait(&arc_reclaim_thr_cv,
-		    &arc_reclaim_thr_lock, hz);
-		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
-	}
-
-	arc_thread_exit = 0;
-	cv_broadcast(&arc_reclaim_thr_cv);
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
-	thread_exit();
-}
-
-/*
- * Adapt arc info given the number of bytes we are trying to add and
- * the state that we are coming from.  This function is only called
- * when we are adding new content to the cache.
- *
- * bytes - size of the content being added; must be positive (asserted).
- * state - state of the header the content belongs to.
- *
- * First arc_p is moved (see below).  Then, unless reclaim is needed (in
- * which case the reclaim thread is signalled), growth is disabled, or
- * arc_c has already reached arc_c_max, arc_c is grown by `bytes' once
- * arc_size is within 2 * maxblocksize of it.
- */
-static void
-arc_adapt(int bytes, arc_state_t *state)
-{
-	uint64_t mru_ghost_sz, mfu_ghost_sz;
-	int64_t mult;
-
-	ASSERT(bytes > 0);
-	/*
-	 * Adapt the target size of the MRU list:
-	 *	- if we just hit in the MRU ghost list, then increase
-	 *	  the target size of the MRU list.
-	 *	- if we just hit in the MFU ghost list, then increase
-	 *	  the target size of the MFU list by decreasing the
-	 *	  target size of the MRU list.
-	 *
-	 * The ghost sizes are sampled once so the comparison and the
-	 * division see the same values, a zero divisor falls back to a
-	 * multiplier of 1, and the multiplier is kept 64 bits wide so
-	 * that bytes * mult cannot overflow an int.
-	 */
-	mru_ghost_sz = arc_mru_ghost->arcs_size;
-	mfu_ghost_sz = arc_mfu_ghost->arcs_size;
-
-	if (state == arc_mru_ghost) {
-		mult = (mru_ghost_sz >= mfu_ghost_sz || mru_ghost_sz == 0) ?
-		    1 : (int64_t)(mfu_ghost_sz / mru_ghost_sz);
-
-		arc_p = MIN(arc_c, arc_p + bytes * mult);
-	} else if (state == arc_mfu_ghost) {
-		mult = (mfu_ghost_sz >= mru_ghost_sz || mfu_ghost_sz == 0) ?
-		    1 : (int64_t)(mru_ghost_sz / mfu_ghost_sz);
-
-		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
-	}
-	ASSERT((int64_t)arc_p >= 0);
-
-	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thr_cv);
-		return;
-	}
-
-	if (arc_no_grow)
-		return;
-
-	if (arc_c >= arc_c_max)
-		return;
-
-	/*
-	 * If we're within (2 * maxblocksize) bytes of the target
-	 * cache size, increment the target cache size
-	 */
-	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
-		atomic_add_64(&arc_c, (int64_t)bytes);
-		if (arc_c > arc_c_max)
-			arc_c = arc_c_max;
-		else if (state == arc_anon)
-			atomic_add_64(&arc_p, (int64_t)bytes);
-		if (arc_p > arc_c)
-			arc_p = arc_c;
-	}
-	ASSERT((int64_t)arc_p >= 0);
-}
-
-/*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
- *
- * Returns non-zero when arc_reclaim_needed() reports memory pressure or
- * when arc_size exceeds the target arc_c, and 0 otherwise.
- */
-static int
-arc_evict_needed(void)
-{
-	if (arc_reclaim_needed())
-		return (1);
-
-	return (arc_size > arc_c);
-}
-
-/*
- * Attach a data block to `buf' (stored in buf->b_data).  The size and
- * contents type come from the buffer's header.
- *
- * arc_adapt() is consulted first.  If no eviction is required a fresh
- * block is allocated.  Otherwise a victim list is chosen and arc_evict()
- * is asked to recycle a block of the right size and type; only if that
- * fails is a fresh block allocated (counted as a recycle miss).
- *
- * Choice of the victim list:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- *    the MRU side is still under its target, so the MFU list pays.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- *    the MRU side has used up its share, so it evicts from itself.
- *
- * 3. Insert for MFU, (c - p) > sizeof(arc_mfu) ->
- *    c - p is the MFU share; it is not yet full, so the MRU list pays.
- *
- * 4. Insert for MFU, (c - p) <= sizeof(arc_mfu) ->
- *    the MFU side is at or over its share, so it evicts from itself.
- *
- * A header in a ghost state is treated as an insert for the list it is
- * about to join: MRU for arc_mru_ghost and for prefetches out of
- * arc_mfu_ghost, MFU otherwise.
- */
-static void
-arc_get_data_buf(arc_buf_t *buf)
-{
-	arc_state_t *state = buf->b_hdr->b_state;
-	uint64_t size = buf->b_hdr->b_size;
-	arc_buf_contents_t type = buf->b_hdr->b_type;
-
-	arc_adapt(size, state);
-
-	if (!arc_evict_needed()) {
-		/*
-		 * The cache has not reached its maximum size yet, so a
-		 * new block can simply be allocated.
-		 */
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-		}
-		atomic_add_64(&arc_size, size);
-	} else {
-		arc_state_t *victim = state;
-
-		/*
-		 * If we are prefetching from the mfu ghost list, this
-		 * buffer will end up on the mru list; so steal space from
-		 * there.
-		 */
-		if (victim == arc_mfu_ghost) {
-			victim = buf->b_hdr->b_flags & ARC_PREFETCH ?
-			    arc_mru : arc_mfu;
-		} else if (victim == arc_mru_ghost) {
-			victim = arc_mru;
-		}
-
-		if (victim == arc_mru || victim == arc_anon) {
-			uint64_t mru_used =
-			    arc_anon->arcs_size + arc_mru->arcs_size;
-
-			victim = (arc_p > mru_used) ? arc_mfu : arc_mru;
-		} else {
-			/* MFU cases */
-			uint64_t mfu_space = arc_c - arc_p;
-
-			victim = (mfu_space > arc_mfu->arcs_size) ?
-			    arc_mru : arc_mfu;
-		}
-
-		buf->b_data = arc_evict(victim, size, TRUE, type);
-		if (buf->b_data == NULL) {
-			/* nothing could be recycled; allocate instead */
-			if (type == ARC_BUFC_METADATA) {
-				buf->b_data = zio_buf_alloc(size);
-			} else {
-				ASSERT(type == ARC_BUFC_DATA);
-				buf->b_data = zio_data_buf_alloc(size);
-			}
-			atomic_add_64(&arc_size, size);
-			ARCSTAT_BUMP(arcstat_recycle_miss);
-		}
-		ASSERT(buf->b_data != NULL);
-	}
-
-	/*
-	 * Update the state size.  Note that ghost states have a
-	 * "ghost size" and so don't need to be updated.
-	 */
-	if (!GHOST_STATE(buf->b_hdr->b_state)) {
-		arc_buf_hdr_t *hdr = buf->b_hdr;
-
-		atomic_add_64(&hdr->b_state->arcs_size, size);
-		if (list_link_active(&hdr->b_arc_node)) {
-			ASSERT(refcount_is_zero(&hdr->b_refcnt));
-			atomic_add_64(&hdr->b_state->arcs_lsize, size);
-		}
-		/*
-		 * If we are growing the cache, and we are adding anonymous
-		 * data, and we have outgrown arc_p, update arc_p
-		 */
-		if (arc_size < arc_c && hdr->b_state == arc_anon &&
-		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
-			arc_p = MIN(arc_c, arc_p + size);
-	}
-}
-
-/*
- * This routine is called whenever a buffer is accessed.  It records the
- * access time and moves the header between the anon, MRU, MFU and ghost
- * states according to its current state and its ARC_PREFETCH flag.
- *
- * The caller must hold hash_lock (asserted).  This function never
- * releases it itself; the callers in this file drop it after the call.
- * NOTE(review): whether arc_change_state() can release the lock is not
- * visible here -- confirm before relying on it.
- */
-static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
-{
-	ASSERT(MUTEX_HELD(hash_lock));
-
-	if (buf->b_state == arc_anon) {
-		/*
-		 * This buffer is not in the cache, and does not
-		 * appear in our "ghost" list.  Add the new buffer
-		 * to the MRU state.
-		 */
-
-		ASSERT(buf->b_arc_access == 0);
-		buf->b_arc_access = LBOLT;
-		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
-		arc_change_state(arc_mru, buf, hash_lock);
-
-	} else if (buf->b_state == arc_mru) {
-		/*
-		 * If this buffer is here because of a prefetch, then either:
-		 * - clear the flag if this is a "referencing" read
-		 *   (any subsequent access will bump this into the MFU state).
-		 * or
-		 * - move the buffer to the head of the list if this is
-		 *   another prefetch (to make it less likely to be evicted).
-		 */
-		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			if (refcount_count(&buf->b_refcnt) == 0) {
-				ASSERT(list_link_active(&buf->b_arc_node));
-				mutex_enter(&arc_mru->arcs_mtx);
-				list_remove(&arc_mru->arcs_list, buf);
-				list_insert_head(&arc_mru->arcs_list, buf);
-				mutex_exit(&arc_mru->arcs_mtx);
-			} else {
-				buf->b_flags &= ~ARC_PREFETCH;
-				ARCSTAT_BUMP(arcstat_mru_hits);
-			}
-			buf->b_arc_access = LBOLT;
-			return;
-		}
-
-		/*
-		 * This buffer has been "accessed" only once so far,
-		 * but it is still in the cache.  Move it to the MFU
-		 * state.
-		 */
-		if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
-			/*
-			 * More than ARC_MINTIME has passed since we
-			 * instantiated this buffer.  Move it to the
-			 * most frequently used state.
-			 */
-			buf->b_arc_access = LBOLT;
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-			arc_change_state(arc_mfu, buf, hash_lock);
-		}
-		ARCSTAT_BUMP(arcstat_mru_hits);
-	} else if (buf->b_state == arc_mru_ghost) {
-		arc_state_t *new_state;
-		/*
-		 * This buffer has been "accessed" recently, but
-		 * was evicted from the cache.  Move it to the
-		 * MFU state (or back to MRU for a prefetch).
-		 */
-
-		if (buf->b_flags & ARC_PREFETCH) {
-			new_state = arc_mru;
-			if (refcount_count(&buf->b_refcnt) > 0)
-				buf->b_flags &= ~ARC_PREFETCH;
-			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
-		} else {
-			new_state = arc_mfu;
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		}
-
-		buf->b_arc_access = LBOLT;
-		arc_change_state(new_state, buf, hash_lock);
-
-		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
-	} else if (buf->b_state == arc_mfu) {
-		/*
-		 * This buffer has been accessed more than once and is
-		 * still in the cache.  Keep it in the MFU state.
-		 *
-		 * NOTE: an add_reference() that occurred when we did
-		 * the arc_read() will have kicked this off the list.
-		 * If it was a prefetch, we will explicitly move it to
-		 * the head of the list now.
-		 */
-		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			ASSERT(refcount_count(&buf->b_refcnt) == 0);
-			ASSERT(list_link_active(&buf->b_arc_node));
-			mutex_enter(&arc_mfu->arcs_mtx);
-			list_remove(&arc_mfu->arcs_list, buf);
-			list_insert_head(&arc_mfu->arcs_list, buf);
-			mutex_exit(&arc_mfu->arcs_mtx);
-		}
-		ARCSTAT_BUMP(arcstat_mfu_hits);
-		buf->b_arc_access = LBOLT;
-	} else if (buf->b_state == arc_mfu_ghost) {
-		arc_state_t *new_state = arc_mfu;
-		/*
-		 * This buffer has been accessed more than once but has
-		 * been evicted from the cache.  Move it back to the
-		 * MFU state.
-		 */
-
-		if (buf->b_flags & ARC_PREFETCH) {
-			/*
-			 * This is a prefetch access...
-			 * move this block back to the MRU state.
-			 */
-			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
-			new_state = arc_mru;
-		}
-
-		buf->b_arc_access = LBOLT;
-		/* report the state the header is actually moving to */
-		if (new_state == arc_mru) {
-			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
-		} else {
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		}
-		arc_change_state(new_state, buf, hash_lock);
-
-		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
-	} else {
-		ASSERT(!"invalid arc state");
-	}
-}
-
-/*
- * Generic arc_done_func_t: copy the buffer's contents (b_size bytes from
- * its header) into the memory `arg' points at, then release the buffer.
- * `arg' is also the tag handed to arc_buf_remove_ref(), which must
- * return 1 (VERIFY).  `zio' is not used.
- */
-/* ARGSUSED */
-void
-arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
-{
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-
-	bcopy(buf->b_data, arg, hdr->b_size);
-	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
-}
-
-/*
- * Generic arc_done_func_t: `arg' points at an arc_buf_t pointer that
- * receives the buffer.  If a zio is supplied and it carries an error,
- * the buffer is released instead (arc_buf_remove_ref() must return 1)
- * and NULL is stored.
- */
-void
-arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
-{
-	arc_buf_t **bufp = arg;
-
-	if (zio == NULL || zio->io_error == 0) {
-		*bufp = buf;
-		return;
-	}
-
-	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
-	*bufp = NULL;
-}
-/*
- * Completion callback that arc_read() passes to zio_read().
- * zio->io_private is the arc_buf_t the read was issued for.
- *
- * In order, this function: looks the header up in the hash table (taking
- * its hash lock if found), byteswaps the data if the block pointer calls
- * for it and the first callback record has a byteswap function, computes
- * the buffer checksum, assigns a buffer to every callback record that has
- * a done function (the first gets `buf', later ones get clones), clears
- * ARC_IO_IN_PROGRESS, handles I/O errors, wakes waiters on b_cv, runs and
- * frees the callback records, and finally destroys the header if it was
- * found to be unreferenced on the error or freed-during-read path.
- */
-static void
-arc_read_done(zio_t *zio)
-{
- arc_buf_hdr_t *hdr, *found;
- arc_buf_t *buf;
- arc_buf_t *abuf; /* buffer we're assigning to callback */
- kmutex_t *hash_lock;
- arc_callback_t *callback_list, *acb;
- int freeable = FALSE;
-
- buf = zio->io_private;
- hdr = buf->b_hdr;
-
- /*
- * The hdr was inserted into hash-table and removed from lists
- * prior to starting I/O. We should find this header, since
- * it's in the hash table, and it should be legit since it's
- * not possible to evict it during the I/O. The only possible
- * reason for it not to be found is if we were freed during the
- * read.
- */
- found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
- &hash_lock);
-
- ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
-
- /* byteswap if necessary */
- callback_list = hdr->b_acb;
- ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
- callback_list->acb_byteswap(buf->b_data, hdr->b_size);
-
- arc_cksum_compute(buf);
-
- /*
- * create copies of the data buffer for the callers; abuf is non-NULL
- * only while the original buffer is still unassigned
- */
- abuf = buf;
- for (acb = callback_list; acb; acb = acb->acb_next) {
- if (acb->acb_done) {
- if (abuf == NULL)
- abuf = arc_buf_clone(buf);
- acb->acb_buf = abuf;
- abuf = NULL;
- }
- }
- hdr->b_acb = NULL;
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf)
- hdr->b_flags |= ARC_BUF_AVAILABLE;
-
- ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
-
- if (zio->io_error != 0) {
- hdr->b_flags |= ARC_IO_ERROR;
- if (hdr->b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- if (HDR_IN_HASH_TABLE(hdr))
- buf_hash_remove(hdr);
- freeable = refcount_is_zero(&hdr->b_refcnt);
- /* convert checksum errors into IO errors */
- if (zio->io_error == ECKSUM)
- zio->io_error = EIO;
- }
-
- /*
- * Broadcast before we drop the hash_lock to avoid the possibility
- * that the hdr (and hence the cv) might be freed before we get to
- * the cv_broadcast().
- */
- cv_broadcast(&hdr->b_cv);
-
- if (hash_lock) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- if (zio->io_error == 0 && hdr->b_state == arc_anon)
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- } else {
- /*
- * This block was freed while we waited for the read to
- * complete. It has been removed from the hash table and
- * moved to the anonymous state (so that it won't show up
- * in the cache).
- */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- freeable = refcount_is_zero(&hdr->b_refcnt);
- }
-
- /* execute each callback and free its structure */
- while ((acb = callback_list) != NULL) {
- if (acb->acb_done)
- acb->acb_done(zio, acb->acb_buf, acb->acb_private);
-
- if (acb->acb_zio_dummy != NULL) {
- acb->acb_zio_dummy->io_error = zio->io_error;
- zio_nowait(acb->acb_zio_dummy);
- }
-
- callback_list = acb->acb_next;
- kmem_free(acb, sizeof (arc_callback_t));
- }
-
- if (freeable)
- arc_hdr_destroy(hdr);
-}
-
-/*
- * "Read" the block at the specified DVA (in bp) via the
- * cache. If the block is found in the cache, invoke the provided
- * callback immediately and return. Note that the `zio' parameter
- * in the callback will be NULL in this case, since no IO was
- * required. If the block is not in the cache pass the read request
- * on to the spa with a substitute callback function, so that the
- * requested block will be added to the cache.
- *
- * If a read request arrives for a block that has a read in-progress,
- * either wait for the in-progress read to complete (and return the
- * results); or, if this is a read with a "done" func, add a record
- * to the read to invoke the "done" func when the read completes,
- * and return; or just return.
- *
- * arc_read_done() will invoke all the requested "done" functions
- * for readers of this block.
- *
- * `private' is passed to the "done" func and is also used as the
- * reference tag. *arc_flags is read for ARC_WAIT, ARC_NOWAIT and
- * ARC_PREFETCH, and ARC_CACHED is set in it when the header already
- * has data.
- *
- * Returns 0, except for a cache miss with ARC_WAIT set, where the
- * result of zio_wait() on the issued read is returned.
- */
-int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb)
-{
- arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
- kmutex_t *hash_lock;
- zio_t *rzio;
-
-top:
- hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (hdr && hdr->b_datacnt > 0) {
-
- *arc_flags |= ARC_CACHED;
-
- if (HDR_IO_IN_PROGRESS(hdr)) {
-
- if (*arc_flags & ARC_WAIT) {
- cv_wait(&hdr->b_cv, hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
- ASSERT(*arc_flags & ARC_NOWAIT);
-
- if (done) {
- arc_callback_t *acb = NULL;
-
- acb = kmem_zalloc(sizeof (arc_callback_t),
- KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_byteswap = swap;
- if (pio != NULL)
- acb->acb_zio_dummy = zio_null(pio,
- spa, NULL, NULL, flags);
-
- ASSERT(acb->acb_done != NULL);
- acb->acb_next = hdr->b_acb;
- hdr->b_acb = acb;
- add_reference(hdr, hash_lock, private);
- mutex_exit(hash_lock);
- return (0);
- }
- mutex_exit(hash_lock);
- return (0);
- }
-
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
- if (done) {
- add_reference(hdr, hash_lock, private);
- /*
- * If this block is already in use, create a new
- * copy of the data so that we will be guaranteed
- * that arc_release() will always succeed.
- */
- buf = hdr->b_buf;
- ASSERT(buf);
- ASSERT(buf->b_data);
- if (HDR_BUF_AVAILABLE(hdr)) {
- ASSERT(buf->b_efunc == NULL);
- hdr->b_flags &= ~ARC_BUF_AVAILABLE;
- } else {
- buf = arc_buf_clone(buf);
- }
- } else if (*arc_flags & ARC_PREFETCH &&
- refcount_count(&hdr->b_refcnt) == 0) {
- hdr->b_flags |= ARC_PREFETCH;
- }
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- data, metadata, hits);
-
- if (done)
- done(NULL, buf, private);
- } else {
- uint64_t size = BP_GET_LSIZE(bp);
- arc_callback_t *acb;
-
- if (hdr == NULL) {
- /* this block is not in the cache */
- arc_buf_hdr_t *exists;
- arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- buf = arc_buf_alloc(spa, size, private, type);
- hdr = buf->b_hdr;
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = bp->blk_birth;
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
- /* somebody beat us to the hash insert */
- mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
- (void) arc_buf_remove_ref(buf, private);
- goto top; /* restart the IO request */
- }
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_PREFETCH) {
- (void) remove_reference(hdr, hash_lock,
- private);
- hdr->b_flags |= ARC_PREFETCH;
- }
- if (BP_GET_LEVEL(bp) > 0)
- hdr->b_flags |= ARC_INDIRECT;
- } else {
- /* this block is in the ghost cache */
- ASSERT(GHOST_STATE(hdr->b_state));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
- ASSERT(hdr->b_buf == NULL);
-
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_PREFETCH)
- hdr->b_flags |= ARC_PREFETCH;
- else
- add_reference(hdr, hash_lock, private);
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = NULL;
- hdr->b_buf = buf;
- arc_get_data_buf(buf);
- ASSERT(hdr->b_datacnt == 0);
- hdr->b_datacnt = 1;
-
- }
-
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_byteswap = swap;
-
- ASSERT(hdr->b_acb == NULL);
- hdr->b_acb = acb;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
-
- /*
- * If the buffer has been evicted, migrate it to a present state
- * before issuing the I/O. Once we drop the hash-table lock,
- * the header will be marked as I/O in progress and have an
- * attached buffer. At this point, anybody who finds this
- * buffer ought to notice that it's legit but has a pending I/O.
- */
-
- if (GHOST_STATE(hdr->b_state))
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
-
- ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
- zbookmark_t *, zb);
- ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- data, metadata, misses);
-
- rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags, zb);
-
- if (*arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
-
- ASSERT(*arc_flags & ARC_NOWAIT);
- zio_nowait(rzio);
- }
- return (0);
-}
-
-/*
- * arc_read() variant to support pool traversal.  If the block is already
- * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
- * The idea is that we don't want pool traversal filling up memory, but
- * if the ARC already has the data anyway, we shouldn't pay for the I/O.
- *
- * Returns 0 after copying b_size bytes into `data', or ENOENT when the
- * block has no cached data or still has I/O in progress.
- */
-int
-arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
-{
-	kmutex_t *hash_mtx;
-	arc_buf_hdr_t *hdr;
-	int rc = ENOENT;
-
-	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
-
-	if (hdr != NULL && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
-		arc_buf_t *src = hdr->b_buf;
-
-		/* use the first buffer on the header that has data */
-		ASSERT(src);
-		while (src->b_data == NULL) {
-			src = src->b_next;
-			ASSERT(src);
-		}
-		bcopy(src->b_data, data, hdr->b_size);
-		rc = 0;
-	}
-
-	if (hash_mtx != NULL)
-		mutex_exit(hash_mtx);
-
-	return (rc);
-}
-
-/*
- * Install `func' as the eviction callback of `buf' and remember `private'
- * alongside it.  The buffer must belong to a non-anonymous header, and a
- * non-NULL callback may only be set while the header is referenced (all
- * three conditions are asserted).
- */
-void
-arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
-{
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-
-	ASSERT(hdr != NULL);
-	ASSERT(hdr->b_state != arc_anon);
-	ASSERT(func == NULL || !refcount_is_zero(&hdr->b_refcnt));
-
-	buf->b_private = private;
-	buf->b_efunc = func;
-}
-
-/*
- * This is used by the DMU to let the ARC know that a buffer is
- * being evicted, so the ARC should clean up. If this arc buf
- * is not yet in the evicted state, it will be put there.
- *
- * Returns 0 if the buffer has already been detached from its header
- * (b_hdr == NULL), i.e. arc_do_user_evicts() is processing it; in that
- * case nothing is done here. Returns 1 otherwise, after the buffer's
- * eviction callback has been invoked (it must return 0).
- *
- * A buffer that still has data is unlinked from its header, destroyed
- * and returned to buf_cache; if it was the header's last data buffer the
- * header moves to the matching ghost state. A buffer without data is
- * already on the eviction list: its callback is run on a copy and the
- * buffer itself is left for arc_do_user_evicts() to free.
- */
-int
-arc_buf_evict(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- arc_buf_t **bufp;
-
- mutex_enter(&arc_eviction_mtx);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- /*
- * We are in arc_do_user_evicts().
- */
- ASSERT(buf->b_data == NULL);
- mutex_exit(&arc_eviction_mtx);
- return (0);
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_exit(&arc_eviction_mtx);
-
- mutex_enter(hash_lock);
-
- if (buf->b_data == NULL) {
- /*
- * We are on the eviction list.
- */
- mutex_exit(hash_lock);
- mutex_enter(&arc_eviction_mtx);
- if (buf->b_hdr == NULL) {
- /*
- * We are already in arc_do_user_evicts().
- */
- mutex_exit(&arc_eviction_mtx);
- return (0);
- } else {
- arc_buf_t copy = *buf; /* structure assignment */
- /*
- * Process this buffer now
- * but let arc_do_user_evicts() do the reaping.
- * Clearing b_efunc keeps arc_do_user_evicts() from
- * calling the callback a second time.
- */
- buf->b_efunc = NULL;
- mutex_exit(&arc_eviction_mtx);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
- }
-
- ASSERT(buf->b_hdr == hdr);
- ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
- /*
- * Pull this buffer off of the hdr
- */
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = buf->b_next;
-
- ASSERT(buf->b_data != NULL);
- arc_buf_destroy(buf, FALSE, FALSE);
-
- if (hdr->b_datacnt == 0) {
- arc_state_t *old_state = hdr->b_state;
- arc_state_t *evicted_state;
-
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
-
- evicted_state =
- (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- mutex_enter(&old_state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
-
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
-
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&old_state->arcs_mtx);
- }
- mutex_exit(hash_lock);
-
- VERIFY(buf->b_efunc(buf) == 0);
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_hdr = NULL;
- kmem_cache_free(buf_cache, buf);
- return (1);
-}
-
-/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
- * If the buffer has more than one reference, we must make
- * make a new hdr for the buffer.
- */
-void
-arc_release(arc_buf_t *buf, void *tag)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
-
- /* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_refcnt) > 0);
-
- if (hdr->b_state == arc_anon) {
- /* this buffer is already released */
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
- ASSERT(BUF_EMPTY(hdr));
- ASSERT(buf->b_efunc == NULL);
- arc_buf_thaw(buf);
- return;
- }
-
- mutex_enter(hash_lock);
-
- /*
- * Do we have more than one buf?
- */
- if (hdr->b_buf != buf || buf->b_next != NULL) {
- arc_buf_hdr_t *nhdr;
- arc_buf_t **bufp;
- uint64_t blksz = hdr->b_size;
- spa_t *spa = hdr->b_spa;
- arc_buf_contents_t type = hdr->b_type;
-
- ASSERT(hdr->b_datacnt > 1);
- /*
- * Pull the data off of this buf and attach it to
- * a new anonymous buf.
- */
- (void) remove_reference(hdr, hash_lock, tag);
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = (*bufp)->b_next;
- buf->b_next = NULL;
-
- ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
- if (refcount_is_zero(&hdr->b_refcnt)) {
- ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
- }
- hdr->b_datacnt -= 1;
- arc_cksum_verify(buf);
-
- mutex_exit(hash_lock);
-
- nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
- nhdr->b_size = blksz;
- nhdr->b_spa = spa;
- nhdr->b_type = type;
- nhdr->b_buf = buf;
- nhdr->b_state = arc_anon;
- nhdr->b_arc_access = 0;
- nhdr->b_flags = 0;
- nhdr->b_datacnt = 1;
- nhdr->b_freeze_cksum = NULL;
- mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- (void) refcount_add(&nhdr->b_refcnt, tag);
- buf->b_hdr = nhdr;
- atomic_add_64(&arc_anon->arcs_size, blksz);
-
- hdr = nhdr;
- } else {
- ASSERT(refcount_count(&hdr->b_refcnt) == 1);
- ASSERT(!list_link_active(&hdr->b_arc_node));
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_arc_access = 0;
- mutex_exit(hash_lock);
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
- arc_buf_thaw(buf);
- }
- buf->b_efunc = NULL;
- buf->b_private = NULL;
-}
-
-int
-arc_released(arc_buf_t *buf)
-{
- return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-}
-
-int
-arc_has_callback(arc_buf_t *buf)
-{
- return (buf->b_efunc != NULL);
-}
-
-#ifdef ZFS_DEBUG
-int
-arc_referenced(arc_buf_t *buf)
-{
- return (refcount_count(&buf->b_hdr->b_refcnt));
-}
-#endif
-
-static void
-arc_write_ready(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
-
- if (callback->awcb_ready) {
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
- callback->awcb_ready(zio, buf, callback->awcb_private);
- }
- arc_cksum_compute(buf);
-}
-
-static void
-arc_write_done(zio_t *zio)
-{
- arc_write_callback_t *callback = zio->io_private;
- arc_buf_t *buf = callback->awcb_buf;
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
- hdr->b_acb = NULL;
-
- /* this buffer is on no lists and is not in the hash table */
- ASSERT3P(hdr->b_state, ==, arc_anon);
-
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = zio->io_bp->blk_birth;
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
- /*
- * If the block to be written was all-zero, we may have
- * compressed it away. In this case no write was performed
- * so there will be no dva/birth-date/checksum. The buffer
- * must therefor remain anonymous (and uncached).
- */
- if (!BUF_EMPTY(hdr)) {
- arc_buf_hdr_t *exists;
- kmutex_t *hash_lock;
-
- arc_cksum_verify(buf);
-
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
- /*
- * This can only happen if we overwrite for
- * sync-to-convergence, because we remove
- * buffers from the hash table when we arc_free().
- */
- ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
- BP_IDENTITY(zio->io_bp)));
- ASSERT3U(zio->io_bp_orig.blk_birth, ==,
- zio->io_bp->blk_birth);
-
- ASSERT(refcount_is_zero(&exists->b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
- }
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- } else if (callback->awcb_done == NULL) {
- int destroy_hdr;
- /*
- * This is an anonymous buffer with no user callback,
- * destroy it if there are no active references.
- */
- mutex_enter(&arc_eviction_mtx);
- destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
- } else {
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- }
-
- if (callback->awcb_done) {
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
- }
-
- kmem_free(callback, sizeof (arc_write_callback_t));
-}
-
-zio_t *
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
-{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- arc_write_callback_t *callback;
- zio_t *zio;
-
- /* this is a private buffer - no locking required */
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(BUF_EMPTY(hdr));
- ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == 0);
- callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
- callback->awcb_ready = ready;
- callback->awcb_done = done;
- callback->awcb_private = private;
- callback->awcb_buf = buf;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
- zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
- buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
- priority, flags, zb);
-
- return (zio);
-}
-
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
-{
- arc_buf_hdr_t *ab;
- kmutex_t *hash_lock;
- zio_t *zio;
-
- /*
- * If this buffer is in the cache, release it, so it
- * can be re-used.
- */
- ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (ab != NULL) {
- /*
- * The checksum of blocks to free is not always
- * preserved (eg. on the deadlist). However, if it is
- * nonzero, it should match what we have in the cache.
- */
- ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
- if (ab->b_state != arc_anon)
- arc_change_state(arc_anon, ab, hash_lock);
- if (HDR_IO_IN_PROGRESS(ab)) {
- /*
- * This should only happen when we prefetch.
- */
- ASSERT(ab->b_flags & ARC_PREFETCH);
- ASSERT3U(ab->b_datacnt, ==, 1);
- ab->b_flags |= ARC_FREED_IN_READ;
- if (HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- } else if (refcount_is_zero(&ab->b_refcnt)) {
- mutex_exit(hash_lock);
- arc_hdr_destroy(ab);
- ARCSTAT_BUMP(arcstat_deleted);
- } else {
- /*
- * We still have an active reference on this
- * buffer. This can happen, e.g., from
- * dbuf_unoverride().
- */
- ASSERT(!HDR_IN_HASH_TABLE(ab));
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
- }
- }
-
- zio = zio_free(pio, spa, txg, bp, done, private);
-
- if (arc_flags & ARC_WAIT)
- return (zio_wait(zio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(zio);
-
- return (0);
-}
-
-void
-arc_tempreserve_clear(uint64_t tempreserve)
-{
- atomic_add_64(&arc_tempreserve, -tempreserve);
- ASSERT((int64_t)arc_tempreserve >= 0);
-}
-
-int
-arc_tempreserve_space(uint64_t tempreserve)
-{
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (ERESTART);
- }
-#endif
- if (tempreserve > arc_c/4 && !arc_no_grow)
- arc_c = MIN(arc_c_max, tempreserve * 4);
- if (tempreserve > arc_c)
- return (ENOMEM);
-
- /*
- * Throttle writes when the amount of dirty data in the cache
- * gets too large. We try to keep the cache less than half full
- * of dirty blocks so that our sync times don't grow too large.
- * Note: if two requests come in concurrently, we might let them
- * both succeed, when one of them should fail. Not a huge deal.
- *
- * XXX The limit should be adjusted dynamically to keep the time
- * to sync a dataset fixed (around 1-5 seconds?).
- */
-
- if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
- arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
- dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
- "tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
- tempreserve>>10, arc_c>>10);
- return (ERESTART);
- }
- atomic_add_64(&arc_tempreserve, tempreserve);
- return (0);
-}
-
-static kmutex_t arc_lowmem_lock;
-#ifdef _KERNEL
-static eventhandler_tag arc_event_lowmem = NULL;
-
-static void
-arc_lowmem(void *arg __unused, int howto __unused)
-{
-
- /* Serialize access via arc_lowmem_lock. */
- mutex_enter(&arc_lowmem_lock);
- zfs_needfree = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (zfs_needfree)
- tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
- mutex_exit(&arc_lowmem_lock);
-}
-#endif
-
-void
-arc_init(void)
-{
- mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
-
- /* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
-
- /* Start out with 1/8 of all memory */
- arc_c = kmem_size() / 8;
-#if 0
-#ifdef _KERNEL
- /*
- * On architectures where the physical memory can be larger
- * than the addressable space (intel in 32-bit mode), we may
- * need to limit the cache to 1/8 of VM size.
- */
- arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
-#endif
-#endif
- /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
- arc_c_min = MAX(arc_c / 4, 64<<18);
- /* set max to 1/2 of all memory, or all but 1GB, whichever is more */
- if (arc_c * 8 >= 1<<30)
- arc_c_max = (arc_c * 8) - (1<<30);
- else
- arc_c_max = arc_c_min;
- arc_c_max = MAX(arc_c * 5, arc_c_max);
-#ifdef _KERNEL
- /*
- * Allow the tunables to override our calculations if they are
- * reasonable (ie. over 16MB)
- */
- if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
- arc_c_max = zfs_arc_max;
- if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
- arc_c_min = zfs_arc_min;
-#endif
- arc_c = arc_c_max;
- arc_p = (arc_c >> 1);
-
- /* if kmem_flags are set, lets try to use less memory */
- if (kmem_debugging())
- arc_c = arc_c / 2;
- if (arc_c < arc_c_min)
- arc_c = arc_c_min;
-
- zfs_arc_min = arc_c_min;
- zfs_arc_max = arc_c_max;
-
- arc_anon = &ARC_anon;
- arc_mru = &ARC_mru;
- arc_mru_ghost = &ARC_mru_ghost;
- arc_mfu = &ARC_mfu;
- arc_mfu_ghost = &ARC_mfu_ghost;
- arc_size = 0;
-
- mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_arc_node));
-
- buf_init();
-
- arc_thread_exit = 0;
- arc_eviction_list = NULL;
- mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
- bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
-
- arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
- sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
-
- if (arc_ksp != NULL) {
- arc_ksp->ks_data = &arc_stats;
- kstat_install(arc_ksp);
- }
-
- (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
- TS_RUN, minclsyspri);
-
-#ifdef _KERNEL
- arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
- EVENTHANDLER_PRI_FIRST);
-#endif
-
- arc_dead = FALSE;
-
-#ifdef _KERNEL
- /* Warn about ZFS memory and address space requirements. */
- if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
- "expect unstable behavior.\n");
- }
- if (kmem_size() < 512 * (1 << 20)) {
- printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
- "expect unstable behavior.\n");
- printf(" Consider tuning vm.kmem_size and "
- "vm.kmem_size_max\n");
- printf(" in /boot/loader.conf.\n");
- }
-#endif
-}
-
-void
-arc_fini(void)
-{
- mutex_enter(&arc_reclaim_thr_lock);
- arc_thread_exit = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (arc_thread_exit != 0)
- cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
- mutex_exit(&arc_reclaim_thr_lock);
-
- arc_flush();
-
- arc_dead = TRUE;
-
- if (arc_ksp != NULL) {
- kstat_delete(arc_ksp);
- arc_ksp = NULL;
- }
-
- mutex_destroy(&arc_eviction_mtx);
- mutex_destroy(&arc_reclaim_thr_lock);
- cv_destroy(&arc_reclaim_thr_cv);
-
- list_destroy(&arc_mru->arcs_list);
- list_destroy(&arc_mru_ghost->arcs_list);
- list_destroy(&arc_mfu->arcs_list);
- list_destroy(&arc_mfu_ghost->arcs_list);
-
- mutex_destroy(&arc_anon->arcs_mtx);
- mutex_destroy(&arc_mru->arcs_mtx);
- mutex_destroy(&arc_mru_ghost->arcs_mtx);
- mutex_destroy(&arc_mfu->arcs_mtx);
- mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-
- buf_fini();
-
- mutex_destroy(&arc_lowmem_lock);
-#ifdef _KERNEL
- if (arc_event_lowmem != NULL)
- EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
-#endif
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c b/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c
deleted file mode 100644
index 4442b1f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/bplist.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/bplist.h>
-#include <sys/zfs_context.h>
-
-static int
-bplist_hold(bplist_t *bpl)
-{
- ASSERT(MUTEX_HELD(&bpl->bpl_lock));
- if (bpl->bpl_dbuf == NULL) {
- int err = dmu_bonus_hold(bpl->bpl_mos,
- bpl->bpl_object, bpl, &bpl->bpl_dbuf);
- if (err)
- return (err);
- bpl->bpl_phys = bpl->bpl_dbuf->db_data;
- }
- return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- size = spa_version(dmu_objset_spa(mos)) < ZFS_VERSION_BPLIST_ACCOUNT ?
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
- return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
- DMU_OT_BPLIST_HDR, size, tx));
-}
-
-void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
-{
- VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(mos, object, &doi);
- if (err)
- return (err);
-
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_dbuf == NULL);
- ASSERT(bpl->bpl_phys == NULL);
- ASSERT(bpl->bpl_cached_dbuf == NULL);
- ASSERT(bpl->bpl_queue == NULL);
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
- bpl->bpl_mos = mos;
- bpl->bpl_object = object;
- bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
- bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
- bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
-
-void
-bplist_close(bplist_t *bpl)
-{
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_queue == NULL);
-
- if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- bpl->bpl_cached_dbuf = NULL;
- }
- if (bpl->bpl_dbuf) {
- dmu_buf_rele(bpl->bpl_dbuf, bpl);
- bpl->bpl_dbuf = NULL;
- bpl->bpl_phys = NULL;
- }
-
- mutex_exit(&bpl->bpl_lock);
-}
-
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
- boolean_t rv;
-
- if (bpl->bpl_object == 0)
- return (B_TRUE);
-
- mutex_enter(&bpl->bpl_lock);
- VERIFY(0 == bplist_hold(bpl)); /* XXX */
- rv = (bpl->bpl_phys->bpl_entries == 0);
- mutex_exit(&bpl->bpl_lock);
-
- return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
- int err = 0;
-
- if (bpl->bpl_cached_dbuf == NULL ||
- bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
- if (bpl->bpl_cached_dbuf != NULL)
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- err = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blkid << bpl->bpl_blockshift,
- bpl, &bpl->bpl_cached_dbuf);
- ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
- 1ULL << bpl->bpl_blockshift);
- }
- return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- if (*itorp >= bpl->bpl_phys->bpl_entries) {
- mutex_exit(&bpl->bpl_lock);
- return (ENOENT);
- }
-
- blk = *itorp >> bpl->bpl_bpshift;
- off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- bparray = bpl->bpl_cached_dbuf->db_data;
- *bp = bparray[off];
- (*itorp)++;
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err)
- return (err);
-
- blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
- off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
- bparray = bpl->bpl_cached_dbuf->db_data;
- bparray[off] = *bp;
-
- /* We never need the fill count. */
- bparray[off].blk_fill = 0;
-
- /* The bplist will compress better if we can leave off the checksum */
- bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- bpl->bpl_phys->bpl_entries++;
- bpl->bpl_phys->bpl_bytes +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
- bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpl->bpl_lock);
-
- return (0);
-}
-
-/*
- * Deferred entry; will be written later by bplist_sync().
- */
-void
-bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
-{
- bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- bpq->bpq_blk = *bp;
- bpq->bpq_next = bpl->bpl_queue;
- bpl->bpl_queue = bpq;
- mutex_exit(&bpl->bpl_lock);
-}
-
-void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
-{
- bplist_q_t *bpq;
-
- mutex_enter(&bpl->bpl_lock);
- while ((bpq = bpl->bpl_queue) != NULL) {
- bpl->bpl_queue = bpq->bpq_next;
- mutex_exit(&bpl->bpl_lock);
- VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
- kmem_free(bpq, sizeof (*bpq));
- mutex_enter(&bpl->bpl_lock);
- }
- mutex_exit(&bpl->bpl_lock);
-}
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
- mutex_enter(&bpl->bpl_lock);
- ASSERT3P(bpl->bpl_queue, ==, NULL);
- VERIFY(0 == bplist_hold(bpl));
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- VERIFY(0 == dmu_free_range(bpl->bpl_mos,
- bpl->bpl_object, 0, -1ULL, tx));
- bpl->bpl_phys->bpl_entries = 0;
- bpl->bpl_phys->bpl_bytes = 0;
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp = 0;
- bpl->bpl_phys->bpl_uncomp = 0;
- }
- mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- uint64_t itor = 0, comp = 0, uncomp = 0;
- int err;
- blkptr_t bp;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- *usedp = bpl->bpl_phys->bpl_bytes;
- if (bpl->bpl_havecomp) {
- *compp = bpl->bpl_phys->bpl_comp;
- *uncompp = bpl->bpl_phys->bpl_uncomp;
- }
- mutex_exit(&bpl->bpl_lock);
-
- if (!bpl->bpl_havecomp) {
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- comp += BP_GET_PSIZE(&bp);
- uncomp += BP_GET_UCSIZE(&bp);
- }
- if (err == ENOENT)
- err = 0;
- *compp = comp;
- *uncompp = uncomp;
- }
-
- return (err);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
deleted file mode 100644
index 94c6308..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ /dev/null
@@ -1,2247 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-
-static void dbuf_destroy(dmu_buf_impl_t *db);
-static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
-
-int zfs_mdcomp_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
-
-/*
- * Global data structures and functions for the dbuf cache.
- */
-static kmem_cache_t *dbuf_cache;
-
-/* ARGSUSED */
-static int
-dbuf_cons(void *vdb, void *unused, int kmflag)
-{
- dmu_buf_impl_t *db = vdb;
- bzero(db, sizeof (dmu_buf_impl_t));
-
- mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
- refcount_create(&db->db_holds);
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dbuf_dest(void *vdb, void *unused)
-{
- dmu_buf_impl_t *db = vdb;
- mutex_destroy(&db->db_mtx);
- cv_destroy(&db->db_changed);
- refcount_destroy(&db->db_holds);
-}
-
-/*
- * dbuf hash table routines
- */
-static dbuf_hash_table_t dbuf_hash_table;
-
-static uint64_t dbuf_hash_count;
-
-static uint64_t
-dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
-{
- uintptr_t osv = (uintptr_t)os;
- uint64_t crc = -1ULL;
-
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
-
- crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
-
- return (crc);
-}
-
-#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
-#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
- ((dbuf)->db.db_object == (obj) && \
- (dbuf)->db_objset == (os) && \
- (dbuf)->db_level == (level) && \
- (dbuf)->db_blkid == (blkid))
-
-dmu_buf_impl_t *
-dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = dn->dn_objset;
- uint64_t obj = dn->dn_object;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *db;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
- if (DBUF_EQUAL(db, os, obj, level, blkid)) {
- mutex_enter(&db->db_mtx);
- if (db->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (db);
- }
- mutex_exit(&db->db_mtx);
- }
- }
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (NULL);
-}
-
-/*
- * Insert an entry into the hash table. If there is already an element
- * equal to elem in the hash table, then the already existing element
- * will be returned and the new element will not be inserted.
- * Otherwise returns NULL.
- */
-static dmu_buf_impl_t *
-dbuf_hash_insert(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_impl_t *os = db->db_objset;
- uint64_t obj = db->db.db_object;
- int level = db->db_level;
- uint64_t blkid = db->db_blkid;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *dbf;
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
- if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
- mutex_enter(&dbf->db_mtx);
- if (dbf->db_state != DB_EVICTING) {
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- return (dbf);
- }
- mutex_exit(&dbf->db_mtx);
- }
- }
-
- mutex_enter(&db->db_mtx);
- db->db_hash_next = h->hash_table[idx];
- h->hash_table[idx] = db;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_add_64(&dbuf_hash_count, 1);
-
- return (NULL);
-}
-
-/*
- * Remove an entry from the hash table. This operation will
- * fail if there are any existing holds on the db.
- */
-static void
-dbuf_hash_remove(dmu_buf_impl_t *db)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- uint64_t idx = hv & h->hash_table_mask;
- dmu_buf_impl_t *dbf, **dbp;
-
- /*
- * We musn't hold db_mtx to maintin lock ordering:
- * DBUF_HASH_MUTEX > db_mtx.
- */
- ASSERT(refcount_is_zero(&db->db_holds));
- ASSERT(db->db_state == DB_EVICTING);
- ASSERT(!MUTEX_HELD(&db->db_mtx));
-
- mutex_enter(DBUF_HASH_MUTEX(h, idx));
- dbp = &h->hash_table[idx];
- while ((dbf = *dbp) != db) {
- dbp = &dbf->db_hash_next;
- ASSERT(dbf != NULL);
- }
- *dbp = db->db_hash_next;
- db->db_hash_next = NULL;
- mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_add_64(&dbuf_hash_count, -1);
-}
-
-static arc_evict_func_t dbuf_do_evict;
-
-static void
-dbuf_evict_user(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_level != 0 || db->db_evict_func == NULL)
- return;
-
- if (db->db_user_data_ptr_ptr)
- *db->db_user_data_ptr_ptr = db->db.db_data;
- db->db_evict_func(&db->db, db->db_user_ptr);
- db->db_user_ptr = NULL;
- db->db_user_data_ptr_ptr = NULL;
- db->db_evict_func = NULL;
-}
-
-void
-dbuf_evict(dmu_buf_impl_t *db)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db_data_pending == NULL);
-
- dbuf_clear(db);
- dbuf_destroy(db);
-}
-
-void
-dbuf_init(void)
-{
- uint64_t hsize = 1ULL << 16;
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- /*
- * The hash table is big enough to fill all of physical memory
- * with an average 4K block size. The table will take up
- * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
- */
- while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
- hsize <<= 1;
-
-retry:
- h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
- }
-
- dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
- sizeof (dmu_buf_impl_t),
- 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-dbuf_fini(void)
-{
- dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
-
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_destroy(&h->hash_mutexes[i]);
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
- kmem_cache_destroy(dbuf_cache);
-}
-
-/*
- * Other stuff.
- */
-/*
- * Debug-only consistency checks for a dbuf; compiled only under ZFS_DEBUG.
- * The caller must hold db_mtx.  Nothing is checked unless
- * ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.
- */
-#ifdef ZFS_DEBUG
-static void
-dbuf_verify(dmu_buf_impl_t *db)
-{
- dnode_t *dn = db->db_dnode;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
- return;
-
- ASSERT(db->db_objset != NULL);
- if (dn == NULL) {
- ASSERT(db->db_parent == NULL);
- ASSERT(db->db_blkptr == NULL);
- } else {
- ASSERT3U(db->db.db_object, ==, dn->dn_object);
- ASSERT3P(db->db_objset, ==, dn->dn_objset);
- ASSERT3U(db->db_level, <, dn->dn_nlevels);
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- list_head(&dn->dn_dbufs));
- }
- if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
- } else {
- ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
- }
-
- if (db->db_level == 0) {
- /* we can be momentarily larger in dnode_set_blksz() */
- if (db->db_blkid != DB_BONUS_BLKID && dn) {
- ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
- }
- if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
- /*
- * it should only be modified in syncing
- * context, so make sure we only have
- * one copy of the data.
- */
- ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
- }
- }
-
- /* verify db->db_blkptr */
- if (db->db_blkptr) {
- if (db->db_parent == dn->dn_dbuf) { /* the dn == NULL case above asserted db_blkptr == NULL */
- /* db is pointed to by the dnode */
- /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
- ASSERT(db->db_parent == NULL);
- else
- ASSERT(db->db_parent != NULL);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- } else {
- /* db is pointed to by an indirect block */
- int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
- ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
- ASSERT3U(db->db_parent->db.db_object, ==,
- db->db.db_object);
- /*
- * dnode_grow_indblksz() can make this fail if we don't
- * have the struct_rwlock. XXX indblksz no longer
- * grows. safe to do this now?
- */
- if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
- ASSERT3P(db->db_blkptr, ==,
- ((blkptr_t *)db->db_parent->db.db_data +
- db->db_blkid % epb));
- }
- }
- }
- if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
- db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) { /* NOTE(review): dn is dereferenced without a NULL check here -- confirm dn cannot be NULL when db_data is set */
- /*
- * If the blkptr isn't set but they have nonzero data,
- * it had better be dirty, otherwise we'll lose that
- * data when we evict this buffer.
- */
- if (db->db_dirtycnt == 0) {
- uint64_t *buf = db->db.db_data;
- int i;
-
- for (i = 0; i < db->db.db_size >> 3; i++) { /* scan the buffer as 64-bit words */
- ASSERT(buf[i] == 0);
- }
- }
- }
-}
-#endif
-
-/*
- * Refresh the user's registered data pointer so that it tracks
- * db.db_data.  Does nothing unless this is a level-0 dbuf with a
- * registered data pointer.  The caller must hold db_mtx.
- */
-static void
-dbuf_update_data(dmu_buf_impl_t *db)
-{
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-
-	if (db->db_level != 0 || !db->db_user_data_ptr_ptr)
-		return;
-
-	ASSERT(!refcount_is_zero(&db->db_holds));
-	*db->db_user_data_ptr_ptr = db->db.db_data;
-}
-/*
- * Attach the ARC buffer 'buf' to the dbuf, or detach the data when
- * 'buf' is NULL.  The caller must hold db_mtx.
- *
- * With a buffer: db.db_data is pointed at buf->b_data, dbuf_do_evict is
- * registered as the buffer's callback unless the buffer has been
- * released, and the user's data pointer is refreshed.
- * With NULL: the user eviction callback is run, db.db_data is cleared
- * and the dbuf goes to DB_UNCACHED.
- */
-static void
-dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
-{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
- db->db_buf = buf;
- if (buf != NULL) {
- ASSERT(buf->b_data != NULL);
- db->db.db_data = buf->b_data;
- if (!arc_released(buf))
- arc_set_callback(buf, dbuf_do_evict, db);
- dbuf_update_data(db);
- } else {
- dbuf_evict_user(db);
- db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
- }
-}
-
-/*
- * Map a byte offset within the object to a block id.
- * When dn_datablkshift is zero the offset must lie below dn_datablksz
- * and the answer is always block 0; otherwise the offset is shifted
- * right by dn_datablkshift.
- */
-uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
-{
-	if (dn->dn_datablkshift == 0) {
-		ASSERT3U(offset, <, dn->dn_datablksz);
-		return (0);
-	}
-
-	return (offset >> dn->dn_datablkshift);
-}
-/*
- * Completion callback passed to arc_read() by dbuf_read_impl().
- * 'vdb' is the dbuf being read and 'buf' the ARC buffer with the result.
- *
- * On success the buffer is attached and the dbuf becomes DB_CACHED.  If
- * a level-0 dbuf was freed while the read was in flight, the buffer is
- * released and zeroed instead.  On error the buffer reference is dropped
- * and the dbuf returns to DB_UNCACHED.  In all cases waiters on
- * db_changed are woken and the hold taken in dbuf_read_impl() is dropped.
- */
-static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
-
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db_state, ==, DB_READ);
- /*
- * All reads are synchronous, so we must have a hold on the dbuf
- */
- ASSERT(refcount_count(&db->db_holds) > 0);
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- if (db->db_level == 0 && db->db_freed_in_flight) {
- /* we were freed in flight; disregard any error */
- arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
- arc_buf_freeze(buf);
- db->db_freed_in_flight = FALSE;
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
- dbuf_set_data(db, buf);
- db->db_state = DB_CACHED;
- } else {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- ASSERT3P(db->db_buf, ==, NULL);
- VERIFY(arc_buf_remove_ref(buf, db) == 1);
- db->db_state = DB_UNCACHED;
- }
- cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, NULL); /* pairs with dbuf_add_ref(db, NULL) in dbuf_read_impl() */
-}
-/*
- * Start filling an uncached dbuf.  Called with db_mtx held (and the
- * dnode's struct_rwlock held); db_mtx is dropped on every return path.
- *
- * Three cases are handled:
- *  - bonus buffer: the data is copied from the dnode's bonus area and
- *    the dbuf becomes DB_CACHED immediately;
- *  - no block pointer, a hole, or a level-0 block reported freed by
- *    dnode_block_freed(): a zero-filled buffer is attached, the dbuf
- *    becomes DB_CACHED and DB_RF_CACHED is set in *flags;
- *  - otherwise: the dbuf goes to DB_READ and arc_read() is issued under
- *    'zio' with dbuf_read_done() as the completion callback.
- */
-static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
-{
- blkptr_t *bp;
- zbookmark_t zb;
- uint32_t aflags = ARC_NOWAIT;
-
- ASSERT(!refcount_is_zero(&db->db_holds));
- /* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
- ASSERT(db->db_buf == NULL);
-
- if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
- dbuf_update_data(db);
- db->db_state = DB_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
- bp = NULL;
- else
- bp = db->db_blkptr;
-
- if (bp == NULL)
- dprintf_dbuf(db, "blkptr: %s\n", "NULL");
- else
- dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
-
- if (bp == NULL || BP_IS_HOLE(bp)) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- ASSERT(bp == NULL || BP_IS_HOLE(bp));
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
- bzero(db->db.db_data, db->db.db_size);
- db->db_state = DB_CACHED;
- *flags |= DB_RF_CACHED;
- mutex_exit(&db->db_mtx);
- return;
- }
-
- db->db_state = DB_READ;
- mutex_exit(&db->db_mtx);
-
- zb.zb_objset = db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- dbuf_add_ref(db, NULL); /* dropped by dbuf_read_done() */
- /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
- ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
- (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
- db->db_level > 0 ? byteswap_uint64_array :
- dmu_ot[db->db_dnode->dn_type].ot_byteswap,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- &aflags, &zb);
- if (aflags & ARC_CACHED)
- *flags |= DB_RF_CACHED;
-}
-/*
- * Make sure the contents of a held dbuf are (being) read in.
- *
- * 'flags' is a mask of DB_RF_* values; the ones examined here are:
- *  - DB_RF_HAVESTRUCT: the caller already holds dn_struct_rwlock, so it
- *    is neither taken nor dropped here;
- *  - DB_RF_NOPREFETCH: do not call dmu_zfetch();
- *  - DB_RF_NEVERWAIT: do not wait for a read or fill already in progress.
- * The flags are also handed to dbuf_read_impl().
- *
- * If 'zio' is NULL and a read has to be started, a root zio is created
- * and waited for here; otherwise the read is issued under the caller's
- * zio and is not waited for.
- *
- * Returns 0, the zio_wait() error, or EIO when the dbuf ends up
- * DB_UNCACHED after waiting for somebody else's read or fill.
- */
-int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
-{
- int err = 0;
- int havepzio = (zio != NULL);
- int prefetch;
-
- /*
- * We don't have to hold the mutex to check db_state because it
- * can't be freed while we have a hold on the buffer.
- */
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
-
- prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; /* NOTE(review): db_dnode may already have been dereferenced above, so this NULL check looks redundant -- confirm */
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size, TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
- } else if (db->db_state == DB_UNCACHED) {
- if (zio == NULL) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- }
- dbuf_read_impl(db, zio, &flags);
-
- /* dbuf_read_impl has dropped db_mtx for us */
-
- if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size, flags & DB_RF_CACHED);
-
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
-
- if (!havepzio)
- err = zio_wait(zio); /* only wait on a zio we created ourselves */
- } else {
- mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size, TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&db->db_dnode->dn_struct_rwlock);
-
- mutex_enter(&db->db_mtx);
- if ((flags & DB_RF_NEVERWAIT) == 0) {
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- cv_wait(&db->db_changed, &db->db_mtx);
- }
- if (db->db_state == DB_UNCACHED)
- err = EIO;
- }
- mutex_exit(&db->db_mtx);
- }
-
- ASSERT(err || havepzio || db->db_state == DB_CACHED);
- return (err);
-}
-/*
- * Prepare a held, non-bonus dbuf for use without reading its contents.
- * Waits for any read or fill in progress; if the dbuf is then
- * DB_UNCACHED a fresh ARC buffer is attached and the dbuf enters
- * DB_FILL, otherwise it is expected to be DB_CACHED already.
- */
-static void
-dbuf_noread(dmu_buf_impl_t *db)
-{
- ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ || db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
- db->db_state = DB_FILL;
- } else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
- }
- mutex_exit(&db->db_mtx);
-}
-
-/*
- * This is our just-in-time copy function. It makes a copy of
- * buffers that have been modified in a previous transaction
- * group, before we modify them in the current active group.
- *
- * This function is used in two places: when we are dirtying a
- * buffer for the first time in a txg, and when we are freeing
- * a range in a dnode that includes this buffer.
- *
- * Note that when we are called from dbuf_free_range() we do
- * not put a hold on the buffer, we just traverse the active
- * dbuf list for the dnode.
- *
- * The caller must hold db_mtx.  Only level-0 dbufs that currently
- * have data and do not belong to the meta-dnode object are accepted.
- */
-static void
-dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
-
- if (dr == NULL ||
- (dr->dt.dl.dr_data !=
- ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
- return; /* no dirty record, or it already has its own copy of the data */
-
- /*
- * If the last dirty record for this dbuf has not yet synced
- * and it's referencing the dbuf data, either:
- * reset the reference to point to a new copy,
- * or (if there are no active holders)
- * just null out the current db_data pointer.
- */
- ASSERT(dr->dr_txg >= txg - 2);
- if (db->db_blkid == DB_BONUS_BLKID) {
- /* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
- } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { /* more holds than dirty records */
- int size = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dr->dt.dl.dr_data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
- } else {
- dbuf_set_data(db, NULL);
- }
-}
-/*
- * Cancel the override recorded in a level-0 dirty record.
- * Nothing is done for bonus buffers or for records in the
- * DR_NOT_OVERRIDDEN state.  Otherwise the block the record was
- * overridden by is freed (unless it is a hole), the record is reset to
- * DR_NOT_OVERRIDDEN and its data buffer is released from the ARC.
- * The caller must hold db_mtx, and the record must not be in the
- * DR_IN_DMU_SYNC state.
- */
-void
-dbuf_unoverride(dbuf_dirty_record_t *dr)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- uint64_t txg = dr->dr_txg;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
- ASSERT(db->db_level == 0);
-
- if (db->db_blkid == DB_BONUS_BLKID ||
- dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
- return;
-
- /* free this block */
- if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
- /* XXX can get silent EIO here */
- (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
- txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
- }
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- /*
- * Release the already-written buffer, so we leave it in
- * a consistent dirty state. Note that all callers are
- * modifying the buffer, so they will immediately do
- * another (redundant) arc_release(). Therefore, leave
- * the buf thawed to save the effort of freezing &
- * immediately re-thawing it.
- */
- arc_release(dr->dt.dl.dr_data, db);
-}
-/*
- * Apply a free of level-0 blocks [blkid, blkid + nblks) of 'dn' in
- * transaction 'tx' to the dbufs currently on the dnode's dn_dbufs list.
- * The list is walked with dn_dbufs_mtx held.  Each affected dbuf is
- * undirtied, skipped, flagged as freed in flight, cleared, or has its
- * cached contents zeroed, depending on its state and holds.
- */
-void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db, *db_next;
- uint64_t txg = tx->tx_txg;
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db); /* fetched first: db may be removed below */
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- if (db->db_level != 0)
- continue;
- dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < blkid ||
- db->db_blkid >= blkid+nblks)
- continue;
-
- /* found a level 0 buffer in the range */
- if (dbuf_undirty(db, tx))
- continue; /* dbuf_undirty() returns nonzero when it evicted the dbuf */
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_UNCACHED ||
- db->db_state == DB_EVICTING) {
- ASSERT(db->db.db_data == NULL);
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (db->db_state == DB_READ || db->db_state == DB_FILL) {
- /* will be handled in dbuf_read_done or dbuf_rele */
- db->db_freed_in_flight = TRUE;
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (refcount_count(&db->db_holds) == 0) {
- ASSERT(db->db_buf);
- dbuf_clear(db); /* dbuf_clear() is responsible for dropping db_mtx */
- continue;
- }
- /* The dbuf is referenced */
-
- if (db->db_last_dirty != NULL) {
- dbuf_dirty_record_t *dr = db->db_last_dirty;
-
- if (dr->dr_txg == txg) {
- /*
- * This buffer is "in-use", re-adjust the file
- * size to reflect that this buffer may
- * contain new data when we sync.
- */
- if (db->db_blkid > dn->dn_maxblkid)
- dn->dn_maxblkid = db->db_blkid;
- dbuf_unoverride(dr);
- } else {
- /*
- * This dbuf is not dirty in the open context.
- * Either uncache it (if it's not referenced in
- * the open context) or reset its contents to
- * empty.
- */
- dbuf_fix_old_data(db, txg);
- }
- }
- /* clear the contents if it's cached */
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- arc_release(db->db_buf, db);
- bzero(db->db.db_data, db->db.db_size);
- arc_buf_freeze(db->db_buf);
- }
-
- mutex_exit(&db->db_mtx);
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-}
-
-/*
- * Decide whether this dbuf should be accounted as a new block.
- * Returns FALSE for objsets without a dataset (meta-objects), TRUE when
- * no birth txg is known, and otherwise the negation of
- * dsl_dataset_block_freeable() for the birth txg.  The birth txg comes
- * from the newest dirty record if there is one, else from the block
- * pointer.  The caller must hold db_mtx.
- */
-static int
-dbuf_new_block(dmu_buf_impl_t *db)
-{
-	dsl_dataset_t *dataset = db->db_objset->os_dsl_dataset;
-	uint64_t born;
-
-	/* Don't count meta-objects */
-	if (dataset == NULL)
-		return (FALSE);
-
-	/*
-	 * We don't need any locking to protect db_blkptr:
-	 * If it's syncing, then db_last_dirty will be set
-	 * so we'll ignore db_blkptr.
-	 */
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-
-	/* If we have been dirtied since the last snapshot, it's not new */
-	if (db->db_last_dirty != NULL)
-		born = db->db_last_dirty->dr_txg;
-	else if (db->db_blkptr != NULL)
-		born = db->db_blkptr->blk_birth;
-	else
-		born = 0;
-
-	if (born == 0)
-		return (TRUE);
-
-	return (!dsl_dataset_block_freeable(dataset, born));
-}
-/*
- * Change the size of a non-bonus dbuf to 'size' bytes in transaction
- * 'tx'.  The dbuf is dirtied, a new ARC buffer of the new size is
- * allocated, the first MIN(old size, new size) bytes are copied over
- * and any remainder is zeroed.  The new buffer replaces the old one
- * (and, at level 0, the data of the newest dirty record), and the size
- * difference is reported through dnode_willuse_space().
- * The caller must hold dn_struct_rwlock as writer.
- */
-void
-dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
-{
- arc_buf_t *buf, *obuf;
- int osize = db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- /* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
-
- /*
- * This call to dbuf_will_dirty() with the dn_struct_rwlock held
- * is OK, because there can be no other references to the db
- * when we are changing its size, so no concurrent DB_FILL can
- * be happening.
- */
- /*
- * XXX we should be doing a dbuf_read, checking the return
- * value and returning that up to our callers
- */
- dbuf_will_dirty(db, tx);
-
- /* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
-
- /* copy old block data to the new block */
- obuf = db->db_buf;
- bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
- /* zero the remainder */
- if (size > osize)
- bzero((uint8_t *)buf->b_data + osize, size - osize);
-
- mutex_enter(&db->db_mtx);
- dbuf_set_data(db, buf);
- VERIFY(arc_buf_remove_ref(obuf, db) == 1);
- db->db.db_size = size;
-
- if (db->db_level == 0) {
- ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
- db->db_last_dirty->dt.dl.dr_data = buf;
- }
- mutex_exit(&db->db_mtx);
-
- dnode_willuse_space(db->db_dnode, size-osize, tx);
-}
-/*
- * Mark a held dbuf dirty in the txg of 'tx' and return its dirty record.
- *
- * If the dbuf already has a dirty record for this txg, that record is
- * returned (after cancelling any override on a level-0 data block).
- * Otherwise a new record is allocated and linked into the dbuf's
- * db_last_dirty list, a hold tagged with the txg is added, and the
- * record is attached either to the parent indirect block's dirty record
- * (the parent is dirtied recursively) or, for bonus buffers and blocks
- * referenced directly from the dnode, to the dnode's dirty-record list
- * for this txg.  The dnode itself is then marked dirty.
- */
-dbuf_dirty_record_t *
-dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- dbuf_dirty_record_t **drp, *dr;
- int drop_struct_lock = FALSE;
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(!refcount_is_zero(&db->db_holds));
- DMU_TX_DIRTY_BUF(tx, db);
-
- /*
- * Shouldn't dirty a regular buffer in syncing context. Private
- * objects may be dirtied in syncing context, but only if they
- * were already pre-dirtied in open context.
- * XXX We may want to prohibit dirtying in syncing context even
- * if they did pre-dirty.
- */
- ASSERT(!dmu_tx_is_syncing(tx) ||
- BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
- dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_objset->os_dsl_dataset == NULL ||
- dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
-
- /*
- * We make this assert for private objects as well, but after we
- * check if we're already dirty. They are allowed to re-dirty
- * in syncing context.
- */
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- mutex_enter(&db->db_mtx);
- /*
- * XXX make this true for indirects too? The problem is that
- * transactions created with dmu_tx_create_assigned() from
- * syncing context don't bother holding ahead.
- */
- ASSERT(db->db_level != 0 ||
- db->db_state == DB_CACHED || db->db_state == DB_FILL);
-
- mutex_enter(&dn->dn_mtx);
- /*
- * Don't set dirtyctx to SYNC if we're just modifying this as we
- * initialize the objset.
- */
- if (dn->dn_dirtyctx == DN_UNDIRTIED &&
- !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
- dn->dn_dirtyctx =
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
- ASSERT(dn->dn_dirtyctx_firstset == NULL);
- dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
- }
- mutex_exit(&dn->dn_mtx);
-
- /*
- * If this buffer is already dirty, we're done.
- */
- drp = &db->db_last_dirty;
- ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
- db->db.db_object == DMU_META_DNODE_OBJECT);
- while (*drp && (*drp)->dr_txg > tx->tx_txg) /* skip records for newer txgs */
- drp = &(*drp)->dr_next;
- if (*drp && (*drp)->dr_txg == tx->tx_txg) {
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
- /*
- * If this buffer has already been written out,
- * we now need to reset its state.
- */
- dbuf_unoverride(*drp);
- if (db->db.db_object != DMU_META_DNODE_OBJECT)
- arc_buf_thaw(db->db_buf);
- }
- mutex_exit(&db->db_mtx);
- return (*drp);
- }
-
- /*
- * Only valid if not already dirty.
- */
- ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
-
- ASSERT3U(dn->dn_nlevels, >, db->db_level);
- ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
- dn->dn_phys->dn_nlevels > db->db_level ||
- dn->dn_next_nlevels[txgoff] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
-
- /*
- * We should only be dirtying in syncing context if it's the
- * mos, a spa os, or we're initializing the os. However, we are
- * allowed to dirty in syncing context provided we already
- * dirtied it in open context. Hence we must make this
- * assertion only if we're not already dirty.
- */
- ASSERT(!dmu_tx_is_syncing(tx) ||
- os->os_dsl_dataset == NULL ||
- !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
- !BP_IS_HOLE(os->os_rootbp));
- ASSERT(db->db.db_size != 0);
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- /*
- * If this buffer is dirty in an old transaction group we need
- * to make a copy of it so that the changes we make in this
- * transaction group won't leak out when we sync the older txg.
- */
- dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
- if (db->db_level == 0) {
- void *data_old = db->db_buf;
-
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db.db_data;
- } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
- /*
- * Release the data buffer from the cache so that we
- * can modify it without impacting possible other users
- * of this cached data block. Note that indirect
- * blocks and private objects are not released until the
- * syncing state (since they are only modified then).
- */
- arc_release(db->db_buf, db);
- dbuf_fix_old_data(db, tx->tx_txg);
- data_old = db->db_buf;
- }
- ASSERT(data_old != NULL);
- dr->dt.dl.dr_data = data_old;
- } else {
- mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&dr->dt.di.dr_children,
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
- dr->dr_dbuf = db;
- dr->dr_txg = tx->tx_txg;
- dr->dr_next = *drp; /* link in at the position found above */
- *drp = dr;
-
- /*
- * We could have been freed_in_flight between the dbuf_noread
- * and dbuf_dirty. We win, as though the dbuf_noread() had
- * happened after the free.
- */
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, db->db_blkid, 1, tx);
- mutex_exit(&dn->dn_mtx);
- db->db_freed_in_flight = FALSE;
- }
-
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
- /*
- * This buffer is now part of this txg
- */
- dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); /* the hold is tagged with the txg number */
- db->db_dirtycnt += 1;
- ASSERT3U(db->db_dirtycnt, <=, 3);
-
- mutex_exit(&db->db_mtx);
-
- if (db->db_blkid == DB_BONUS_BLKID) {
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- dnode_setdirty(dn, tx);
- return (dr);
- }
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
- }
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- if (db->db_level+1 < dn->dn_nlevels) { /* the parent is an indirect block */
- dmu_buf_impl_t *parent = db->db_parent;
- dbuf_dirty_record_t *di;
- int parent_held = FALSE;
-
- if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- parent = dbuf_hold_level(dn, db->db_level+1,
- db->db_blkid >> epbs, FTAG);
- parent_held = TRUE;
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- ASSERT3U(db->db_level+1, ==, parent->db_level);
- di = dbuf_dirty(parent, tx);
- if (parent_held)
- dbuf_rele(parent, FTAG);
-
- mutex_enter(&db->db_mtx);
- /* possible race with dbuf_undirty() */
- if (db->db_last_dirty == dr ||
- dn->dn_object == DMU_META_DNODE_OBJECT) {
- mutex_enter(&di->dt.di.dr_mtx);
- ASSERT3U(di->dr_txg, ==, tx->tx_txg);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&di->dt.di.dr_children, dr);
- mutex_exit(&di->dt.di.dr_mtx);
- dr->dr_parent = di;
- }
- mutex_exit(&db->db_mtx);
- } else { /* the block is referenced directly from the dnode */
- ASSERT(db->db_level+1 == dn->dn_nlevels);
- ASSERT(db->db_blkid < dn->dn_nblkptr);
- ASSERT(db->db_parent == NULL ||
- db->db_parent == db->db_dnode->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
- mutex_exit(&dn->dn_mtx);
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
- }
-
- dnode_setdirty(dn, tx);
- return (dr);
-}
-/*
- * Try to undo the dirtying of a non-bonus dbuf in the txg of 'tx'.
- *
- * Returns 0 without change when the dbuf has no dirty record for this
- * txg, or when it has more holds than dirty records (in which case the
- * block is only removed from the dnode's free range for this txg).
- * Otherwise the dirty record is unlinked and freed and the txg-tagged
- * hold is dropped.  Returns 1 when that was the last hold and the dbuf
- * was evicted; returns 0 otherwise.
- */
-static int
-dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- dnode_t *dn = db->db_dnode;
- uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
-
- ASSERT(txg != 0);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- mutex_enter(&db->db_mtx);
-
- /*
- * If this buffer is not dirty, we're done.
- */
- for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
- if (dr->dr_txg <= txg)
- break;
- if (dr == NULL || dr->dr_txg < txg) {
- mutex_exit(&db->db_mtx);
- return (0);
- }
- ASSERT(dr->dr_txg == txg);
-
- /*
- * If this buffer is currently held, we cannot undirty
- * it, since one of the current holders may be in the
- * middle of an update. Note that users of dbuf_undirty()
- * should not place a hold on the dbuf before the call.
- */
- if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- mutex_exit(&db->db_mtx);
- /* Make sure we don't toss this buffer at sync phase */
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, db->db_blkid, 1, tx);
- mutex_exit(&dn->dn_mtx);
- return (0);
- }
-
- dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
-
- ASSERT(db->db.db_size != 0);
-
- /* XXX would be nice to fix up dn_towrite_space[] */
-
- db->db_last_dirty = dr->dr_next; /* NOTE(review): assumes dr is the head of the list -- confirm no record for a newer txg can precede it here */
-
- if (dr->dr_parent) {
- mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
- list_remove(&dr->dr_parent->dt.di.dr_children, dr);
- mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
- } else if (db->db_level+1 == dn->dn_nlevels) {
- ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
- mutex_enter(&dn->dn_mtx);
- list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
- mutex_exit(&dn->dn_mtx);
- }
-
- if (db->db_level == 0) {
- dbuf_unoverride(dr);
-
- ASSERT(db->db_buf != NULL);
- ASSERT(dr->dt.dl.dr_data != NULL);
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- } else {
- ASSERT(db->db_buf != NULL);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
-
- if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { /* that was the last hold */
- arc_buf_t *buf = db->db_buf;
-
- ASSERT(arc_released(buf));
- dbuf_set_data(db, NULL);
- VERIFY(arc_buf_remove_ref(buf, db) == 1);
- dbuf_evict(db);
- return (1);
- }
-
- mutex_exit(&db->db_mtx);
- return (0);
-}
-
-#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
-/*
- * Read in the contents of a held dbuf and then mark it dirty in 'tx'.
- * The read is issued with DB_RF_MUST_SUCCEED, plus DB_RF_HAVESTRUCT
- * when dn_struct_rwlock is held as writer.  The results of both the
- * read and the dirty call are ignored.
- */
-void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
-	int read_flags;
-
-	ASSERT(tx->tx_txg != 0);
-	ASSERT(!refcount_is_zero(&db->db_holds));
-
-	read_flags = DB_RF_MUST_SUCCEED;
-	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
-		read_flags |= DB_RF_HAVESTRUCT;
-
-	(void) dbuf_read(db, NULL, read_flags);
-	(void) dbuf_dirty(db, tx);
-}
-/*
- * Prepare a held level-0, non-bonus buffer for modification in 'tx'
- * without reading its current contents: dbuf_noread() is used instead
- * of dbuf_read(), and the dbuf is then marked dirty.
- */
-void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- ASSERT(tx->tx_txg != 0);
- ASSERT(db->db_level == 0);
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
- dmu_tx_private_ok(tx));
-
- dbuf_noread(db);
- (void) dbuf_dirty(db, tx);
-}
-/*
- * Finish a fill.  If the dbuf is in DB_FILL it becomes DB_CACHED and
- * waiters on db_changed are woken; if a level-0 dbuf was freed while it
- * was being filled, its contents are zeroed first.  Nothing is done in
- * any other state.  'tx' is unused.
- */
-#pragma weak dmu_buf_fill_done = dbuf_fill_done
-/* ARGSUSED */
-void
-dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
- mutex_enter(&db->db_mtx);
- DBUF_VERIFY(db);
-
- if (db->db_state == DB_FILL) {
- if (db->db_level == 0 && db->db_freed_in_flight) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- /* we were freed while filling */
- /* XXX dbuf_undirty? */
- bzero(db->db.db_data, db->db.db_size);
- db->db_freed_in_flight = FALSE;
- }
- db->db_state = DB_CACHED;
- cv_broadcast(&db->db_changed);
- }
- mutex_exit(&db->db_mtx);
-}
-
-/*
- * "Clear" the contents of this dbuf. This will mark the dbuf
- * EVICTING and clear *most* of its references. Unfortunately,
- * when we are not holding the dn_dbufs_mtx, we can't clear the
- * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
- * in this case. For callers from the DMU we will usually see:
- * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
- * For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
- * Sometimes, though, we will get a mix of these two:
- * DMU: dbuf_clear()->arc_buf_evict()
- * ARC: dbuf_do_evict()->dbuf_destroy()
- *
- * The caller must hold db_mtx and the dbuf must have no holds.
- * db_mtx is released here unless arc_buf_evict() reports that the
- * dbuf is already gone.
- */
-void
-dbuf_clear(dmu_buf_impl_t *db)
-{
- dnode_t *dn = db->db_dnode;
- dmu_buf_impl_t *parent = db->db_parent;
- dmu_buf_impl_t *dndb = dn->dn_dbuf;
- int dbuf_gone = FALSE;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(refcount_is_zero(&db->db_holds));
-
- dbuf_evict_user(db);
-
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID)
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- db->db.db_data = NULL;
- db->db_state = DB_UNCACHED;
- }
-
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
- ASSERT(db->db_data_pending == NULL);
-
- db->db_state = DB_EVICTING;
- db->db_blkptr = NULL;
-
- if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
- list_remove(&dn->dn_dbufs, db);
- dnode_rele(dn, db);
- }
-
- if (db->db_buf)
- dbuf_gone = arc_buf_evict(db->db_buf);
-
- if (!dbuf_gone)
- mutex_exit(&db->db_mtx);
-
- /*
- * If this dbuf is referenced from an indirect dbuf,
- * decrement the ref count on the indirect dbuf.
- * (parent and dndb were read before db could go away.)
- */
- if (parent && parent != dndb)
- dbuf_rele(parent, db);
-}
-/*
- * Locate the block pointer for block 'blkid' at 'level' of dnode 'dn'.
- * The caller must hold dn_struct_rwlock.
- *
- * On success returns 0 with *bpp pointing at the block pointer and
- * *parentp set to the held dbuf that contains it: either the parent
- * indirect block, or the dnode's own dbuf (dn_dbuf), or NULL when the
- * block pointer lives in the dnode and dn_dbuf is NULL.
- * Returns ENOENT when the requested block lies beyond the levels or the
- * maximum block id recorded in dn_phys, or the error from holding or
- * reading the parent indirect block.  On failure *parentp and *bpp are
- * NULL.  'fail_sparse' is passed through to dbuf_hold_impl().
- */
-static int
-dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
- dmu_buf_impl_t **parentp, blkptr_t **bpp)
-{
- int nlevels, epbs;
-
- *parentp = NULL;
- *bpp = NULL;
-
- ASSERT(blkid != DB_BONUS_BLKID);
-
- if (dn->dn_phys->dn_nlevels == 0)
- nlevels = 1;
- else
- nlevels = dn->dn_phys->dn_nlevels;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; /* log2 of block pointers per indirect block */
-
- ASSERT3U(level * epbs, <, 64);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- if (level >= nlevels ||
- (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
- /* the buffer has no parent yet */
- return (ENOENT);
- } else if (level < nlevels-1) {
- /* this block is referenced from an indirect block */
- int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
- if (err)
- return (err);
- err = dbuf_read(*parentp, NULL,
- (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
- if (err) {
- dbuf_rele(*parentp, NULL);
- *parentp = NULL;
- return (err);
- }
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1)); /* index within the parent block */
- return (0);
- } else {
- /* the block is referenced from the dnode */
- ASSERT3U(level, ==, nlevels-1);
- ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
- blkid < dn->dn_phys->dn_nblkptr);
- if (dn->dn_dbuf) {
- dbuf_add_ref(dn->dn_dbuf, NULL);
- *parentp = dn->dn_dbuf;
- }
- *bpp = &dn->dn_phys->dn_blkptr[blkid];
- return (0);
- }
-}
-/*
- * Allocate and initialize a dbuf for block 'blkid' at 'level' of dnode
- * 'dn', with the given parent dbuf and block pointer.  The caller must
- * hold dn_struct_rwlock.
- *
- * A bonus dbuf is returned right away without being placed in the hash
- * table.  Any other dbuf is inserted into the hash table and onto the
- * dnode's dn_dbufs list; if dbuf_hash_insert() finds that another dbuf
- * for the same block is already present, the new one is freed and the
- * existing one is returned instead.  A newly inserted dbuf takes a
- * reference on its parent (unless the parent is the dnode's own dbuf)
- * and a hold on the dnode.
- */
-static dmu_buf_impl_t *
-dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
-{
- objset_impl_t *os = dn->dn_objset;
- dmu_buf_impl_t *db, *odb;
-
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
-
- db->db_objset = os;
- db->db.db_object = dn->dn_object;
- db->db_level = level;
- db->db_blkid = blkid;
- db->db_last_dirty = NULL;
- db->db_dirtycnt = 0;
- db->db_dnode = dn;
- db->db_parent = parent;
- db->db_blkptr = blkptr;
-
- db->db_user_ptr = NULL;
- db->db_user_data_ptr_ptr = NULL;
- db->db_evict_func = NULL;
- db->db_immediate_evict = 0;
- db->db_freed_in_flight = 0;
-
- if (blkid == DB_BONUS_BLKID) {
- ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
- db->db.db_offset = DB_BONUS_BLKID;
- db->db_state = DB_UNCACHED;
- /* the bonus dbuf is not placed in the hash table */
- return (db);
- } else {
- int blocksize =
- db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
- db->db.db_size = blocksize;
- db->db.db_offset = db->db_blkid * blocksize;
- }
-
- /*
- * Hold the dn_dbufs_mtx while we get the new dbuf
- * in the hash table *and* added to the dbufs list.
- * This prevents a possible deadlock with someone
- * trying to look up this dbuf before it's added to the
- * dn_dbufs list.
- */
- mutex_enter(&dn->dn_dbufs_mtx);
- db->db_state = DB_EVICTING; /* replaced by DB_UNCACHED once the dbuf is on dn_dbufs */
- if ((odb = dbuf_hash_insert(db)) != NULL) {
- /* someone else inserted it first */
- kmem_cache_free(dbuf_cache, db);
- mutex_exit(&dn->dn_dbufs_mtx);
- return (odb);
- }
- list_insert_head(&dn->dn_dbufs, db);
- db->db_state = DB_UNCACHED;
- mutex_exit(&dn->dn_dbufs_mtx);
-
- if (parent && parent != dn->dn_dbuf)
- dbuf_add_ref(parent, db);
-
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, db);
-
- dprintf_dbuf(db, "db=%p\n", db);
-
- return (db);
-}
-
-static int
-dbuf_do_evict(void *private)
-{
- arc_buf_t *buf = private;
- dmu_buf_impl_t *db = buf->b_private;
-
- if (!MUTEX_HELD(&db->db_mtx))
- mutex_enter(&db->db_mtx);
-
- ASSERT(refcount_is_zero(&db->db_holds));
-
- if (db->db_state != DB_EVICTING) {
- ASSERT(db->db_state == DB_CACHED);
- DBUF_VERIFY(db);
- db->db_buf = NULL;
- dbuf_evict(db);
- } else {
- mutex_exit(&db->db_mtx);
- dbuf_destroy(db);
- }
- return (0);
-}
-
-static void
-dbuf_destroy(dmu_buf_impl_t *db)
-{
- ASSERT(refcount_is_zero(&db->db_holds));
-
- if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
-
- /*
- * If this dbuf is still on the dn_dbufs list,
- * remove it from that list.
- */
- if (list_link_active(&db->db_link)) {
- mutex_enter(&dn->dn_dbufs_mtx);
- list_remove(&dn->dn_dbufs, db);
- mutex_exit(&dn->dn_dbufs_mtx);
-
- dnode_rele(dn, db);
- }
- dbuf_hash_remove(db);
- }
- db->db_parent = NULL;
- db->db_dnode = NULL;
- db->db_buf = NULL;
-
- ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_data_pending == NULL);
-
- kmem_cache_free(dbuf_cache, db);
-}
-
-void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
-{
- dmu_buf_impl_t *db = NULL;
- blkptr_t *bp = NULL;
-
- ASSERT(blkid != DB_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
-
- if (dnode_block_freed(dn, blkid))
- return;
-
- /* dbuf_find() returns with db_mtx held */
- if (db = dbuf_find(dn, 0, blkid)) {
- if (refcount_count(&db->db_holds) > 0) {
- /*
- * This dbuf is active. We assume that it is
- * already CACHED, or else about to be either
- * read or filled.
- */
- mutex_exit(&db->db_mtx);
- return;
- }
- mutex_exit(&db->db_mtx);
- db = NULL;
- }
-
- if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
- if (bp && !BP_IS_HOLE(bp)) {
- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
- zbookmark_t zb;
- zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
- dn->dn_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = dn->dn_object;
- zb.zb_level = 0;
- zb.zb_blkid = blkid;
-
- (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
- dmu_ot[dn->dn_type].ot_byteswap,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zb);
- }
- if (db)
- dbuf_rele(db, NULL);
- }
-}
-
-/*
- * Returns with db_holds incremented, and db_mtx not held.
- * Note: dn_struct_rwlock must be held.
- */
-int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
- void *tag, dmu_buf_impl_t **dbp)
-{
- dmu_buf_impl_t *db, *parent = NULL;
-
- ASSERT(blkid != DB_BONUS_BLKID);
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- ASSERT3U(dn->dn_nlevels, >, level);
-
- *dbp = NULL;
-top:
- /* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn, level, blkid);
-
- if (db == NULL) {
- blkptr_t *bp = NULL;
- int err;
-
- ASSERT3P(parent, ==, NULL);
- err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
- if (fail_sparse) {
- if (err == 0 && bp && BP_IS_HOLE(bp))
- err = ENOENT;
- if (err) {
- if (parent)
- dbuf_rele(parent, NULL);
- return (err);
- }
- }
- if (err && err != ENOENT)
- return (err);
- db = dbuf_create(dn, level, blkid, parent, bp);
- }
-
- if (db->db_buf && refcount_is_zero(&db->db_holds)) {
- arc_buf_add_ref(db->db_buf, db);
- if (db->db_buf->b_data == NULL) {
- dbuf_clear(db);
- if (parent) {
- dbuf_rele(parent, NULL);
- parent = NULL;
- }
- goto top;
- }
- ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
- }
-
- ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
-
- /*
- * If this buffer is currently syncing out, and we are are
- * still referencing it from db_data, we need to make a copy
- * of it in case we decide we want to dirty it again in this txg.
- */
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- dn->dn_object != DMU_META_DNODE_OBJECT &&
- db->db_state == DB_CACHED && db->db_data_pending) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
-
- if (dr->dt.dl.dr_data == db->db_buf) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-
- dbuf_set_data(db,
- arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db, type));
- bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
- db->db.db_size);
- }
- }
-
- (void) refcount_add(&db->db_holds, tag);
- dbuf_update_data(db);
- DBUF_VERIFY(db);
- mutex_exit(&db->db_mtx);
-
- /* NOTE: we can't rele the parent until after we drop the db_mtx */
- if (parent)
- dbuf_rele(parent, NULL);
-
- ASSERT3P(db->db_dnode, ==, dn);
- ASSERT3U(db->db_blkid, ==, blkid);
- ASSERT3U(db->db_level, ==, level);
- *dbp = db;
-
- return (0);
-}
-
-dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
-{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
-}
-
-dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
-{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
-}
-
-dmu_buf_impl_t *
-dbuf_create_bonus(dnode_t *dn)
-{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
-}
-
-#pragma weak dmu_buf_add_ref = dbuf_add_ref
-void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
-{
- int64_t holds = refcount_add(&db->db_holds, tag);
- ASSERT(holds > 1);
-}
-
-#pragma weak dmu_buf_rele = dbuf_rele
-void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
-{
- int64_t holds;
-
- mutex_enter(&db->db_mtx);
- DBUF_VERIFY(db);
-
- holds = refcount_remove(&db->db_holds, tag);
- ASSERT(holds >= 0);
-
- /*
- * We can't freeze indirects if there is a possibility that they
- * may be modified in the current syncing context.
- */
- if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
- arc_buf_freeze(db->db_buf);
-
- if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_immediate_evict)
- dbuf_evict_user(db);
-
- if (holds == 0) {
- if (db->db_blkid == DB_BONUS_BLKID) {
- mutex_exit(&db->db_mtx);
- dnode_rele(db->db_dnode, db);
- } else if (db->db_buf == NULL) {
- /*
- * This is a special case: we never associated this
- * dbuf with any data allocated from the ARC.
- */
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
- dbuf_evict(db);
- } else if (arc_released(db->db_buf)) {
- arc_buf_t *buf = db->db_buf;
- /*
- * This dbuf has anonymous data associated with it.
- */
- dbuf_set_data(db, NULL);
- VERIFY(arc_buf_remove_ref(buf, db) == 1);
- dbuf_evict(db);
- } else {
- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
- mutex_exit(&db->db_mtx);
- }
- } else {
- mutex_exit(&db->db_mtx);
- }
-}
-
-#pragma weak dmu_buf_refcount = dbuf_refcount
-uint64_t
-dbuf_refcount(dmu_buf_impl_t *db)
-{
- return (refcount_count(&db->db_holds));
-}
-
-void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *evict_func)
-{
- return (dmu_buf_update_user(db_fake, NULL, user_ptr,
- user_data_ptr_ptr, evict_func));
-}
-
-void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *evict_func)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_immediate_evict = TRUE;
- return (dmu_buf_update_user(db_fake, NULL, user_ptr,
- user_data_ptr_ptr, evict_func));
-}
-
-void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
- void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(db->db_level == 0);
-
- ASSERT((user_ptr == NULL) == (evict_func == NULL));
-
- mutex_enter(&db->db_mtx);
-
- if (db->db_user_ptr == old_user_ptr) {
- db->db_user_ptr = user_ptr;
- db->db_user_data_ptr_ptr = user_data_ptr_ptr;
- db->db_evict_func = evict_func;
-
- dbuf_update_data(db);
- } else {
- old_user_ptr = db->db_user_ptr;
- }
-
- mutex_exit(&db->db_mtx);
- return (old_user_ptr);
-}
-
-void *
-dmu_buf_get_user(dmu_buf_t *db_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(!refcount_is_zero(&db->db_holds));
-
- return (db->db_user_ptr);
-}
-
-static void
-dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
-{
- /* ASSERT(dmu_tx_is_syncing(tx) */
- ASSERT(MUTEX_HELD(&db->db_mtx));
-
- if (db->db_blkptr != NULL)
- return;
-
- if (db->db_level == dn->dn_phys->dn_nlevels-1) {
- /*
- * This buffer was allocated at a time when there was
- * no available blkptrs from the dnode, or it was
- * inappropriate to hook it in (i.e., nlevels mis-match).
- */
- ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
- ASSERT(db->db_parent == NULL);
- db->db_parent = dn->dn_dbuf;
- db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
- DBUF_VERIFY(db);
- } else {
- dmu_buf_impl_t *parent = db->db_parent;
- int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- ASSERT(dn->dn_phys->dn_nlevels > 1);
- if (parent == NULL) {
- mutex_exit(&db->db_mtx);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, db, &parent);
- rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- db->db_parent = parent;
- }
- db->db_blkptr = (blkptr_t *)parent->db.db_data +
- (db->db_blkid & ((1ULL << epbs) - 1));
- DBUF_VERIFY(db);
- }
-}
-
-static void
-dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- zio_t *zio;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
-
- ASSERT(db->db_level > 0);
- DBUF_VERIFY(db);
-
- if (db->db_buf == NULL) {
- mutex_exit(&db->db_mtx);
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- mutex_enter(&db->db_mtx);
- }
- ASSERT3U(db->db_state, ==, DB_CACHED);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- ASSERT(db->db_buf != NULL);
-
- dbuf_check_blkptr(dn, db);
-
- db->db_data_pending = dr;
-
- arc_release(db->db_buf, db);
- mutex_exit(&db->db_mtx);
-
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
- zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
-
- zio = dr->dr_zio;
- mutex_enter(&dr->dt.di.dr_mtx);
- dbuf_sync_list(&dr->dt.di.dr_children, tx);
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- mutex_exit(&dr->dt.di.dr_mtx);
- zio_nowait(zio);
-}
-
-static void
-dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
-{
- arc_buf_t **datap = &dr->dt.dl.dr_data;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- uint64_t txg = tx->tx_txg;
- int checksum, compress;
- int blksz;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
-
- mutex_enter(&db->db_mtx);
- /*
- * To be synced, we must be dirtied. But we
- * might have been freed after the dirty.
- */
- if (db->db_state == DB_UNCACHED) {
- /* This buffer has been freed since it was dirtied */
- ASSERT(db->db.db_data == NULL);
- } else if (db->db_state == DB_FILL) {
- /* This buffer was freed and is now being re-filled */
- ASSERT(db->db.db_data != dr->dt.dl.dr_data);
- } else {
- ASSERT3U(db->db_state, ==, DB_CACHED);
- }
- DBUF_VERIFY(db);
-
- /*
- * If this is a bonus buffer, simply copy the bonus data into the
- * dnode. It will be written out when the dnode is synced (and it
- * will be synced, since it must have been dirty for dbuf_sync to
- * be called).
- */
- if (db->db_blkid == DB_BONUS_BLKID) {
- dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
- ASSERT(*datap != NULL);
- ASSERT3U(db->db_level, ==, 0);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
- if (*datap != db->db.db_data)
- zio_buf_free(*datap, DN_MAX_BONUSLEN);
- db->db_data_pending = NULL;
- drp = &db->db_last_dirty;
- while (*drp != dr)
- drp = &(*drp)->dr_next;
- ASSERT((*drp)->dr_next == NULL);
- *drp = NULL;
- if (dr->dr_dbuf->db_level != 0) {
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
- return;
- }
-
- /*
- * If this buffer is in the middle of an immdiate write,
- * wait for the synchronous IO to complete.
- */
- while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
- }
-
- dbuf_check_blkptr(dn, db);
-
- /*
- * If this dbuf has already been written out via an immediate write,
- * just complete the write by copying over the new block pointer and
- * updating the accounting via the write-completion functions.
- */
- if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- zio_t zio_fake;
-
- zio_fake.io_private = &db;
- zio_fake.io_error = 0;
- zio_fake.io_bp = db->db_blkptr;
- zio_fake.io_bp_orig = *db->db_blkptr;
- zio_fake.io_txg = txg;
-
- *db->db_blkptr = dr->dt.dl.dr_overridden_by;
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- db->db_data_pending = dr;
- dr->dr_zio = &zio_fake;
- mutex_exit(&db->db_mtx);
-
- if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
- dbuf_write_ready(&zio_fake, db->db_buf, db);
- dbuf_write_done(&zio_fake, db->db_buf, db);
-
- return;
- }
-
- blksz = arc_buf_size(*datap);
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- /*
- * If this buffer is currently "in use" (i.e., there are
- * active holds and db_data still references it), then make
- * a copy before we start the write so that any modifications
- * from the open txg will not leak into this write.
- *
- * NOTE: this copy does not need to be made for objects only
- * modified in the syncing context (e.g. DNONE_DNODE blocks).
- */
- if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
- bcopy(db->db.db_data, (*datap)->b_data, blksz);
- }
- } else {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(db->db_buf, db);
- }
-
- ASSERT(*datap != NULL);
- db->db_data_pending = dr;
-
- mutex_exit(&db->db_mtx);
-
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
-
- dbuf_write(dr, *datap, checksum, compress, tx);
-
- ASSERT(!list_link_active(&dr->dr_dirty_node));
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
- list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
- else
- zio_nowait(dr->dr_zio);
-}
-
-void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
-{
- dbuf_dirty_record_t *dr;
-
- while (dr = list_head(list)) {
- if (dr->dr_zio != NULL) {
- /*
- * If we find an already initialized zio then we
- * are processing the meta-dnode, and we have finished.
- * The dbufs for all dnodes are put back on the list
- * during processing, so that we can zio_wait()
- * these IOs after initiating all child IOs.
- */
- ASSERT3U(dr->dr_dbuf->db.db_object, ==,
- DMU_META_DNODE_OBJECT);
- break;
- }
- list_remove(list, dr);
- if (dr->dr_dbuf->db_level > 0)
- dbuf_sync_indirect(dr, tx);
- else
- dbuf_sync_leaf(dr, tx);
- }
-}
-
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_t zb;
- zio_t *zio;
- int zio_flags;
-
- if (parent != dn->dn_dbuf) {
- ASSERT(parent && parent->db_data_pending);
- ASSERT(db->db_level == parent->db_level-1);
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- if (BP_IS_OLDER(db->db_blkptr, txg))
- dsl_dataset_block_kill(
- os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
- dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
- dmu_get_replication_level(os, &zb, dn->dn_type), txg,
- db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
-}
-
-/* ARGSUSED */
-static void
-dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- dnode_t *dn = db->db_dnode;
- objset_impl_t *os = dn->dn_objset;
- blkptr_t *bp_orig = &zio->io_bp_orig;
- uint64_t fill = 0;
- int old_size, new_size, i;
-
- dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
- old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, zio->io_bp);
-
- dnode_diduse_space(dn, new_size-old_size);
-
- if (BP_IS_HOLE(zio->io_bp)) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
- return;
- }
-
- mutex_enter(&db->db_mtx);
-
- if (db->db_level == 0) {
- mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid)
- dn->dn_phys->dn_maxblkid = db->db_blkid;
- mutex_exit(&dn->dn_mtx);
-
- if (dn->dn_type == DMU_OT_DNODE) {
- dnode_phys_t *dnp = db->db.db_data;
- for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
- i--, dnp++) {
- if (dnp->dn_type != DMU_OT_NONE)
- fill++;
- }
- } else {
- fill = 1;
- }
- } else {
- blkptr_t *bp = db->db.db_data;
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- ASSERT3U(BP_GET_LSIZE(bp), ==,
- db->db_level == 1 ? dn->dn_datablksz :
- (1<<dn->dn_phys->dn_indblkshift));
- fill += bp->blk_fill;
- }
- }
-
- db->db_blkptr->blk_fill = fill;
- BP_SET_TYPE(db->db_blkptr, dn->dn_type);
- BP_SET_LEVEL(db->db_blkptr, db->db_level);
-
- mutex_exit(&db->db_mtx);
-
- /* We must do this after we've set the bp's type and level */
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- dsl_dataset_block_born(ds, zio->io_bp, tx);
- }
-}
-
-/* ARGSUSED */
-static void
-dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
-{
- dmu_buf_impl_t *db = vdb;
- uint64_t txg = zio->io_txg;
- dbuf_dirty_record_t **drp, *dr;
-
- ASSERT3U(zio->io_error, ==, 0);
-
- mutex_enter(&db->db_mtx);
-
- drp = &db->db_last_dirty;
- while (*drp != db->db_data_pending)
- drp = &(*drp)->dr_next;
- ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
- ASSERT((*drp)->dr_txg == txg);
- ASSERT((*drp)->dr_next == NULL);
- dr = *drp;
- *drp = NULL;
-
- if (db->db_level == 0) {
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
- ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
- if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
- } else {
- dnode_t *dn = db->db_dnode;
-
- ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- db->db.db_size);
- ASSERT3U(dn->dn_phys->dn_maxblkid
- >> (db->db_level * epbs), >=, db->db_blkid);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- }
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
-
- cv_broadcast(&db->db_changed);
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- db->db_data_pending = NULL;
- mutex_exit(&db->db_mtx);
-
- dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
-
- dbuf_rele(db, (void *)(uintptr_t)txg);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c
deleted file mode 100644
index d3be6b4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dsl_prop.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-
-const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { byteswap_uint8_array, TRUE, "unallocated" },
- { zap_byteswap, TRUE, "object directory" },
- { byteswap_uint64_array, TRUE, "object array" },
- { byteswap_uint8_array, TRUE, "packed nvlist" },
- { byteswap_uint64_array, TRUE, "packed nvlist size" },
- { byteswap_uint64_array, TRUE, "bplist" },
- { byteswap_uint64_array, TRUE, "bplist header" },
- { byteswap_uint64_array, TRUE, "SPA space map header" },
- { byteswap_uint64_array, TRUE, "SPA space map" },
- { byteswap_uint64_array, TRUE, "ZIL intent log" },
- { dnode_buf_byteswap, TRUE, "DMU dnode" },
- { dmu_objset_byteswap, TRUE, "DMU objset" },
- { byteswap_uint64_array, TRUE, "DSL directory" },
- { zap_byteswap, TRUE, "DSL directory child map"},
- { zap_byteswap, TRUE, "DSL dataset snap map" },
- { zap_byteswap, TRUE, "DSL props" },
- { byteswap_uint64_array, TRUE, "DSL dataset" },
- { zfs_znode_byteswap, TRUE, "ZFS znode" },
- { zfs_acl_byteswap, TRUE, "ZFS ACL" },
- { byteswap_uint8_array, FALSE, "ZFS plain file" },
- { zap_byteswap, TRUE, "ZFS directory" },
- { zap_byteswap, TRUE, "ZFS master node" },
- { zap_byteswap, TRUE, "ZFS delete queue" },
- { byteswap_uint8_array, FALSE, "zvol object" },
- { zap_byteswap, TRUE, "zvol prop" },
- { byteswap_uint8_array, FALSE, "other uint8[]" },
- { byteswap_uint64_array, FALSE, "other uint64[]" },
- { zap_byteswap, TRUE, "other ZAP" },
- { zap_byteswap, TRUE, "persistent error log" },
- { byteswap_uint8_array, TRUE, "SPA history" },
- { byteswap_uint64_array, TRUE, "SPA history offsets" },
- { zap_byteswap, TRUE, "Pool properties" },
-};
-
-int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
-{
- dnode_t *dn;
- uint64_t blkid;
- dmu_buf_impl_t *db;
- int err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- blkid = dbuf_whichblock(dn, offset);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold(dn, blkid, tag);
- rw_exit(&dn->dn_struct_rwlock);
- if (db == NULL) {
- err = EIO;
- } else {
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
- if (err) {
- dbuf_rele(db, tag);
- db = NULL;
- }
- }
-
- dnode_rele(dn, FTAG);
- *dbp = &db->db;
- return (err);
-}
-
-int
-dmu_bonus_max(void)
-{
- return (DN_MAX_BONUSLEN);
-}
-
-/*
- * returns ENOENT, EIO, or 0.
- */
-int
-dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
-{
- dnode_t *dn;
- int err, count;
- dmu_buf_impl_t *db;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_bonus == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- }
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
- dnode_rele(dn, FTAG);
-
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
-
- *dbp = &db->db;
- return (0);
-}
-
-/*
- * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
- * to take a held dnode rather than <os, object> -- the lookup is wasteful,
- * and can induce severe lock contention when writing to several files
- * whose dnodes are in the same block.
- */
-static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dmu_buf_t **dbp;
- uint64_t blkid, nblks, i;
- uint32_t flags;
- int err;
- zio_t *zio;
-
- ASSERT(length <= DMU_MAX_ACCESS);
-
- flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
- if (length > zfetch_array_rd_sz)
- flags |= DB_RF_NOPREFETCH;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
- P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
- } else {
- if (offset + length > dn->dn_datablksz) {
- zfs_panic_recover("zfs: accessing past end of object "
- "%llx/%llx (size=%u access=%llu+%llu)",
- (longlong_t)dn->dn_objset->
- os_dsl_dataset->ds_object,
- (longlong_t)dn->dn_object, dn->dn_datablksz,
- (longlong_t)offset, (longlong_t)length);
- return (EIO);
- }
- nblks = 1;
- }
- dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
-
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
- blkid = dbuf_whichblock(dn, offset);
- for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
- if (db == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- dmu_buf_rele_array(dbp, nblks, tag);
- zio_nowait(zio);
- return (EIO);
- }
- /* initiate async i/o */
- if (read) {
- rw_exit(&dn->dn_struct_rwlock);
- (void) dbuf_read(db, zio, flags);
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- }
- dbp[i] = &db->db;
- }
- rw_exit(&dn->dn_struct_rwlock);
-
- /* wait for async i/o */
- err = zio_wait(zio);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
-
- /* wait for other io to complete */
- if (read) {
- for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL)
- cv_wait(&db->db_changed, &db->db_mtx);
- if (db->db_state == DB_UNCACHED)
- err = EIO;
- mutex_exit(&db->db_mtx);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
- }
- }
-
- *numbufsp = nblks;
- *dbpp = dbp;
- return (0);
-}
-
-static int
-dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
- err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp);
-
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
-{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
- int err;
-
- err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
- numbufsp, dbpp);
-
- return (err);
-}
-
-void
-dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
-{
- int i;
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
- if (numbufs == 0)
- return;
-
- for (i = 0; i < numbufs; i++) {
- if (dbp[i])
- dbuf_rele(dbp[i], tag);
- }
-
- kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
-}
-
-void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
-{
- dnode_t *dn;
- uint64_t blkid;
- int nblks, i, err;
-
- if (zfs_prefetch_disable)
- return;
-
- if (len == 0) { /* they're interested in the bonus buffer */
- dn = os->os->os_meta_dnode;
-
- if (object == 0 || object >= DN_MAX_OBJECT)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid);
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
-
- /*
- * XXX - Note, if the dnode for the requested object is not
- * already cached, we will do a *synchronous* read in the
- * dnode_hold() call. The same is true for any indirects.
- */
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err != 0)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
- P2ALIGN(offset, 1<<blkshift)) >> blkshift;
- } else {
- nblks = (offset < dn->dn_datablksz);
- }
-
- if (nblks != 0) {
- blkid = dbuf_whichblock(dn, offset);
- for (i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid+i);
- }
-
- rw_exit(&dn->dn_struct_rwlock);
-
- dnode_rele(dn, FTAG);
-}
-
-int
-dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- ASSERT(offset < UINT64_MAX);
- ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
- dnode_free_range(dn, offset, size, tx);
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-int
-dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
-{
- dnode_t *dn;
- dmu_buf_t **dbp;
- int numbufs, i, err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
-
- /*
- * Deal with odd block sizes, where there can't be data past the first
- * block. If we ever do the tail block optimization, we will need to
- * handle that here as well.
- */
- if (dn->dn_datablkshift == 0) {
- int newsz = offset > dn->dn_datablksz ? 0 :
- MIN(size, dn->dn_datablksz - offset);
- bzero((char *)buf + newsz, size - newsz);
- size = newsz;
- }
-
- while (size > 0) {
- uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
-
- /*
- * NB: we could do this block-at-a-time, but it's nice
- * to be reading in parallel.
- */
- err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
- TRUE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- bcopy((char *)db->db_data + bufoff, buf, tocpy);
-
- offset += tocpy;
- size -= tocpy;
- buf = (char *)buf + tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- }
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-void
-dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
-
- if (size == 0)
- return;
-
- VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp));
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- bcopy(buf, (char *)db->db_data + bufoff, tocpy);
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- offset += tocpy;
- size -= tocpy;
- buf = (char *)buf + tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-}
-
-#ifdef _KERNEL
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
-{
- dmu_buf_t **dbp;
- int numbufs, i, err;
-
- /*
- * NB: we could do this block-at-a-time, but it's nice
- * to be reading in parallel.
- */
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
- &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = uio->uio_loffset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
- if (err)
- break;
-
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
-
- return (err);
-}
-
-int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
- dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
- int err = 0;
-
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
- FALSE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
-
- ASSERT(size > 0);
-
- bufoff = uio->uio_loffset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- /*
- * XXX uiomove could block forever (eg. nfs-backed
- * pages). There needs to be a uiolockdown() function
- * to lock the pages in memory, so that uiomove won't
- * block.
- */
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_WRITE, uio);
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- if (err)
- break;
-
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-
-#ifndef __FreeBSD__
-int
-dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- page_t *pp, dmu_tx_t *tx)
-{
- dmu_buf_t **dbp;
- int numbufs, i;
- int err;
-
- if (size == 0)
- return (0);
-
- err = dmu_buf_hold_array(os, object, offset, size,
- FALSE, FTAG, &numbufs, &dbp);
- if (err)
- return (err);
-
- for (i = 0; i < numbufs; i++) {
- int tocpy, copied, thiscpy;
- int bufoff;
- dmu_buf_t *db = dbp[i];
- caddr_t va;
-
- ASSERT(size > 0);
- ASSERT3U(db->db_size, >=, PAGESIZE);
-
- bufoff = offset - db->db_offset;
- tocpy = (int)MIN(db->db_size - bufoff, size);
-
- ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
-
- if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
- else
- dmu_buf_will_dirty(db, tx);
-
- for (copied = 0; copied < tocpy; copied += PAGESIZE) {
- ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
- thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = ppmapin(pp, PROT_READ, (caddr_t)-1);
- bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- ppmapout(va);
- pp = pp->p_next;
- bufoff += PAGESIZE;
- }
-
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
-
- if (err)
- break;
-
- offset += tocpy;
- size -= tocpy;
- }
- dmu_buf_rele_array(dbp, numbufs, FTAG);
- return (err);
-}
-#endif /* !__FreeBSD__ */
-#endif /* _KERNEL */
-
-typedef struct {
- dbuf_dirty_record_t *dr;
- dmu_sync_cb_t *done;
- void *arg;
-} dmu_sync_arg_t;
-
-/* ARGSUSED */
-static void
-dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
-{
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dmu_sync_cb_t *done = in->done;
-
- if (!BP_IS_HOLE(zio->io_bp)) {
- zio->io_bp->blk_fill = 1;
- BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
- BP_SET_LEVEL(zio->io_bp, 0);
- }
-
- mutex_enter(&db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
- cv_broadcast(&db->db_changed);
- mutex_exit(&db->db_mtx);
-
- if (done)
- done(&(db->db), in->arg);
-
- kmem_free(in, sizeof (dmu_sync_arg_t));
-}
-
-/*
- * Intent log support: sync the block associated with db to disk.
- * N.B. and XXX: the caller is responsible for making sure that the
- * data isn't changing while dmu_sync() is writing it.
- *
- * Return values:
- *
- * EEXIST: this txg has already been synced, so there's nothing to to.
- * The caller should not log the write.
- *
- * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
- * The caller should not log the write.
- *
- * EALREADY: this block is already in the process of being synced.
- * The caller should track its progress (somehow).
- *
- * EINPROGRESS: the IO has been initiated.
- * The caller should log this blkptr in the callback.
- *
- * 0: completed. Sets *bp to the blkptr just written.
- * The caller should log this blkptr immediately.
- */
-int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
- blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- objset_impl_t *os = db->db_objset;
- dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
- tx_state_t *tx = &dp->dp_tx;
- dbuf_dirty_record_t *dr;
- dmu_sync_arg_t *in;
- zbookmark_t zb;
- zio_t *zio;
- int zio_flags;
- int err;
-
- ASSERT(BP_IS_HOLE(bp));
- ASSERT(txg != 0);
-
-
- dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
- txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
-
- /*
- * XXX - would be nice if we could do this without suspending...
- */
- txg_suspend(dp);
-
- /*
- * If this txg already synced, there's nothing to do.
- */
- if (txg <= tx->tx_synced_txg) {
- txg_resume(dp);
- /*
- * If we're running ziltest, we need the blkptr regardless.
- */
- if (txg > spa_freeze_txg(dp->dp_spa)) {
- /* if db_blkptr == NULL, this was an empty write */
- if (db->db_blkptr)
- *bp = *db->db_blkptr; /* structure assignment */
- return (0);
- }
- return (EEXIST);
- }
-
- mutex_enter(&db->db_mtx);
-
- if (txg == tx->tx_syncing_txg) {
- while (db->db_data_pending) {
- /*
- * IO is in-progress. Wait for it to finish.
- * XXX - would be nice to be able to somehow "attach"
- * this zio to the parent zio passed in.
- */
- cv_wait(&db->db_changed, &db->db_mtx);
- if (!db->db_data_pending &&
- db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
- /*
- * IO was compressed away
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- ASSERT(db->db_data_pending ||
- (db->db_blkptr && db->db_blkptr->blk_birth == txg));
- }
-
- if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
- /*
- * IO is already completed.
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- }
-
- dr = db->db_last_dirty;
- while (dr && dr->dr_txg > txg)
- dr = dr->dr_next;
- if (dr == NULL || dr->dr_txg < txg) {
- /*
- * This dbuf isn't dirty, must have been free_range'd.
- * There's no need to log writes to freed blocks, so we're done.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (ENOENT);
- }
-
- ASSERT(dr->dr_txg == txg);
- if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
- /*
- * We have already issued a sync write for this buffer.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (EALREADY);
- } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * This buffer has already been synced. It could not
- * have been dirtied since, or we would have cleared the state.
- */
- *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
-
- dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
- in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- in->dr = dr;
- in->done = done;
- in->arg = arg;
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
-
- zb.zb_objset = os->os_dsl_dataset->ds_object;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- zio = arc_write(pio, os->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
- dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
- txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
-
- if (pio) {
- zio_nowait(zio);
- err = EINPROGRESS;
- } else {
- err = zio_wait(zio);
- ASSERT(err == 0);
- }
- return (err);
-}
-
-int
-dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- err = dnode_set_blksz(dn, size, ibs, tx);
- dnode_rele(dn, FTAG);
- return (err);
-}
-
-void
-dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
- dn->dn_checksum = checksum;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-}
-
-void
-dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx)
-{
- dnode_t *dn;
-
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os->os, object, FTAG, &dn);
- ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
- dn->dn_compress = compress;
- dnode_setdirty(dn, tx);
- dnode_rele(dn, FTAG);
-}
-
-int
-dmu_get_replication_level(objset_impl_t *os,
- zbookmark_t *zb, dmu_object_type_t ot)
-{
- int ncopies = os->os_copies;
-
- /* If it's the mos, it should have max copies set. */
- ASSERT(zb->zb_objset != 0 ||
- ncopies == spa_max_replication(os->os_spa));
-
- if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
- ncopies++;
- return (MIN(ncopies, spa_max_replication(os->os_spa)));
-}
-
-int
-dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
-{
- dnode_t *dn;
- int i, err;
-
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- /*
- * Sync any current changes before
- * we go trundling through the block pointers.
- */
- for (i = 0; i < TXG_SIZE; i++) {
- if (list_link_active(&dn->dn_dirty_link[i]))
- break;
- }
- if (i != TXG_SIZE) {
- dnode_rele(dn, FTAG);
- txg_wait_synced(dmu_objset_pool(os), 0);
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
- }
-
- err = dnode_next_offset(dn, hole, off, 1, 1, 0);
- dnode_rele(dn, FTAG);
-
- return (err);
-}
-
-void
-dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
-{
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- mutex_enter(&dn->dn_mtx);
-
- doi->doi_data_block_size = dn->dn_datablksz;
- doi->doi_metadata_block_size = dn->dn_indblkshift ?
- 1ULL << dn->dn_indblkshift : 0;
- doi->doi_indirection = dn->dn_nlevels;
- doi->doi_checksum = dn->dn_checksum;
- doi->doi_compress = dn->dn_compress;
- doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_bonus_type = dn->dn_bonustype;
-
- mutex_exit(&dn->dn_mtx);
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-/*
- * Get information on a DMU object.
- * If doi is NULL, just indicates whether the object exists.
- */
-int
-dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
-{
- dnode_t *dn;
- int err = dnode_hold(os->os, object, FTAG, &dn);
-
- if (err)
- return (err);
-
- if (doi != NULL)
- dmu_object_info_from_dnode(dn, doi);
-
- dnode_rele(dn, FTAG);
- return (0);
-}
-
-/*
- * As above, but faster; can be used when you have a held dbuf in hand.
- */
-void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
-{
- dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
-}
-
-/*
- * Faster still when you only care about the size.
- * This is specifically optimized for zfs_getattr().
- */
-void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
-{
- dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
-
- *blksize = dn->dn_datablksz;
- /* add 1 for dnode space */
- *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + 1;
-}
-
-void
-byteswap_uint64_array(void *vbuf, size_t size)
-{
- uint64_t *buf = vbuf;
- size_t count = size >> 3;
- int i;
-
- ASSERT((size & 7) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_64(buf[i]);
-}
-
-void
-byteswap_uint32_array(void *vbuf, size_t size)
-{
- uint32_t *buf = vbuf;
- size_t count = size >> 2;
- int i;
-
- ASSERT((size & 3) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_32(buf[i]);
-}
-
-void
-byteswap_uint16_array(void *vbuf, size_t size)
-{
- uint16_t *buf = vbuf;
- size_t count = size >> 1;
- int i;
-
- ASSERT((size & 1) == 0);
-
- for (i = 0; i < count; i++)
- buf[i] = BSWAP_16(buf[i]);
-}
-
-/* ARGSUSED */
-void
-byteswap_uint8_array(void *vbuf, size_t size)
-{
-}
-
-void
-dmu_init(void)
-{
- dbuf_init();
- dnode_init();
- arc_init();
-}
-
-void
-dmu_fini(void)
-{
- arc_fini();
- dnode_fini();
- dbuf_fini();
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
deleted file mode 100644
index 93168cc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dnode.h>
-
-uint64_t
-dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- objset_impl_t *osi = os->os;
- uint64_t object;
- uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
- (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
- dnode_t *dn = NULL;
- int restarted = B_FALSE;
-
- mutex_enter(&osi->os_obj_lock);
- for (;;) {
- object = osi->os_obj_next;
- /*
- * Each time we polish off an L2 bp worth of dnodes
- * (2^13 objects), move to another L2 bp that's still
- * reasonably sparse (at most 1/4 full). Look from the
- * beginning once, but after that keep looking from here.
- * If we can't find one, just keep going from here.
- */
- if (P2PHASE(object, L2_dnode_count) == 0) {
- uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
- int error = dnode_next_offset(osi->os_meta_dnode,
- B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2, 0);
- restarted = B_TRUE;
- if (error == 0)
- object = offset >> DNODE_SHIFT;
- }
- osi->os_obj_next = ++object;
-
- /*
- * XXX We should check for an i/o error here and return
- * up to our caller. Actually we should pre-read it in
- * dmu_tx_assign(), but there is currently no mechanism
- * to do so.
- */
- (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
- FTAG, &dn);
- if (dn)
- break;
-
- if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- osi->os_obj_next = object - 1;
- }
-
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
- dnode_rele(dn, FTAG);
-
- mutex_exit(&osi->os_obj_lock);
-
- dmu_tx_add_new_object(tx, os, object);
- return (object);
-}
-
-int
-dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
- return (EBADF);
-
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
- if (err)
- return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
- dnode_rele(dn, FTAG);
-
- dmu_tx_add_new_object(tx, os, object);
- return (0);
-}
-
-int
-dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
- return (EBADF);
-
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
- FTAG, &dn);
- if (err)
- return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
- dnode_rele(dn, FTAG);
-
- return (0);
-}
-
-int
-dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
-{
- dnode_t *dn;
- int err;
-
- ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
-
- err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
- FTAG, &dn);
- if (err)
- return (err);
-
- ASSERT(dn->dn_type != DMU_OT_NONE);
- dnode_free(dn, tx);
- dnode_rele(dn, FTAG);
-
- return (0);
-}
-
-int
-dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
-{
- uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
- int error;
-
- error = dnode_next_offset(os->os->os_meta_dnode,
- hole, &offset, 0, DNODES_PER_BLOCK, txg);
-
- *objectp = offset >> DNODE_SHIFT;
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
deleted file mode 100644
index 378fe8c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ /dev/null
@@ -1,1037 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dbuf.h>
-#include <sys/zvol.h>
-#include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/dmu_impl.h>
-
-
-spa_t *
-dmu_objset_spa(objset_t *os)
-{
- return (os->os->os_spa);
-}
-
-zilog_t *
-dmu_objset_zil(objset_t *os)
-{
- return (os->os->os_zil);
-}
-
-dsl_pool_t *
-dmu_objset_pool(objset_t *os)
-{
- dsl_dataset_t *ds;
-
- if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir)
- return (ds->ds_dir->dd_pool);
- else
- return (spa_get_dsl(os->os->os_spa));
-}
-
-dsl_dataset_t *
-dmu_objset_ds(objset_t *os)
-{
- return (os->os->os_dsl_dataset);
-}
-
-dmu_objset_type_t
-dmu_objset_type(objset_t *os)
-{
- return (os->os->os_phys->os_type);
-}
-
-void
-dmu_objset_name(objset_t *os, char *buf)
-{
- dsl_dataset_name(os->os->os_dsl_dataset, buf);
-}
-
-uint64_t
-dmu_objset_id(objset_t *os)
-{
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
-
- return (ds ? ds->ds_object : 0);
-}
-
-static void
-checksum_changed_cb(void *arg, uint64_t newval)
-{
- objset_impl_t *osi = arg;
-
- /*
- * Inheritance should have been done by now.
- */
- ASSERT(newval != ZIO_CHECKSUM_INHERIT);
-
- osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
-}
-
-static void
-compression_changed_cb(void *arg, uint64_t newval)
-{
- objset_impl_t *osi = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval != ZIO_COMPRESS_INHERIT);
-
- osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
-}
-
-static void
-copies_changed_cb(void *arg, uint64_t newval)
-{
- objset_impl_t *osi = arg;
-
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval > 0);
- ASSERT(newval <= spa_max_replication(osi->os_spa));
-
- osi->os_copies = newval;
-}
-
-void
-dmu_objset_byteswap(void *buf, size_t size)
-{
- objset_phys_t *osp = buf;
-
- ASSERT(size == sizeof (objset_phys_t));
- dnode_byteswap(&osp->os_meta_dnode);
- byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
- osp->os_type = BSWAP_64(osp->os_type);
-}
-
-int
-dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- objset_impl_t **osip)
-{
- objset_impl_t *winner, *osi;
- int i, err, checksum;
-
- osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
- osi->os.os = osi;
- osi->os_dsl_dataset = ds;
- osi->os_spa = spa;
- osi->os_rootbp = bp;
- if (!BP_IS_HOLE(osi->os_rootbp)) {
- uint32_t aflags = ARC_WAIT;
- zbookmark_t zb;
- zb.zb_objset = ds ? ds->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
-
- dprintf_bp(osi->os_rootbp, "reading %s", "");
- err = arc_read(NULL, spa, osi->os_rootbp,
- dmu_ot[DMU_OT_OBJSET].ot_byteswap,
- arc_getbuf_func, &osi->os_phys_buf,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
- if (err) {
- kmem_free(osi, sizeof (objset_impl_t));
- return (err);
- }
- osi->os_phys = osi->os_phys_buf->b_data;
- arc_release(osi->os_phys_buf, &osi->os_phys_buf);
- } else {
- osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
- &osi->os_phys_buf, ARC_BUFC_METADATA);
- osi->os_phys = osi->os_phys_buf->b_data;
- bzero(osi->os_phys, sizeof (objset_phys_t));
- }
-
- /*
- * Note: the changed_cb will be called once before the register
- * func returns, thus changing the checksum/compression from the
- * default (fletcher2/off). Snapshots don't need to know, and
- * registering would complicate clone promotion.
- */
- if (ds && ds->ds_phys->ds_num_children == 0) {
- err = dsl_prop_register(ds, "checksum",
- checksum_changed_cb, osi);
- if (err == 0)
- err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
- if (err == 0)
- err = dsl_prop_register(ds, "copies",
- copies_changed_cb, osi);
- if (err) {
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
- &osi->os_phys_buf) == 1);
- kmem_free(osi, sizeof (objset_impl_t));
- return (err);
- }
- } else if (ds == NULL) {
- /* It's the meta-objset. */
- osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_compress = ZIO_COMPRESS_LZJB;
- osi->os_copies = spa_max_replication(spa);
- }
-
- osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
-
- /*
- * Metadata always gets compressed and checksummed.
- * If the data checksum is multi-bit correctable, and it's not
- * a ZBT-style checksum, then it's suitable for metadata as well.
- * Otherwise, the metadata checksum defaults to fletcher4.
- */
- checksum = osi->os_checksum;
-
- if (zio_checksum_table[checksum].ci_correctable &&
- !zio_checksum_table[checksum].ci_zbt)
- osi->os_md_checksum = checksum;
- else
- osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_md_compress = ZIO_COMPRESS_LZJB;
-
- for (i = 0; i < TXG_SIZE; i++) {
- list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[i]));
- list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[i]));
- }
- list_create(&osi->os_dnodes, sizeof (dnode_t),
- offsetof(dnode_t, dn_link));
- list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
-
- mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
-
- osi->os_meta_dnode = dnode_special_open(osi,
- &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
-
- if (ds != NULL) {
- winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
- if (winner) {
- dmu_objset_evict(ds, osi);
- osi = winner;
- }
- }
-
- *osip = osi;
- return (0);
-}
-
-/* called from zpl */
-int
-dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp)
-{
- dsl_dataset_t *ds;
- int err;
- objset_t *os;
- objset_impl_t *osi;
-
- os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
- err = dsl_dataset_open(name, mode, os, &ds);
- if (err) {
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
-
- osi = dsl_dataset_get_user_ptr(ds);
- if (osi == NULL) {
- err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, &ds->ds_phys->ds_bp, &osi);
- if (err) {
- dsl_dataset_close(ds, mode, os);
- kmem_free(os, sizeof (objset_t));
- return (err);
- }
- }
-
- os->os = osi;
- os->os_mode = mode;
-
- if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) {
- dmu_objset_close(os);
- return (EINVAL);
- }
- *osp = os;
- return (0);
-}
-
-void
-dmu_objset_close(objset_t *os)
-{
- dsl_dataset_close(os->os->os_dsl_dataset, os->os_mode, os);
- kmem_free(os, sizeof (objset_t));
-}
-
-int
-dmu_objset_evict_dbufs(objset_t *os, int try)
-{
- objset_impl_t *osi = os->os;
- dnode_t *dn;
-
- mutex_enter(&osi->os_lock);
-
- /* process the mdn last, since the other dnodes have holds on it */
- list_remove(&osi->os_dnodes, osi->os_meta_dnode);
- list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
-
- /*
- * Find the first dnode with holds. We have to do this dance
- * because dnode_add_ref() only works if you already have a
- * hold. If there are no holds then it has no dbufs so OK to
- * skip.
- */
- for (dn = list_head(&osi->os_dnodes);
- dn && refcount_is_zero(&dn->dn_holds);
- dn = list_next(&osi->os_dnodes, dn))
- continue;
- if (dn)
- dnode_add_ref(dn, FTAG);
-
- while (dn) {
- dnode_t *next_dn = dn;
-
- do {
- next_dn = list_next(&osi->os_dnodes, next_dn);
- } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
- if (next_dn)
- dnode_add_ref(next_dn, FTAG);
-
- mutex_exit(&osi->os_lock);
- if (dnode_evict_dbufs(dn, try)) {
- dnode_rele(dn, FTAG);
- if (next_dn)
- dnode_rele(next_dn, FTAG);
- return (1);
- }
- dnode_rele(dn, FTAG);
- mutex_enter(&osi->os_lock);
- dn = next_dn;
- }
- mutex_exit(&osi->os_lock);
- return (0);
-}
-
-void
-dmu_objset_evict(dsl_dataset_t *ds, void *arg)
-{
- objset_impl_t *osi = arg;
- objset_t os;
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL);
- ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL);
- }
-
- if (ds && ds->ds_phys->ds_num_children == 0) {
- VERIFY(0 == dsl_prop_unregister(ds, "checksum",
- checksum_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "compression",
- compression_changed_cb, osi));
- VERIFY(0 == dsl_prop_unregister(ds, "copies",
- copies_changed_cb, osi));
- }
-
- /*
- * We should need only a single pass over the dnode list, since
- * nothing can be added to the list at this point.
- */
- os.os = osi;
- (void) dmu_objset_evict_dbufs(&os, 0);
-
- ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
- ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
- ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
-
- dnode_special_close(osi->os_meta_dnode);
- zil_free(osi->os_zil);
-
- VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
- mutex_destroy(&osi->os_lock);
- mutex_destroy(&osi->os_obj_lock);
- kmem_free(osi, sizeof (objset_impl_t));
-}
-
-/* called from dsl for meta-objset */
-objset_impl_t *
-dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
- dmu_objset_type_t type, dmu_tx_t *tx)
-{
- objset_impl_t *osi;
- dnode_t *mdn;
-
- ASSERT(dmu_tx_is_syncing(tx));
- VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
- mdn = osi->os_meta_dnode;
-
- dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
- DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
-
- /*
- * We don't want to have to increase the meta-dnode's nlevels
- * later, because then we could do it in quescing context while
- * we are also accessing it in open context.
- *
- * This precaution is not necessary for the MOS (ds == NULL),
- * because the MOS is only updated in syncing context.
- * This is most fortunate: the MOS is the only objset that
- * needs to be synced multiple times as spa_sync() iterates
- * to convergence, so minimizing its dn_nlevels matters.
- */
- if (ds != NULL) {
- int levels = 1;
-
- /*
- * Determine the number of levels necessary for the meta-dnode
- * to contain DN_MAX_OBJECT dnodes.
- */
- while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
- (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
- DN_MAX_OBJECT * sizeof (dnode_phys_t))
- levels++;
-
- mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
- mdn->dn_nlevels = levels;
- }
-
- ASSERT(type != DMU_OST_NONE);
- ASSERT(type != DMU_OST_ANY);
- ASSERT(type < DMU_OST_NUMTYPES);
- osi->os_phys->os_type = type;
-
- dsl_dataset_dirty(ds, tx);
-
- return (osi);
-}
-
-struct oscarg {
- void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
- void *userarg;
- dsl_dataset_t *clone_parent;
- const char *lastname;
- dmu_objset_type_t type;
-};
-
-/*
- * Check half of the objset-create sync task.
- *
- * arg1 is the parent dsl_dir_t, arg2 the struct oscarg.  Returns EEXIST
- * if the parent already has a child with the requested name, the
- * zap_lookup() error if the lookup fails for any reason other than
- * ENOENT, EXDEV if the clone origin is in another pool, EINVAL if the
- * clone origin has no children (i.e. is not a snapshot), else 0.
- */
-/* ARGSUSED */
-static int
-dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dir_t *parent_dir = arg1;
-	struct oscarg *args = arg2;
-	dsl_dataset_t *origin = args->clone_parent;
-	uint64_t existing;
-	int rc;
-
-	rc = zap_lookup(parent_dir->dd_pool->dp_meta_objset,
-	    parent_dir->dd_phys->dd_child_dir_zapobj,
-	    args->lastname, sizeof (uint64_t), 1, &existing);
-	if (rc == 0)
-		return (EEXIST);
-	if (rc != ENOENT)
-		return (rc);
-
-	if (origin == NULL)
-		return (0);
-
-	/* You can't clone across pools. */
-	if (origin->ds_dir->dd_pool != parent_dir->dd_pool)
-		return (EXDEV);
-
-	/* You can only clone snapshots, not the head datasets. */
-	if (origin->ds_phys->ds_num_children == 0)
-		return (EINVAL);
-
-	return (0);
-}
-
-/*
- * Sync half of the objset-create sync task.
- *
- * Creates the dataset named oa->lastname under the parent dir (arg1).
- * If the new dataset's root block pointer is still a hole, it is a
- * fresh objset rather than a clone: build its objset_impl_t and run
- * the caller's hook, if one was supplied.
- */
-static void
-dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dir_t *parent_dir = arg1;
-	struct oscarg *args = arg2;
-	dsl_dataset_t *newds;
-	blkptr_t *rootbp;
-	uint64_t newobj;
-
-	ASSERT(dmu_tx_is_syncing(tx));
-
-	newobj = dsl_dataset_create_sync(parent_dir, args->lastname,
-	    args->clone_parent, tx);
-
-	VERIFY(0 == dsl_dataset_open_obj(parent_dir->dd_pool, newobj, NULL,
-	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &newds));
-
-	rootbp = dsl_dataset_get_blkptr(newds);
-	if (BP_IS_HOLE(rootbp)) {
-		/* This is an empty dmu_objset; not a clone. */
-		objset_impl_t *impl;
-
-		impl = dmu_objset_create_impl(dsl_dataset_get_spa(newds),
-		    newds, rootbp, args->type, tx);
-
-		if (args->userfunc)
-			args->userfunc(&impl->os, args->userarg, tx);
-	}
-
-	dsl_dataset_close(newds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
-}
-
-/*
- * Create a new objset called 'name', optionally as a clone of
- * clone_parent, and optionally run func(os, arg, tx) on it from the
- * sync task.
- *
- * Returns EEXIST when 'name' resolves completely to an existing dir,
- * EINVAL when clone_parent's type differs from 'type', or the error
- * from dsl_dir_open() / dsl_sync_task_do().
- */
-int
-dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent,
-    void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
-{
-	struct oscarg oa = { 0 };
-	const char *leaf;
-	dsl_dir_t *parent;
-	int err;
-
-	ASSERT(strchr(name, '@') == NULL);
-
-	err = dsl_dir_open(name, FTAG, &parent, &leaf);
-	if (err)
-		return (err);
-
-	/* no unresolved component left over: the name already exists */
-	if (leaf == NULL) {
-		err = EEXIST;
-		goto done;
-	}
-
-	dprintf("name=%s\n", name);
-
-	oa.userfunc = func;
-	oa.userarg = arg;
-	oa.lastname = leaf;
-	oa.type = type;
-
-	if (clone_parent != NULL) {
-		/* You can't clone to a different type. */
-		if (clone_parent->os->os_phys->os_type != type) {
-			err = EINVAL;
-			goto done;
-		}
-		oa.clone_parent = clone_parent->os->os_dsl_dataset;
-	}
-
-	err = dsl_sync_task_do(parent->dd_pool, dmu_objset_create_check,
-	    dmu_objset_create_sync, parent, &oa, 5);
-
-done:
-	dsl_dir_close(parent, FTAG);
-	return (err);
-}
-
-/*
- * Destroy the dataset called 'name'.
- *
- * If the objset can be opened exclusively, its intent log is destroyed
- * first.  Failure to open is ignored and the result of
- * dsl_dataset_destroy() is returned either way.
- */
-int
-dmu_objset_destroy(const char *name)
-{
-	objset_t *victim;
-
-	/*
-	 * If it looks like we'll be able to destroy it, and there's
-	 * an unplayed replay log sitting around, destroy the log.
-	 * It would be nicer to do this in dsl_dataset_destroy_sync(),
-	 * but the replay log objset is modified in open context.
-	 */
-	if (dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE,
-	    &victim) == 0) {
-		zil_destroy(dmu_objset_zil(victim), B_FALSE);
-		dmu_objset_close(victim);
-	}
-
-	return (dsl_dataset_destroy(name));
-}
-
-/*
- * Roll the dataset called 'name' back via dsl_dataset_rollback().
- *
- * The objset is opened exclusively, and its ZIL is suspended and
- * immediately resumed before the rollback is attempted.  Returns the
- * first error encountered, or 0.
- */
-int
-dmu_objset_rollback(const char *name)
-{
-	objset_t *os;
-	int err;
-
-	err = dmu_objset_open(name, DMU_OST_ANY,
-	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
-	if (err != 0)
-		return (err);
-
-	err = zil_suspend(dmu_objset_zil(os));
-	if (err == 0) {
-		zil_resume(dmu_objset_zil(os));
-		/* XXX uncache everything? */
-		err = dsl_dataset_rollback(os->os->os_dsl_dataset);
-	}
-
-	dmu_objset_close(os);
-	return (err);
-}
-
-struct snaparg { /* state shared by dmu_objset_snapshot() and its per-dataset callback */
- dsl_sync_task_group_t *dstg; /* group collecting one snapshot task per dataset */
- char *snapname; /* snapshot name handed to each task */
- char failed[MAXPATHLEN]; /* name of the dataset most recently attempted / that failed */
-};
-
-/*
- * Queue a snapshot task for the single dataset 'name'.
- *
- * 'arg' is the struct snaparg.  The dataset name is recorded in
- * sn->failed up front so the caller can report it on error.  On
- * success the objset is left open with its ZIL suspended; the caller
- * must resume and close it once the task group has run.  Returns
- * EBUSY if the objset is inconsistent, or the open / zil_suspend()
- * error.
- */
-static int
-dmu_objset_snapshot_one(char *name, void *arg)
-{
-	struct snaparg *sn = arg;
-	dmu_objset_stats_t stat;
-	objset_t *os;
-	int err;
-
-	(void) strcpy(sn->failed, name);
-
-	err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_STANDARD, &os);
-	if (err != 0)
-		return (err);
-
-	/* If the objset is in an inconsistent state, return busy. */
-	dmu_objset_fast_stat(os, &stat);
-	if (stat.dds_inconsistent) {
-		dmu_objset_close(os);
-		return (EBUSY);
-	}
-
-	/*
-	 * NB: we need to wait for all in-flight changes to get to disk,
-	 * so that we snapshot those changes.  zil_suspend does this as
-	 * a side effect.
-	 */
-	err = zil_suspend(dmu_objset_zil(os));
-	if (err != 0) {
-		dmu_objset_close(os);
-		return (err);
-	}
-
-	dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
-	    dsl_dataset_snapshot_sync, os, sn->snapname, 3);
-	return (0);
-}
-
-/*
- * Snapshot fsname@snapname, and with 'recursive' every descendant too.
- *
- * One sync task per dataset is queued by dmu_objset_snapshot_one() and
- * the group is then executed as a whole.  On failure, fsname is
- * overwritten with the name of the dataset that failed.
- *
- * Every dataset that was queued is left open with its ZIL suspended by
- * dmu_objset_snapshot_one(), so the resume/close loop below must run
- * even when queueing stopped part-way through a recursive walk.
- * Jumping past it would leak those opens and leave the ZILs suspended.
- */
-int
-dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
-{
-	dsl_sync_task_t *dst;
-	struct snaparg sn = { 0 };
-	char *cp;
-	spa_t *spa;
-	int err;
-
-	(void) strcpy(sn.failed, fsname);
-
-	/* Open the pool: the part of fsname before the first '/'. */
-	cp = strchr(fsname, '/');
-	if (cp) {
-		*cp = '\0';
-		err = spa_open(fsname, &spa, FTAG);
-		*cp = '/';
-	} else {
-		err = spa_open(fsname, &spa, FTAG);
-	}
-	if (err)
-		return (err);
-
-	sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	sn.snapname = snapname;
-
-	if (recursive) {
-		err = dmu_objset_find(fsname,
-		    dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
-	} else {
-		err = dmu_objset_snapshot_one(fsname, &sn);
-	}
-
-	/* Only run the group if every dataset was queued successfully. */
-	if (err == 0)
-		err = dsl_sync_task_group_wait(sn.dstg);
-
-	/*
-	 * Undo what dmu_objset_snapshot_one() did for each queued
-	 * dataset, whether or not the group was executed.
-	 * NOTE(review): assumes dst_err is zero for tasks that never
-	 * ran, so sn.failed keeps the name recorded at queue time.
-	 */
-	for (dst = list_head(&sn.dstg->dstg_tasks); dst;
-	    dst = list_next(&sn.dstg->dstg_tasks, dst)) {
-		objset_t *os = dst->dst_arg1;
-		if (dst->dst_err)
-			dmu_objset_name(os, sn.failed);
-		zil_resume(dmu_objset_zil(os));
-		dmu_objset_close(os);
-	}
-
-	if (err)
-		(void) strcpy(fsname, sn.failed);
-	dsl_sync_task_group_destroy(sn.dstg);
-	spa_close(spa, FTAG);
-	return (err);
-}
-
-/*
- * Drain 'list', syncing each dnode on it.  Each dnode is removed from
- * the list before dnode_sync() is called on it.
- */
-static void
-dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
-{
-	dnode_t *dn;
-
-	for (dn = list_head(list); dn != NULL; dn = list_head(list)) {
-		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
-		ASSERT(dn->dn_dbuf->db_data_pending);
-
-		/*
-		 * Initialize dn_zio outside dnode_sync()
-		 * to accommodate meta-dnode
-		 */
-		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
-		ASSERT(dn->dn_zio);
-
-		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
-		list_remove(list, dn);
-		dnode_sync(dn, tx);
-	}
-}
-
-/*
- * Callback passed to arc_write() by dmu_objset_sync(); 'arg' is the
- * objset_impl_t.  Recomputes the root block pointer's fill count as
- * one (for the meta-dnode itself) plus the fill counts of the
- * meta-dnode's block pointers.
- */
-/* ARGSUSED */
-static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
-	objset_impl_t *os = arg;
-	dnode_phys_t *mdn = &os->os_phys->os_meta_dnode;
-	blkptr_t *rootbp = os->os_rootbp;
-	int idx = 0;
-
-	/* Update rootbp fill count. */
-	rootbp->blk_fill = 1;	/* count the meta-dnode */
-	while (idx < mdn->dn_nblkptr) {
-		rootbp->blk_fill += mdn->dn_blkptr[idx].blk_fill;
-		idx++;
-	}
-}
-
-/*
- * Callback passed to arc_write() by dmu_objset_sync(); 'arg' is the
- * objset_impl_t.  Stamps the written block pointer as a level-0
- * DMU_OT_OBJSET block.  If the block's identity changed, the new
- * block is accounted as born, after killing the old one when it was
- * born in this same txg.  Finally releases os_phys_buf.
- */
-/* ARGSUSED */
-static void
-killer(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
-	objset_impl_t *os = arg;
-	blkptr_t *newbp = zio->io_bp;
-	blkptr_t *oldbp = &zio->io_bp_orig;
-
-	ASSERT3U(zio->io_error, ==, 0);
-
-	BP_SET_TYPE(newbp, DMU_OT_OBJSET);
-	BP_SET_LEVEL(newbp, 0);
-
-	if (!DVA_EQUAL(BP_IDENTITY(newbp), BP_IDENTITY(oldbp))) {
-		if (oldbp->blk_birth == os->os_synctx->tx_txg) {
-			dsl_dataset_block_kill(os->os_dsl_dataset,
-			    oldbp, NULL, os->os_synctx);
-		}
-		dsl_dataset_block_born(os->os_dsl_dataset, newbp,
-		    os->os_synctx);
-	}
-
-	arc_release(os->os_phys_buf, &os->os_phys_buf);
-}
-
-/*
- * called from dsl
- *
- * Sync one objset for the txg of 'tx': set up the root block write as a
- * child of 'pio', sync the meta-dnode and this txg's free and dirty
- * dnodes beneath it, issue the meta-dnode's pending dirty-record zios,
- * sync the ZIL, and only then issue the root block zio itself.
- * Must be called in syncing context (asserted below).
- */
-void
-dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
-{
- int txgoff;
- zbookmark_t zb;
- zio_t *zio;
- list_t *list;
- dbuf_dirty_record_t *dr;
- int zio_flags;
-
- dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* XXX the write_done callback should really give us the tx... */
- os->os_synctx = tx;
-
- if (os->os_dsl_dataset == NULL) {
- /*
- * This is the MOS. If we have upgraded,
- * spa_max_replication() could change, so reset
- * os_copies here.
- */
- os->os_copies = spa_max_replication(os->os_spa);
- }
-
- /*
- * Create the root block IO.  The bookmark uses object 0, level -1
- * and the dataset's object number (0 when there is no dataset).
- * If the current root block is older than this txg it is killed
- * before being rewritten.  ready() and killer() are handed to
- * arc_write() as its callbacks, with os as their argument.
- */
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[DMU_OT_OBJSET].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
- os->os_rootbp, pio, tx);
- zio = arc_write(pio, os->os_spa, os->os_md_checksum,
- os->os_md_compress,
- dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
-
- /*
- * Sync meta-dnode - the parent IO for the sync is the root block
- */
- os->os_meta_dnode->dn_zio = zio;
- dnode_sync(os->os_meta_dnode, tx);
-
- txgoff = tx->tx_txg & TXG_MASK; /* index of this txg's per-txg lists */
-
- dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
- dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
-
- list = &os->os_meta_dnode->dn_dirty_records[txgoff];
- while (dr = list_head(list)) { /* drain the meta-dnode's dirty records for this txg */
- ASSERT(dr->dr_dbuf->db_level == 0);
- list_remove(list, dr);
- if (dr->dr_zio)
- zio_nowait(dr->dr_zio);
- }
- /*
- * Free intent log blocks up to this tx.
- */
- zil_sync(os->os_zil, tx);
- zio_nowait(zio); /* the root block zio is issued last */
-}
-
-/*
- * Report space and object usage for 'os' by forwarding all four
- * out-parameters to dsl_dataset_space() on the objset's dataset.
- */
-void
-dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
-    uint64_t *usedobjsp, uint64_t *availobjsp)
-{
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
-
-	dsl_dataset_space(ds, refdbytesp, availbytesp, usedobjsp, availobjsp);
-}
-
-/*
- * Return dsl_dataset_fsid_guid() for the dataset backing 'os'.
- */
-uint64_t
-dmu_objset_fsid_guid(objset_t *os)
-{
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
-
-	return (dsl_dataset_fsid_guid(ds));
-}
-
-/*
- * Fill in 'stat' for 'os': the objset type always, plus whatever
- * dsl_dataset_fast_stat() adds when the objset has a dataset.
- */
-void
-dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
-{
-	objset_impl_t *impl = os->os;
-
-	stat->dds_type = impl->os_phys->os_type;
-	if (impl->os_dsl_dataset != NULL)
-		dsl_dataset_fast_stat(impl->os_dsl_dataset, stat);
-}
-
-/*
- * Add the statistics for 'os' to the nvlist 'nv': the dataset's
- * stats when there is a dataset, then the objset type as
- * ZFS_PROP_TYPE.  Only an objset of type DMU_OST_META may lack a
- * dataset (asserted).
- */
-void
-dmu_objset_stats(objset_t *os, nvlist_t *nv)
-{
-	objset_impl_t *impl = os->os;
-
-	ASSERT(impl->os_dsl_dataset ||
-	    impl->os_phys->os_type == DMU_OST_META);
-
-	if (impl->os_dsl_dataset != NULL)
-		dsl_dataset_stats(impl->os_dsl_dataset, nv);
-
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
-	    impl->os_phys->os_type);
-}
-
-/*
- * Return whether 'os' is a snapshot, as reported by
- * dsl_dataset_is_snapshot().  An objset without a dataset is never a
- * snapshot.
- */
-int
-dmu_objset_is_snapshot(objset_t *os)
-{
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
-
-	if (ds == NULL)
-		return (B_FALSE);
-	return (dsl_dataset_is_snapshot(ds));
-}
-
-/*
- * Fetch the next snapshot name of 'os', resuming from the serialized
- * ZAP cursor position *offp.
- *
- * On success the name is copied into 'name' (capacity 'namelen'), the
- * entry's first integer is stored in *idp when idp is non-NULL, and
- * *offp is advanced.  Returns ENOENT when the dataset has no snapshot
- * ZAP or the cursor yields no entry, and ENAMETOOLONG when the name
- * does not fit.
- */
-int
-dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
-    uint64_t *idp, uint64_t *offp)
-{
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
-	zap_cursor_t cursor;
-	zap_attribute_t attr;
-	int rc = ENOENT;
-
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
-		return (ENOENT);
-
-	zap_cursor_init_serialized(&cursor,
-	    ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, *offp);
-
-	if (zap_cursor_retrieve(&cursor, &attr) != 0)
-		goto done;
-
-	if (strlen(attr.za_name) + 1 > namelen) {
-		rc = ENAMETOOLONG;
-		goto done;
-	}
-
-	(void) strcpy(name, attr.za_name);
-	if (idp)
-		*idp = attr.za_first_integer;
-	zap_cursor_advance(&cursor);
-	*offp = zap_cursor_serialize(&cursor);
-	rc = 0;
-
-done:
-	zap_cursor_fini(&cursor);
-	return (rc);
-}
-
-/*
- * Fetch the next child directory name of 'os', resuming from the
- * serialized ZAP cursor position *offp.
- *
- * On success the name is copied into 'name' (capacity 'namelen'), the
- * entry's first integer is stored in *idp when idp is non-NULL, and
- * *offp is advanced.  Returns ENOENT when 'os' is not the head dataset
- * of its dir or the cursor yields no entry, and ENAMETOOLONG when the
- * name does not fit.
- */
-int
-dmu_dir_list_next(objset_t *os, int namelen, char *name,
-    uint64_t *idp, uint64_t *offp)
-{
-	dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
-	zap_cursor_t cursor;
-	zap_attribute_t attr;
-	int rc = ENOENT;
-
-	/* there is no next dir on a snapshot! */
-	if (os->os->os_dsl_dataset->ds_object !=
-	    dd->dd_phys->dd_head_dataset_obj)
-		return (ENOENT);
-
-	zap_cursor_init_serialized(&cursor,
-	    dd->dd_pool->dp_meta_objset,
-	    dd->dd_phys->dd_child_dir_zapobj, *offp);
-
-	if (zap_cursor_retrieve(&cursor, &attr) != 0)
-		goto done;
-
-	if (strlen(attr.za_name) + 1 > namelen) {
-		rc = ENAMETOOLONG;
-		goto done;
-	}
-
-	(void) strcpy(name, attr.za_name);
-	if (idp)
-		*idp = attr.za_first_integer;
-	zap_cursor_advance(&cursor);
-	*offp = zap_cursor_serialize(&cursor);
-	rc = 0;
-
-done:
-	zap_cursor_fini(&cursor);
-	return (rc);
-}
-
-/*
- * Find all objsets under name, and for each, call 'func(child_name, arg)'.
- *
- * With DS_FIND_CHILDREN the function recurses into every child dir
- * first; with DS_FIND_SNAPSHOTS func is called on every snapshot of
- * 'name'; finally func is called on 'name' itself when its dir has a
- * head dataset.  The walk stops at the first non-zero return from func
- * (or from a recursive call) and that value is returned.
- */
-int
-dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
-{
- dsl_dir_t *dd;
- objset_t *os;
- uint64_t snapobj;
- zap_cursor_t zc;
- zap_attribute_t *attr;
- char *child;
- int do_self, err;
-
- err = dsl_dir_open(name, FTAG, &dd, NULL);
- if (err)
- return (err);
-
- /* NB: the $MOS dir doesn't have a head dataset */
- do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
- attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-
- /*
- * Iterate over all children.
- */
- if (flags & DS_FIND_CHILDREN) {
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT(attr->za_integer_length == sizeof (uint64_t));
- ASSERT(attr->za_num_integers == 1);
-
- /*
- * Build the child's full name: this dir's name, a
- * separating '/', then the child's own name.
- */
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
- (void) strcat(child, "/");
- (void) strcat(child, attr->za_name);
- err = dmu_objset_find(child, func, arg, flags);
- kmem_free(child, MAXPATHLEN);
- if (err)
- break;
- }
- zap_cursor_fini(&zc);
-
- if (err) {
- dsl_dir_close(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- return (err);
- }
- }
-
- /*
- * Iterate over all snapshots.  A failure to open 'name' here is
- * not reported; the snapshots are simply skipped.
- */
- if ((flags & DS_FIND_SNAPSHOTS) &&
- dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
-
- snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
- dmu_objset_close(os);
-
- for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
- zap_cursor_retrieve(&zc, attr) == 0;
- (void) zap_cursor_advance(&zc)) {
- ASSERT(attr->za_integer_length == sizeof (uint64_t));
- ASSERT(attr->za_num_integers == 1);
-
- child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- /* XXX could probably just use name here */
- dsl_dir_name(dd, child);
- (void) strcat(child, "@");
- (void) strcat(child, attr->za_name);
- err = func(child, arg);
- kmem_free(child, MAXPATHLEN);
- if (err)
- break;
- }
- zap_cursor_fini(&zc);
- }
-
- dsl_dir_close(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
-
- if (err)
- return (err);
-
- /*
- * Apply to self if appropriate.
- */
- if (do_self)
- err = func(name, arg);
- return (err);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
deleted file mode 100644
index 3e55dc3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ /dev/null
@@ -1,1009 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/zio_checksum.h>
-
-struct backuparg { /* state for one dmu_sendbackup() run */
- dmu_replay_record_t *drr; /* scratch record, zeroed and refilled for each record written */
- kthread_t *td; /* thread whose credentials are used for the write */
- struct file *fp; /* destination of the stream */
- objset_t *os; /* objset being sent */
- zio_cksum_t zc; /* running fletcher-4 checksum of every byte written */
- int err; /* result of the most recent write in dump_bytes() */
-};
-
-/*
- * Append 'len' bytes from 'buf' to the stream.
- *
- * The bytes are folded into the running checksum ba->zc before being
- * written.  'len' must be a multiple of 8 (asserted).  The result of
- * the write is stored in ba->err and returned; outside the kernel
- * nothing is written and EOPNOTSUPP is reported.
- */
-static int
-dump_bytes(struct backuparg *ba, void *buf, int len)
-{
-	struct iovec iov;
-	struct uio wuio;
-
-	ASSERT3U(len % 8, ==, 0);
-
-	fletcher_4_incremental_native(buf, len, &ba->zc);
-
-	/* a single kernel-space segment covering the whole buffer */
-	iov.iov_base = buf;
-	iov.iov_len = len;
-
-	wuio.uio_iov = &iov;
-	wuio.uio_iovcnt = 1;
-	wuio.uio_resid = len;
-	wuio.uio_offset = (off_t)-1;
-	wuio.uio_segflg = UIO_SYSSPACE;
-	wuio.uio_rw = UIO_WRITE;
-	wuio.uio_td = ba->td;
-#ifdef _KERNEL
-	if (ba->fp->f_type == DTYPE_VNODE)
-		bwillwrite();
-	ba->err = fo_write(ba->fp, &wuio, ba->td->td_ucred, 0, ba->td);
-#else
-	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
-	ba->err = EOPNOTSUPP;
-#endif
-
-	return (ba->err);
-}
-
-/*
- * Write a DRR_FREE record for the byte range [offset, offset + length)
- * of 'object'.  Returns EINTR if the write fails, else 0.
- */
-static int
-dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
-    uint64_t length)
-{
-	dmu_replay_record_t *rec = ba->drr;
-	struct drr_free *body = &rec->drr_u.drr_free;
-
-	/* write a FREE record */
-	bzero(rec, sizeof (dmu_replay_record_t));
-	rec->drr_type = DRR_FREE;
-	body->drr_object = object;
-	body->drr_offset = offset;
-	body->drr_length = length;
-
-	return (dump_bytes(ba, rec, sizeof (dmu_replay_record_t)) ?
-	    EINTR : 0);
-}
-
-/*
- * Write a DRR_WRITE record header followed by 'blksz' bytes of 'data'
- * for 'object' at 'offset'.  Returns EINTR if either write fails,
- * else 0.
- */
-static int
-dump_data(struct backuparg *ba, dmu_object_type_t type,
-    uint64_t object, uint64_t offset, int blksz, void *data)
-{
-	dmu_replay_record_t *rec = ba->drr;
-	struct drr_write *body = &rec->drr_u.drr_write;
-
-	/* write a DATA record */
-	bzero(rec, sizeof (dmu_replay_record_t));
-	rec->drr_type = DRR_WRITE;
-	body->drr_object = object;
-	body->drr_type = type;
-	body->drr_offset = offset;
-	body->drr_length = blksz;
-
-	/* header first, then the payload; stop at the first failure */
-	if (dump_bytes(ba, rec, sizeof (dmu_replay_record_t)) ||
-	    dump_bytes(ba, data, blksz))
-		return (EINTR);
-	return (0);
-}
-
-/*
- * Write a DRR_FREEOBJECTS record covering 'numobjs' objects starting
- * at 'firstobj'.  Returns EINTR if the write fails, else 0.
- */
-static int
-dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
-{
-	dmu_replay_record_t *rec = ba->drr;
-	struct drr_freeobjects *body = &rec->drr_u.drr_freeobjects;
-
-	/* write a FREEOBJECTS record */
-	bzero(rec, sizeof (dmu_replay_record_t));
-	rec->drr_type = DRR_FREEOBJECTS;
-	body->drr_firstobj = firstobj;
-	body->drr_numobjs = numobjs;
-
-	return (dump_bytes(ba, rec, sizeof (dmu_replay_record_t)) ?
-	    EINTR : 0);
-}
-
-/*
- * Emit the stream records describing one dnode.
- *
- * A NULL or unallocated (DMU_OT_NONE) dnode becomes a one-object
- * FREEOBJECTS record.  Otherwise an OBJECT record is written, followed
- * by the bonus buffer padded to a multiple of 8 bytes, followed by a
- * FREE record covering everything past the last block.
- * Returns EINTR if any write fails, else 0.
- *
- * No separate ba->err check is needed at the end: dump_bytes() returns
- * exactly the value it stores in ba->err, so reaching the final return
- * already implies ba->err is zero.
- */
-static int
-dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
-{
-	struct drr_object *drro;
-	uint32_t blksz;
-
-	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
-		return (dump_freeobjects(ba, object, 1));
-
-	/* data block size in bytes; dn_datablkszsec counts 512-byte sectors */
-	blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
-
-	/* write an OBJECT record */
-	bzero(ba->drr, sizeof (dmu_replay_record_t));
-	ba->drr->drr_type = DRR_OBJECT;
-	drro = &ba->drr->drr_u.drr_object;
-	drro->drr_object = object;
-	drro->drr_type = dnp->dn_type;
-	drro->drr_bonustype = dnp->dn_bonustype;
-	drro->drr_blksz = blksz;
-	drro->drr_bonuslen = dnp->dn_bonuslen;
-	drro->drr_checksum = dnp->dn_checksum;
-	drro->drr_compress = dnp->dn_compress;
-
-	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
-		return (EINTR);
-
-	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
-		return (EINTR);
-
-	/* free anything past the end of the file */
-	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * blksz, -1ULL))
-		return (EINTR);
-
-	return (0);
-}
-
-/*
- * Number of bytes of object data covered by one block pointer at the
- * given level of dnp's block tree: the data block size multiplied by
- * the number of block pointers per indirect block, once per level.
- * Both arguments are fully parenthesized so any expression may be
- * passed.
- */
-#define	BP_SPAN(dnp, level) \
-	(((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
-	(level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
-
-/*
- * Traversal callback for dmu_sendbackup(); 'arg' is the struct
- * backuparg.  Translates one visited block into stream records:
- *
- *   hole in the meta-dnode (object 0)  -> FREEOBJECTS for the dnodes
- *                                         the hole spans
- *   hole in any other object           -> FREE for the bytes it spans
- *   level-0 block of dnodes            -> dump_dnode() for each dnode
- *   other level-0 block (not objset)   -> WRITE with the block's data,
- *                                         read through the ARC when the
- *                                         traversal supplied none
- *
- * Anything else is ignored.  Returns EINTR when a signal is pending or
- * a write fails, else 0.
- */
-static int
-backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
-	struct backuparg *ba = arg;
-	uint64_t object = bc->bc_bookmark.zb_object;
-	int level = bc->bc_bookmark.zb_level;
-	uint64_t blkid = bc->bc_bookmark.zb_blkid;
-	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
-	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
-	void *data = bc->bc_data;
-	int err = 0;
-
-	if (SIGPENDING(curthread))
-		return (EINTR);
-
-	ASSERT(data || bp == NULL);
-
-	if (bp == NULL) {
-		uint64_t span = BP_SPAN(bc->bc_dnode, level);
-
-		if (object == 0) {
-			uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
-
-			err = dump_freeobjects(ba, dnobj,
-			    span >> DNODE_SHIFT);
-		} else {
-			err = dump_free(ba, object, blkid * span, span);
-		}
-	} else if (data && level == 0 && type == DMU_OT_DNODE) {
-		dnode_phys_t *blk = data;
-		int blksz = BP_GET_LSIZE(bp);
-		int i;
-
-		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
-			uint64_t dnobj =
-			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
-
-			err = dump_dnode(ba, dnobj, &blk[i]);
-			if (err)
-				break;
-		}
-	} else if (level == 0 &&
-	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
-		int blksz = BP_GET_LSIZE(bp);
-
-		if (data != NULL) {
-			err = dump_data(ba, type, object, blkid * blksz,
-			    blksz, data);
-		} else {
-			uint32_t aflags = ARC_WAIT;
-			arc_buf_t *abuf;
-			zbookmark_t zb;
-
-			zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
-			zb.zb_object = object;
-			zb.zb_level = level;
-			zb.zb_blkid = blkid;
-			(void) arc_read(NULL, spa, bp,
-			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
-			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
-			    &aflags, &zb);
-
-			if (abuf) {
-				err = dump_data(ba, type, object,
-				    blkid * blksz, blksz, abuf->b_data);
-				(void) arc_buf_remove_ref(abuf, &abuf);
-			}
-		}
-	}
-
-	ASSERT(err == 0 || err == EINTR);
-	return (err);
-}
-
-/*
- * Write a backup stream of snapshot 'tosnap' to 'fp'; when 'fromsnap'
- * is given, only blocks born after fromsnap's creation txg are
- * traversed.
- *
- * The stream is a BEGIN record, the records produced by backup_cb()
- * during the traversal, and an END record carrying the checksum of
- * everything written before it.  Returns EINVAL if tosnap is not a
- * snapshot, EXDEV if fromsnap is not an earlier snapshot in the same
- * dir, or the write / traversal error.
- */
-int
-dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp)
-{
-	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
-	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
-	dmu_replay_record_t *drr;
-	struct drr_begin *begin;
-	struct backuparg ba;
-	int err;
-
-	/* tosnap must be a snapshot */
-	if (ds->ds_phys->ds_next_snap_obj == 0)
-		return (EINVAL);
-
-	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
-	if (fromds && (ds->ds_dir != fromds->ds_dir ||
-	    fromds->ds_phys->ds_creation_txg >=
-	    ds->ds_phys->ds_creation_txg))
-		return (EXDEV);
-
-	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-	drr->drr_type = DRR_BEGIN;
-	begin = &drr->drr_u.drr_begin;
-	begin->drr_magic = DMU_BACKUP_MAGIC;
-	begin->drr_version = DMU_BACKUP_VERSION;
-	begin->drr_creation_time = ds->ds_phys->ds_creation_time;
-	begin->drr_type = tosnap->os->os_phys->os_type;
-	begin->drr_toguid = ds->ds_phys->ds_guid;
-	if (fromds)
-		begin->drr_fromguid = fromds->ds_phys->ds_guid;
-	dsl_dataset_name(ds, begin->drr_toname);
-
-	ba.drr = drr;
-	ba.td = curthread;
-	ba.fp = fp;
-	ba.os = tosnap;
-	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
-
-	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
-		err = ba.err;
-		goto out;
-	}
-
-	err = traverse_dsl_dataset(ds,
-	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
-	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
-	    backup_cb, &ba);
-	if (err) {
-		/* prefer the underlying write error over the generic EINTR */
-		if (err == EINTR && ba.err)
-			err = ba.err;
-		goto out;
-	}
-
-	bzero(drr, sizeof (dmu_replay_record_t));
-	drr->drr_type = DRR_END;
-	drr->drr_u.drr_end.drr_checksum = ba.zc;
-
-	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
-		err = ba.err;
-
-out:
-	kmem_free(drr, sizeof (dmu_replay_record_t));
-	return (err);
-}
-
-struct restorearg { /* state for one dmu_recvbackup() run */
- int err; /* error recorded by restore_read() */
- int byteswap; /* nonzero when stream data must be byte-swapped */
- kthread_t *td; /* thread whose credentials are used for reads */
- struct file *fp; /* source of the stream */
- char *buf; /* read-ahead buffer */
- uint64_t voff; /* offset passed to the next read; advanced by bytes read */
- int buflen; /* number of valid bytes in buf */
- int bufoff; /* next offset to read */
- int bufsize; /* amount of memory allocated for buf */
- zio_cksum_t zc; /* running fletcher-4 checksum of bytes consumed */
-};
-
-/*
- * Check half of the sync task that begins an incremental receive.
- * arg1 is the target dsl_dataset_t, arg2 the stream's struct drr_begin.
- *
- * Returns ENODEV if the target has no previous snapshot or its guid is
- * not the stream's fromguid, ETXTBSY if the target was modified after
- * that snapshot, EEXIST if drr_toname has no '@' or the snapshot name
- * is already in use, a zap_lookup() error, or 0.
- */
-/* ARGSUSED */
-static int
-replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct drr_begin *drrb = arg2;
-	const char *at;
-	uint64_t val;
-
-	/*
-	 * must already be a snapshot of this fs, and the most recent
-	 * snapshot must match fromguid
-	 */
-	if (ds->ds_phys->ds_prev_snap_obj == 0 ||
-	    ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
-		return (ENODEV);
-
-	/* must not have any changes since most recent snapshot */
-	if (ds->ds_phys->ds_bp.blk_birth >
-	    ds->ds_prev->ds_phys->ds_creation_txg)
-		return (ETXTBSY);
-
-	/* new snapshot name must not exist */
-	at = strrchr(drrb->drr_toname, '@');
-	if (at == NULL)
-		return (EEXIST);
-
-	switch (zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, at + 1, 8, 1, &val)) {
-	case 0:
-		return (EEXIST);
-	case ENOENT:
-		return (0);
-	default:
-		break;
-	}
-
-	/*
-	 * Re-running the lookup would be wrong, so the error value is
-	 * recovered below from a second, identical classification.
-	 */
-	return (zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, at + 1, 8, 1, &val));
-}
-
-/*
- * Sync half of the sync task that begins an incremental receive:
- * dirty the dataset (arg1) and flag it DS_FLAG_INCONSISTENT.
- */
-/* ARGSUSED */
-static void
-replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *target = arg1;
-
-	dmu_buf_will_dirty(target->ds_dbuf, tx);
-	target->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-}
-
-/*
- * Check half of the sync task that begins a full receive.
- * arg1 is the parent dsl_dir_t, arg2 the stream's struct drr_begin.
- *
- * The last '/'-separated component of drr_toname, with the '@snapshot'
- * suffix cut off, must not already exist as a child of the parent.
- * Returns EEXIST if it does, the zap_lookup() error if the lookup
- * fails for any reason other than ENOENT, and EINVAL when drr_toname
- * has no '@' or no '/' before it.  The name comes from the stream
- * header, so its shape is validated here rather than dereferencing a
- * NULL strchr()/strrchr() result.
- *
- * drr_toname is modified temporarily but restored before returning.
- */
-/* ARGSUSED */
-static int
-replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dir_t *dd = arg1;
-	struct drr_begin *drrb = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	char *cp;
-	char *slash;
-	uint64_t val;
-	int err;
-
-	cp = strchr(drrb->drr_toname, '@');
-	if (cp == NULL)
-		return (EINVAL);
-
-	/* cut off the snapshot part while the fs name is examined */
-	*cp = '\0';
-	slash = strrchr(drrb->drr_toname, '/');
-	if (slash == NULL) {
-		*cp = '@';
-		return (EINVAL);
-	}
-	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-	    slash + 1, sizeof (uint64_t), 1, &val);
-	*cp = '@';
-
-	if (err != ENOENT)
-		return (err ? err : EEXIST);
-
-	return (0);
-}
-
-/*
- * Sync half of the sync task that begins a full receive.
- * arg1 is the parent dsl_dir_t, arg2 the stream's struct drr_begin.
- *
- * Creates a dataset named after the last component of drr_toname
- * (without the '@snapshot' suffix), gives it an empty objset of the
- * stream's type, and flags it DS_FLAG_INCONSISTENT.  drr_toname is
- * modified temporarily but restored.
- */
-static void
-replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dir_t *parent = arg1;
-	struct drr_begin *drrb = arg2;
-	dsl_dataset_t *newds;
-	uint64_t newobj;
-	char *at;
-
-	at = strchr(drrb->drr_toname, '@');
-	*at = '\0';
-	newobj = dsl_dataset_create_sync(parent,
-	    strrchr(drrb->drr_toname, '/') + 1, NULL, tx);
-	*at = '@';
-
-	VERIFY(0 == dsl_dataset_open_obj(parent->dd_pool, newobj, NULL,
-	    DS_MODE_EXCLUSIVE, FTAG, &newds));
-
-	(void) dmu_objset_create_impl(dsl_dataset_get_spa(newds),
-	    newds, &newds->ds_phys->ds_bp, drrb->drr_type, tx);
-
-	dmu_buf_will_dirty(newds->ds_dbuf, tx);
-	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
-	dsl_dataset_close(newds, DS_MODE_EXCLUSIVE, FTAG);
-}
-
-/*
- * Check half of the sync task that finishes a receive.
- * arg1 is the received objset_t, arg2 the stream's struct drr_begin.
- *
- * Returns EINVAL if drr_toname has no '@'; otherwise defers to
- * dsl_dataset_snapshot_check() with the part after the '@'.
- */
-static int
-replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	objset_t *os = arg1;
-	struct drr_begin *drrb = arg2;
-	char *at;
-
-	/* XXX verify that drr_toname is in dd */
-
-	at = strchr(drrb->drr_toname, '@');
-	if (at == NULL)
-		return (EINVAL);
-
-	return (dsl_dataset_snapshot_check(os, at + 1, tx));
-}
-
-/*
- * Sync half of the sync task that finishes a receive.
- * arg1 is the received objset_t, arg2 the stream's struct drr_begin.
- *
- * Takes the snapshot named after the '@' in drr_toname, copies the
- * stream's creation time and guid onto it, and clears
- * DS_FLAG_INCONSISTENT on both the new snapshot and the head dataset.
- */
-static void
-replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	objset_t *os = arg1;
-	struct drr_begin *drrb = arg2;
-	dsl_dataset_t *head = os->os->os_dsl_dataset;
-	dsl_dataset_t *snap;
-	char *snapname;
-
-	snapname = strchr(drrb->drr_toname, '@') + 1;
-
-	dsl_dataset_snapshot_sync(os, snapname, tx);
-
-	/*
-	 * set snapshot's creation time and guid; after the call above
-	 * the head's previous snapshot is the one just taken
-	 */
-	VERIFY(0 == dsl_dataset_open_obj(head->ds_dir->dd_pool,
-	    head->ds_phys->ds_prev_snap_obj, NULL,
-	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
-	    FTAG, &snap));
-
-	dmu_buf_will_dirty(snap->ds_dbuf, tx);
-	snap->ds_phys->ds_creation_time = drrb->drr_creation_time;
-	snap->ds_phys->ds_guid = drrb->drr_toguid;
-	snap->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-
-	dsl_dataset_close(snap, DS_MODE_PRIMARY, FTAG);
-
-	dmu_buf_will_dirty(head->ds_dbuf, tx);
-	head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
-}
-
-/*
- * Read up to 'len' bytes from the stream at offset 'off' into 'buf'.
- *
- * *resid receives the number of bytes that were not read.  Returns the
- * result of the read; outside the kernel nothing is read and
- * EOPNOTSUPP is returned.
- */
-static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, int *resid)
-{
-	struct iovec iov;
-	struct uio ruio;
-	int error;
-
-	/* a single kernel-space segment covering the whole buffer */
-	iov.iov_base = buf;
-	iov.iov_len = len;
-
-	ruio.uio_iov = &iov;
-	ruio.uio_iovcnt = 1;
-	ruio.uio_resid = len;
-	ruio.uio_offset = off;
-	ruio.uio_segflg = UIO_SYSSPACE;
-	ruio.uio_rw = UIO_READ;
-	ruio.uio_td = ra->td;
-#ifdef _KERNEL
-	error = fo_read(ra->fp, &ruio, ra->td->td_ucred, FOF_OFFSET, ra->td);
-#else
-	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
-	error = EOPNOTSUPP;
-#endif
-	*resid = ruio.uio_resid;
-	return (error);
-}
-
-/*
- * Return a pointer to the next 'len' bytes of the stream, refilling
- * ra->buf as needed, and fold those bytes into the running checksum.
- *
- * 'len' must be a multiple of 8 (asserted).  The returned pointer
- * refers into ra->buf and is only valid until the next call.  Returns
- * NULL with ra->err set when a read fails or makes no progress (the
- * latter is reported as EINVAL).
- */
-static void *
-restore_read(struct restorearg *ra, int len)
-{
-	void *rv;
-
-	/* some things will require 8-byte alignment, so everything must */
-	ASSERT3U(len % 8, ==, 0);
-
-	while (ra->buflen - ra->bufoff < len) {
-		int leftover = ra->buflen - ra->bufoff;
-		int space = ra->bufsize - leftover;
-		int resid;
-
-		/* slide the unconsumed bytes to the front, then top up */
-		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
-
-		ra->err = restore_bytes(ra, (caddr_t)ra->buf + leftover,
-		    space, ra->voff, &resid);
-
-		ra->voff += space - resid;
-		ra->buflen = ra->bufsize - resid;
-		ra->bufoff = 0;
-
-		/* nothing was read at all */
-		if (resid == space)
-			ra->err = EINVAL;
-		if (ra->err)
-			return (NULL);
-		/* Could compute checksum here? */
-	}
-
-	ASSERT3U(ra->bufoff % 8, ==, 0);
-	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
-
-	rv = ra->buf + ra->bufoff;
-	ra->bufoff += len;
-
-	if (ra->byteswap)
-		fletcher_4_incremental_byteswap(rv, len, &ra->zc);
-	else
-		fletcher_4_incremental_native(rv, len, &ra->zc);
-	return (rv);
-}
-
-static void
-backup_byteswap(dmu_replay_record_t *drr)
-{ /* byte-swap, in place, the record type and the integer fields listed below for that type */
-#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
-#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
- drr->drr_type = BSWAP_32(drr->drr_type); /* swap the type first: the switch below uses the swapped value */
- switch (drr->drr_type) {
- case DRR_BEGIN:
- DO64(drr_begin.drr_magic);
- DO64(drr_begin.drr_version);
- DO64(drr_begin.drr_creation_time);
- DO32(drr_begin.drr_type);
- DO64(drr_begin.drr_toguid);
- DO64(drr_begin.drr_fromguid);
- break;
- case DRR_OBJECT:
- DO64(drr_object.drr_object);
- /* DO64(drr_object.drr_allocation_txg); */
- DO32(drr_object.drr_type);
- DO32(drr_object.drr_bonustype);
- DO32(drr_object.drr_blksz);
- DO32(drr_object.drr_bonuslen);
- break;
- case DRR_FREEOBJECTS:
- DO64(drr_freeobjects.drr_firstobj);
- DO64(drr_freeobjects.drr_numobjs);
- break;
- case DRR_WRITE:
- DO64(drr_write.drr_object);
- DO32(drr_write.drr_type);
- DO64(drr_write.drr_offset);
- DO64(drr_write.drr_length);
- break;
- case DRR_FREE:
- DO64(drr_free.drr_object);
- DO64(drr_free.drr_offset);
- DO64(drr_free.drr_length);
- break;
- case DRR_END:
- DO64(drr_end.drr_checksum.zc_word[0]);
- DO64(drr_end.drr_checksum.zc_word[1]);
- DO64(drr_end.drr_checksum.zc_word[2]);
- DO64(drr_end.drr_checksum.zc_word[3]);
- break;
- } /* any other record type: only drr_type itself has been swapped */
-#undef DO64
-#undef DO32
-}
-
-/*
- * Apply one DRR_OBJECT record: make object drro->drr_object exist in
- * 'os' with the recorded type, block size, bonus type/length, checksum
- * and compression, then fill its bonus buffer from the stream.
- *
- * A currently free object is claimed; an allocated one is reclaimed.
- * Returns EINVAL for out-of-range record fields or a failed
- * claim/reclaim, the dmu_tx_assign() error, or ra->err when the bonus
- * data cannot be read.
- *
- * The bonus buffer hold is released on every path, including the one
- * where reading the bonus data from the stream fails.
- */
-static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
-{
-	boolean_t is_new;
-	uint64_t holdobj;
-	dmu_tx_t *tx;
-	int err;
-
-	err = dmu_object_info(os, drro->drr_object, NULL);
-
-	if (err != 0 && err != ENOENT)
-		return (EINVAL);
-	is_new = (err == ENOENT);
-
-	if (drro->drr_type == DMU_OT_NONE ||
-	    drro->drr_type >= DMU_OT_NUMTYPES ||
-	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
-	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
-	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
-	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
-	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
-	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
-		return (EINVAL);
-	}
-
-	/*
-	 * Hold the bonus buffer and the first byte of data.  The write
-	 * hold is needed because the block size may change.  A new
-	 * object is held as DMU_NEW_OBJECT.
-	 */
-	holdobj = is_new ? DMU_NEW_OBJECT : drro->drr_object;
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_bonus(tx, holdobj);
-	dmu_tx_hold_write(tx, holdobj, 0, 1);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
-		return (err);
-	}
-
-	if (is_new) {
-		/* currently free, want to be allocated */
-		err = dmu_object_claim(os, drro->drr_object,
-		    drro->drr_type, drro->drr_blksz,
-		    drro->drr_bonustype, drro->drr_bonuslen, tx);
-	} else {
-		/* currently allocated, want to be allocated */
-		err = dmu_object_reclaim(os, drro->drr_object,
-		    drro->drr_type, drro->drr_blksz,
-		    drro->drr_bonustype, drro->drr_bonuslen, tx);
-	}
-	if (err) {
-		dmu_tx_commit(tx);
-		return (EINVAL);
-	}
-
-	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
-	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
-
-	if (drro->drr_bonuslen) {
-		dmu_buf_t *db;
-		void *data;
-
-		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
-		dmu_buf_will_dirty(db, tx);
-
-		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
-		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
-		if (data == NULL) {
-			/* drop the hold taken above before bailing out */
-			dmu_buf_rele(db, FTAG);
-			dmu_tx_commit(tx);
-			return (ra->err);
-		}
-		bcopy(data, db->db_data, db->db_size);
-		if (ra->byteswap) {
-			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
-			    drro->drr_bonuslen);
-		}
-		dmu_buf_rele(db, FTAG);
-	}
-	dmu_tx_commit(tx);
-	return (0);
-}
-
-/*
- * Apply one DRR_FREEOBJECTS record: free every allocated object in
- * [drr_firstobj, drr_firstobj + drr_numobjs).
- *
- * Returns EINVAL if the range wraps around or a free fails with
- * anything other than ENOENT, or the dmu_tx_assign() error.
- */
-/* ARGSUSED */
-static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
-    struct drr_freeobjects *drrfo)
-{
-	uint64_t obj;
-
-	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
-		return (EINVAL);
-
-	obj = drrfo->drr_firstobj;
-	while (obj < drrfo->drr_firstobj + drrfo->drr_numobjs) {
-		if (dmu_object_info(os, obj, NULL) == 0) {
-			dmu_tx_t *tx;
-			int err;
-
-			tx = dmu_tx_create(os);
-			dmu_tx_hold_bonus(tx, obj);
-			err = dmu_tx_assign(tx, TXG_WAIT);
-			if (err) {
-				dmu_tx_abort(tx);
-				return (err);
-			}
-			err = dmu_object_free(os, obj, tx);
-			dmu_tx_commit(tx);
-			if (err && err != ENOENT)
-				return (EINVAL);
-		}
-
-		/* step to the next object, whether or not this one existed */
-		(void) dmu_object_next(os, &obj, FALSE, 0);
-	}
-	return (0);
-}
-
-/*
- * Apply one DRR_WRITE record: read drr_length bytes of payload from
- * the stream and write them to drr_object at drr_offset.
- *
- * drr_length is a 64-bit value taken from the stream, while
- * restore_read() takes an int.  A length that does not fit the receive
- * buffer is therefore rejected up front instead of being truncated on
- * the way into restore_read() and then used untruncated by
- * dmu_write().  A length that is not a multiple of 8 is rejected as
- * well, since restore_read() requires that of every request.
- *
- * Returns EINVAL for a malformed record or an object that does not
- * exist, ra->err when the payload cannot be read, or the
- * dmu_tx_assign() error.
- */
-static int
-restore_write(struct restorearg *ra, objset_t *os,
-    struct drr_write *drrw)
-{
-	dmu_tx_t *tx;
-	void *data;
-	int err;
-
-	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
-	    drrw->drr_type >= DMU_OT_NUMTYPES)
-		return (EINVAL);
-
-	/* must fit the read-ahead buffer and keep the stream 8-byte aligned */
-	if (drrw->drr_length > (uint64_t)ra->bufsize ||
-	    P2PHASE(drrw->drr_length, 8) != 0)
-		return (EINVAL);
-
-	data = restore_read(ra, (int)drrw->drr_length);
-	if (data == NULL)
-		return (ra->err);
-
-	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
-		return (EINVAL);
-
-	tx = dmu_tx_create(os);
-
-	dmu_tx_hold_write(tx, drrw->drr_object,
-	    drrw->drr_offset, drrw->drr_length);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
-		return (err);
-	}
-	if (ra->byteswap)
-		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
-	dmu_write(os, drrw->drr_object,
-	    drrw->drr_offset, drrw->drr_length, data, tx);
-	dmu_tx_commit(tx);
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-restore_free(struct restorearg *ra, objset_t *os,
- struct drr_free *drrf)
-{
- dmu_tx_t *tx;
- int err;
-
- if (drrf->drr_length != -1ULL &&
- drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
- return (EINVAL);
-
- if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
- return (EINVAL);
-
- tx = dmu_tx_create(os);
-
- dmu_tx_hold_free(tx, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_free_range(os, drrf->drr_object,
- drrf->drr_offset, drrf->drr_length, tx);
- dmu_tx_commit(tx);
- return (err);
-}
-
-int
-dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct file *fp, uint64_t voffset)
-{
- kthread_t *td = curthread;
- struct restorearg ra;
- dmu_replay_record_t *drr;
- char *cp;
- objset_t *os = NULL;
- zio_cksum_t pzc;
-
- bzero(&ra, sizeof (ra));
- ra.td = td;
- ra.fp = fp;
- ra.voff = voffset;
- ra.bufsize = 1<<20;
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
-
- if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
- ra.byteswap = FALSE;
- } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
- ra.byteswap = TRUE;
- } else {
- ra.err = EINVAL;
- goto out;
- }
-
- /*
- * NB: this assumes that struct drr_begin will be the largest in
- * dmu_replay_record_t's drr_u, and thus we don't need to pad it
- * with zeros to make it the same length as we wrote out.
- */
- ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
- ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
- ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
- if (ra.byteswap) {
- fletcher_4_incremental_byteswap(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
- } else {
- fletcher_4_incremental_native(ra.buf,
- sizeof (dmu_replay_record_t), &ra.zc);
- }
- (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
-
- if (ra.byteswap) {
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_version = BSWAP_64(drrb->drr_version);
- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
- }
-
- ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
-
- if (drrb->drr_version != DMU_BACKUP_VERSION ||
- drrb->drr_type >= DMU_OST_NUMTYPES ||
- strchr(drrb->drr_toname, '@') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- /*
- * Process the begin in syncing context.
- */
- if (drrb->drr_fromguid) {
- /* incremental backup */
- dsl_dataset_t *ds = NULL;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
- *cp = '@';
- if (ra.err)
- goto out;
-
- /*
- * Only do the rollback if the most recent snapshot
- * matches the incremental source
- */
- if (force) {
- if (ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_guid !=
- drrb->drr_fromguid) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(ra.buf, ra.bufsize);
- return (ENODEV);
- }
- (void) dsl_dataset_rollback(ds);
- }
- ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- replay_incremental_check, replay_incremental_sync,
- ds, drrb, 1);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full backup */
- dsl_dir_t *dd = NULL;
- const char *tail;
-
- /* can't restore full backup into topmost fs, for now */
- if (strrchr(drrb->drr_toname, '/') == NULL) {
- ra.err = EINVAL;
- goto out;
- }
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
- *cp = '@';
- if (ra.err)
- goto out;
- if (tail == NULL) {
- ra.err = EEXIST;
- goto out;
- }
-
- ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
- replay_full_sync, dd, drrb, 5);
- dsl_dir_close(dd, FTAG);
- }
- if (ra.err)
- goto out;
-
- /*
- * Open the objset we are modifying.
- */
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
- DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
- *cp = '@';
- ASSERT3U(ra.err, ==, 0);
-
- /*
- * Read records and process them.
- */
- pzc = ra.zc;
- while (ra.err == 0 &&
- NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (SIGPENDING(td)) {
- ra.err = EINTR;
- goto out;
- }
-
- if (ra.byteswap)
- backup_byteswap(drr);
-
- switch (drr->drr_type) {
- case DRR_OBJECT:
- {
- /*
- * We need to make a copy of the record header,
- * because restore_{object,write} may need to
- * restore_read(), which will invalidate drr.
- */
- struct drr_object drro = drr->drr_u.drr_object;
- ra.err = restore_object(&ra, os, &drro);
- break;
- }
- case DRR_FREEOBJECTS:
- {
- struct drr_freeobjects drrfo =
- drr->drr_u.drr_freeobjects;
- ra.err = restore_freeobjects(&ra, os, &drrfo);
- break;
- }
- case DRR_WRITE:
- {
- struct drr_write drrw = drr->drr_u.drr_write;
- ra.err = restore_write(&ra, os, &drrw);
- break;
- }
- case DRR_FREE:
- {
- struct drr_free drrf = drr->drr_u.drr_free;
- ra.err = restore_free(&ra, os, &drrf);
- break;
- }
- case DRR_END:
- {
- struct drr_end drre = drr->drr_u.drr_end;
- /*
- * We compare against the *previous* checksum
- * value, because the stored checksum is of
- * everything before the DRR_END record.
- */
- if (drre.drr_checksum.zc_word[0] != 0 &&
- !ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pzc)) {
- ra.err = ECKSUM;
- goto out;
- }
-
- ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
- ds_dir->dd_pool, replay_end_check, replay_end_sync,
- os, drrb, 3);
- goto out;
- }
- default:
- ra.err = EINVAL;
- goto out;
- }
- pzc = ra.zc;
- }
-
-out:
- if (os)
- dmu_objset_close(os);
-
- /*
- * Make sure we don't rollback/destroy unless we actually
- * processed the begin properly. 'os' will only be set if this
- * is the case.
- */
- if (ra.err && os && tosnap && strchr(tosnap, '@')) {
- /*
- * rollback or destroy what we created, so we don't
- * leave it in the restoring state.
- */
- dsl_dataset_t *ds;
- int err;
-
- cp = strchr(tosnap, '@');
- *cp = '\0';
- err = dsl_dataset_open(tosnap,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err == 0) {
- txg_wait_synced(ds->ds_dir->dd_pool, 0);
- if (drrb->drr_fromguid) {
- /* incremental: rollback to most recent snap */
- (void) dsl_dataset_rollback(ds);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- } else {
- /* full: destroy whole fs */
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- (void) dsl_dataset_destroy(tosnap);
- }
- }
- *cp = '@';
- }
-
- kmem_free(ra.buf, ra.bufsize);
- if (sizep)
- *sizep = ra.voff;
- return (ra.err);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
deleted file mode 100644
index 3d2bc3e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_pool.h>
-#include <sys/dnode.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_impl.h>
-
-#define BP_SPAN_SHIFT(level, width) ((level) * (width))
-
-#define BP_EQUAL(b1, b2) \
- (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \
- (b1)->blk_birth == (b2)->blk_birth)
-
-/*
- * Compare two bookmarks.
- *
- * For ADVANCE_PRE, the visitation order is:
- *
- * objset 0, 1, 2, ..., ZB_MAXOBJSET.
- * object 0, 1, 2, ..., ZB_MAXOBJECT.
- * blkoff 0, 1, 2, ...
- * level ZB_MAXLEVEL, ..., 2, 1, 0.
- *
- * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
- * ordering vector is:
- *
- * < objset, object, blkoff, -level >
- *
- * For ADVANCE_POST, the starting offsets aren't sequential but ending
- * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
- * The visitation order is:
- *
- * objset 1, 2, ..., ZB_MAXOBJSET, 0.
- * object 1, 2, ..., ZB_MAXOBJECT, 0.
- * blkoff 1, 2, ...
- * level 0, 1, 2, ..., ZB_MAXLEVEL.
- *
- * and thus a valid ordering vector is:
- *
- * < objset - 1, object - 1, blkoff, level >
- *
- * Both orderings can be expressed as:
- *
- * < objset + bias, object + bias, blkoff, level ^ bias >
- *
- * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
- * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
- *
- * Special case: an objset's osphys is represented as level -1 of object 0.
- * It is always either the very first or very last block we visit in an objset.
- * Therefore, if either bookmark's level is -1, level alone determines order.
- */
-static int
-compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
- int advance)
-{
- int bias = (advance & ADVANCE_PRE) ? 0 : -1;
- uint64_t sblkoff, eblkoff;
- int slevel, elevel, wshift;
-
- if (szb->zb_objset + bias < ezb->zb_objset + bias)
- return (-1);
-
- if (szb->zb_objset + bias > ezb->zb_objset + bias)
- return (1);
-
- slevel = szb->zb_level;
- elevel = ezb->zb_level;
-
- if ((slevel | elevel) < 0)
- return ((slevel ^ bias) - (elevel ^ bias));
-
- if (szb->zb_object + bias < ezb->zb_object + bias)
- return (-1);
-
- if (szb->zb_object + bias > ezb->zb_object + bias)
- return (1);
-
- if (dnp == NULL)
- return (0);
-
- wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
- eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
-
- if (sblkoff < eblkoff)
- return (-1);
-
- if (sblkoff > eblkoff)
- return (1);
-
- return ((elevel ^ bias) - (slevel ^ bias));
-}
-
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-#define SET_BOOKMARK_LB(zb, level, blkid) \
-{ \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
-static int
-advance_objset(zseg_t *zseg, uint64_t objset, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- if (advance & ADVANCE_PRE) {
- if (objset >= ZB_MAXOBJSET)
- return (ERANGE);
- SET_BOOKMARK(zb, objset, 0, -1, 0);
- } else {
- if (objset >= ZB_MAXOBJSET)
- objset = 0;
- SET_BOOKMARK(zb, objset, 1, 0, 0);
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_object(zseg_t *zseg, uint64_t object, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- if (advance & ADVANCE_PRE) {
- if (object >= ZB_MAXOBJECT) {
- SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0);
- } else {
- SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0);
- }
- } else {
- if (zb->zb_object == 0) {
- SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0);
- } else {
- if (object >= ZB_MAXOBJECT)
- object = 0;
- SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0);
- }
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_from_osphys(zseg_t *zseg, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
-
- ASSERT(zb->zb_object == 0);
- ASSERT(zb->zb_level == -1);
- ASSERT(zb->zb_blkid == 0);
-
- if (advance & ADVANCE_PRE) {
- SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0);
- } else {
- if (zb->zb_objset == 0)
- return (ERANGE);
- SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0);
- }
-
- if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
-{
- zbookmark_t *zb = &zseg->seg_start;
- int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
- int maxlevel = dnp->dn_nlevels - 1;
- int level = zb->zb_level;
- uint64_t blkid = zb->zb_blkid;
-
- if (advance & ADVANCE_PRE) {
- if (level > 0 && rc == 0) {
- level--;
- blkid <<= wshift;
- } else {
- blkid++;
-
- if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
- dnp->dn_maxblkid)
- return (ERANGE);
-
- while (level < maxlevel) {
- if (P2PHASE(blkid, 1ULL << wshift))
- break;
- blkid >>= wshift;
- level++;
- }
- }
- } else {
- if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
- blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
- level = 0;
- } else {
- blkid >>= wshift;
- level++;
- }
-
- while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
- dnp->dn_maxblkid) {
- if (level == maxlevel)
- return (ERANGE);
- blkid >>= wshift;
- level++;
- }
- }
- SET_BOOKMARK_LB(zb, level, blkid);
-
- if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
- return (ERANGE);
-
- return (EAGAIN);
-}
-
-static int
-traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
-{
- /*
- * Before we issue the callback, prune against maxtxg.
- *
- * We prune against mintxg before we get here because it's a big win.
- * If a given block was born in txg 37, then we know that the entire
- * subtree below that block must have been born in txg 37 or earlier.
- * We can therefore lop off huge branches of the tree as we go.
- *
- * There's no corresponding optimization for maxtxg because knowing
- * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's
- * children. In fact, the copy-on-write design of ZFS ensures that
- * top-level blocks will pretty much always be new.
- *
- * Therefore, in the name of simplicity we don't prune against
- * maxtxg until the last possible moment -- that being right now.
- */
- if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
- return (0);
-
- /*
- * Debugging: verify that the order we visit things agrees with the
- * order defined by compare_bookmark(). We don't check this for
- * log blocks because there's no defined ordering for them; they're
- * always visited (or not) as part of visiting the objset_phys_t.
- */
- if (bc->bc_errno == 0 && bc != &th->th_zil_cache) {
- zbookmark_t *zb = &bc->bc_bookmark;
- zbookmark_t *szb = &zseg->seg_start;
- zbookmark_t *ezb = &zseg->seg_end;
- zbookmark_t *lzb = &th->th_lastcb;
- dnode_phys_t *dnp = bc->bc_dnode;
-
- ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0);
- ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0);
- ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 ||
- lzb->zb_level == ZB_NO_LEVEL);
- *lzb = *zb;
- }
-
- th->th_callbacks++;
- return (th->th_func(bc, th->th_spa, th->th_arg));
-}
-
-static int
-traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
- dnode_phys_t *dnp)
-{
- zbookmark_t *zb = &bc->bc_bookmark;
- int error;
-
- th->th_hits++;
-
- bc->bc_dnode = dnp;
- bc->bc_errno = 0;
-
- if (BP_EQUAL(&bc->bc_blkptr, bp))
- return (0);
-
- bc->bc_blkptr = *bp;
-
- if (bc->bc_data == NULL)
- return (0);
-
- if (BP_IS_HOLE(bp)) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- return (0);
- }
-
- if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
- error = EIO;
- } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
- error = 0;
- th->th_arc_hits++;
- } else {
- error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
- BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
-
- if (BP_SHOULD_BYTESWAP(bp) && error == 0)
- (zb->zb_level > 0 ? byteswap_uint64_array :
- dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
- BP_GET_LSIZE(bp));
- th->th_reads++;
- }
-
- if (error) {
- bc->bc_errno = error;
- error = traverse_callback(th, NULL, bc);
- ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
- bc->bc_blkptr.blk_birth = -1ULL;
- }
-
- dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
- bc - &th->th_cache[0][0], error,
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
- return (error);
-}
-
-static int
-find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
-{
- zbookmark_t *zb = &zseg->seg_start;
- traverse_blk_cache_t *bc;
- blkptr_t *bp = dnp->dn_blkptr;
- int i, first, level;
- int nbp = dnp->dn_nblkptr;
- int minlevel = zb->zb_level;
- int maxlevel = dnp->dn_nlevels - 1;
- int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
- int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
- uint64_t blkid = zb->zb_blkid >> bp_shift;
- int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
- int rc;
-
- if (minlevel > maxlevel || blkid >= nbp)
- return (ERANGE);
-
- for (level = maxlevel; level >= minlevel; level--) {
- first = P2PHASE(blkid, 1ULL << wshift);
-
- for (i = first; i < nbp; i++)
- if (bp[i].blk_birth > zseg->seg_mintxg ||
- BP_IS_HOLE(&bp[i]) && do_holes)
- break;
-
- if (i != first) {
- i--;
- SET_BOOKMARK_LB(zb, level, blkid + (i - first));
- return (ENOTBLK);
- }
-
- bc = &th->th_cache[depth][level];
-
- SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
- level, blkid);
-
- if (rc = traverse_read(th, bc, bp + i, dnp)) {
- if (rc != EAGAIN) {
- SET_BOOKMARK_LB(zb, level, blkid);
- }
- return (rc);
- }
-
- if (BP_IS_HOLE(&bp[i])) {
- SET_BOOKMARK_LB(zb, level, blkid);
- th->th_lastcb.zb_level = ZB_NO_LEVEL;
- return (0);
- }
-
- nbp = 1 << wshift;
- bp = bc->bc_data;
- bp_shift -= wshift;
- blkid = zb->zb_blkid >> bp_shift;
- }
-
- return (0);
-}
-
-static int
-get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
- uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
-{
- zseg_t zseg;
- zbookmark_t *zb = &zseg.seg_start;
- uint64_t object = *objectp;
- int i, rc;
-
- SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
- SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
-
- zseg.seg_mintxg = txg;
- zseg.seg_maxtxg = -1ULL;
-
- for (;;) {
- rc = find_block(th, &zseg, mdn, depth);
-
- if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
- break;
-
- if (rc == 0 && zb->zb_level == 0) {
- dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
- for (i = 0; i < DNODES_PER_BLOCK; i++) {
- object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
- if (object >= *objectp &&
- dnp[i].dn_type != DMU_OT_NONE &&
- (type == -1 || dnp[i].dn_type == type)) {
- *objectp = object;
- *dnpp = &dnp[i];
- return (0);
- }
- }
- }
-
- rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
-
- if (rc == ERANGE)
- break;
- }
-
- if (rc == ERANGE)
- *objectp = ZB_MAXOBJECT;
-
- return (rc);
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- traverse_handle_t *th = arg;
- traverse_blk_cache_t *bc = &th->th_zil_cache;
- zbookmark_t *zb = &bc->bc_bookmark;
- zseg_t *zseg = list_head(&th->th_seglist);
-
- if (bp->blk_birth <= zseg->seg_mintxg)
- return;
-
- if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) {
- zb->zb_object = 0;
- zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- bc->bc_blkptr = *bp;
- (void) traverse_callback(th, zseg, bc);
- }
-}
-
-/* ARGSUSED */
-static void
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
- traverse_handle_t *th = arg;
- traverse_blk_cache_t *bc = &th->th_zil_cache;
- zbookmark_t *zb = &bc->bc_bookmark;
- zseg_t *zseg = list_head(&th->th_seglist);
-
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
-
- if (bp->blk_birth <= zseg->seg_mintxg)
- return;
-
- if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
- zb->zb_object = lr->lr_foid;
- zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- bc->bc_blkptr = *bp;
- (void) traverse_callback(th, zseg, bc);
- }
- }
-}
-
-static void
-traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc)
-{
- spa_t *spa = th->th_spa;
- dsl_pool_t *dp = spa_get_dsl(spa);
- objset_phys_t *osphys = bc->bc_data;
- zil_header_t *zh = &osphys->os_zil_header;
- uint64_t claim_txg = zh->zh_claim_txg;
- zilog_t *zilog;
-
- ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
- ASSERT(bc->bc_bookmark.zb_level == -1);
-
- /*
- * We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
- */
- if (claim_txg == 0 && (spa_mode & FWRITE))
- return;
-
- th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
-
- zilog = zil_alloc(dp->dp_meta_objset, zh);
-
- (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th,
- claim_txg);
-
- zil_free(zilog);
-}
-
-static int
-traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
-{
- zbookmark_t *zb = &zseg->seg_start;
- traverse_blk_cache_t *bc;
- dnode_phys_t *dn, *dn_tmp;
- int worklimit = 100;
- int rc;
-
- dprintf("<%llu, %llu, %d, %llx>\n",
- zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
-
- bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
- dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
- SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
-
- rc = traverse_read(th, bc, mosbp, dn);
-
- if (rc) /* If we get ERESTART, we've got nowhere left to go */
- return (rc == ERESTART ? EINTR : rc);
-
- ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
-
- if (zb->zb_objset != 0) {
- uint64_t objset = zb->zb_objset;
- dsl_dataset_phys_t *dsp;
-
- rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
- DMU_OT_DSL_DATASET, ZB_MOS_CACHE);
-
- if (objset != zb->zb_objset)
- rc = advance_objset(zseg, objset, th->th_advance);
-
- if (rc != 0)
- return (rc);
-
- dsp = DN_BONUS(dn_tmp);
-
- bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
- dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
-
- SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
-
- /*
- * If we're traversing an open snapshot, we know that it
- * can't be deleted (because it's open) and it can't change
- * (because it's a snapshot). Therefore, once we've gotten
- * from the uberblock down to the snapshot's objset_phys_t,
- * we no longer need to synchronize with spa_sync(); we're
- * traversing a completely static block tree from here on.
- */
- if (th->th_advance & ADVANCE_NOLOCK) {
- ASSERT(th->th_locked);
- rw_exit(spa_traverse_rwlock(th->th_spa));
- th->th_locked = 0;
- }
-
- rc = traverse_read(th, bc, &dsp->ds_bp, dn);
-
- if (rc != 0) {
- if (rc == ERESTART)
- rc = advance_objset(zseg, zb->zb_objset + 1,
- th->th_advance);
- return (rc);
- }
-
- if (th->th_advance & ADVANCE_PRUNE)
- zseg->seg_mintxg =
- MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
- }
-
- if (zb->zb_level == -1) {
- ASSERT(zb->zb_object == 0);
- ASSERT(zb->zb_blkid == 0);
- ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
-
- if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
- rc = traverse_callback(th, zseg, bc);
- if (rc) {
- ASSERT(rc == EINTR);
- return (rc);
- }
- if ((th->th_advance & ADVANCE_ZIL) &&
- zb->zb_objset != 0)
- traverse_zil(th, bc);
- }
-
- return (advance_from_osphys(zseg, th->th_advance));
- }
-
- if (zb->zb_object != 0) {
- uint64_t object = zb->zb_object;
-
- rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
- zseg->seg_mintxg, -1, ZB_MDN_CACHE);
-
- if (object != zb->zb_object)
- rc = advance_object(zseg, object, th->th_advance);
-
- if (rc != 0)
- return (rc);
-
- dn = dn_tmp;
- }
-
- if (zb->zb_level == ZB_MAXLEVEL)
- zb->zb_level = dn->dn_nlevels - 1;
-
- for (;;) {
- rc = find_block(th, zseg, dn, ZB_DN_CACHE);
-
- if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
- break;
-
- if (rc == 0) {
- bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
- ASSERT(bc->bc_dnode == dn);
- ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
- rc = traverse_callback(th, zseg, bc);
- if (rc) {
- ASSERT(rc == EINTR);
- return (rc);
- }
- if (BP_IS_HOLE(&bc->bc_blkptr)) {
- ASSERT(th->th_advance & ADVANCE_HOLES);
- rc = ENOTBLK;
- }
- }
-
- rc = advance_block(zseg, dn, rc, th->th_advance);
-
- if (rc == ERANGE)
- break;
-
- /*
- * Give spa_sync() a chance to run.
- */
- if (th->th_locked && spa_traverse_wanted(th->th_spa)) {
- th->th_syncs++;
- return (EAGAIN);
- }
-
- if (--worklimit == 0)
- return (EAGAIN);
- }
-
- if (rc == ERANGE)
- rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
-
- return (rc);
-}
-
-/*
- * It is the caller's responsibility to ensure that the dsl_dataset_t
- * doesn't go away during traversal.
- */
-int
-traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
- blkptr_cb_t func, void *arg)
-{
- spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
- traverse_handle_t *th;
- int err;
-
- th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
-
- traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
-
- while ((err = traverse_more(th)) == EAGAIN)
- continue;
-
- traverse_fini(th);
- return (err);
-}
-
-int
-traverse_more(traverse_handle_t *th)
-{
- zseg_t *zseg = list_head(&th->th_seglist);
- uint64_t save_txg; /* XXX won't be necessary with real itinerary */
- krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
- blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
- int rc;
-
- if (zseg == NULL)
- return (0);
-
- th->th_restarts++;
-
- save_txg = zseg->seg_mintxg;
-
- rw_enter(rw, RW_READER);
- th->th_locked = 1;
-
- rc = traverse_segment(th, zseg, mosbp);
- ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
-
- if (th->th_locked)
- rw_exit(rw);
- th->th_locked = 0;
-
- zseg->seg_mintxg = save_txg;
-
- if (rc == ERANGE) {
- list_remove(&th->th_seglist, zseg);
- kmem_free(zseg, sizeof (*zseg));
- return (EAGAIN);
- }
-
- return (rc);
-}
-
-/*
- * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
- * are not included. The blocks covered by this segment will all have
- * mintxg < birth < maxtxg.
- */
-static void
-traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
- uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
-{
- zseg_t *zseg;
-
- zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
-
- zseg->seg_mintxg = mintxg;
- zseg->seg_maxtxg = maxtxg;
-
- zseg->seg_start.zb_objset = sobjset;
- zseg->seg_start.zb_object = sobject;
- zseg->seg_start.zb_level = slevel;
- zseg->seg_start.zb_blkid = sblkid;
-
- zseg->seg_end.zb_objset = eobjset;
- zseg->seg_end.zb_object = eobject;
- zseg->seg_end.zb_level = elevel;
- zseg->seg_end.zb_blkid = eblkid;
-
- list_insert_tail(&th->th_seglist, zseg);
-}
-
-void
-traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t objset, uint64_t object)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- objset, object, ZB_MAXLEVEL, 0,
- objset, object, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- objset, object, 0, 0,
- objset, object, 0, ZB_MAXBLKID);
-}
-
-void
-traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
- uint64_t objset)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- objset, 0, -1, 0,
- objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- objset, 1, 0, 0,
- objset, 0, -1, 0);
-}
-
-void
-traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
-{
- if (th->th_advance & ADVANCE_PRE)
- traverse_add_segment(th, mintxg, maxtxg,
- 0, 0, -1, 0,
- ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
- else
- traverse_add_segment(th, mintxg, maxtxg,
- 1, 1, 0, 0,
- 0, 0, -1, 0);
-}
-
-traverse_handle_t *
-traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
- int zio_flags)
-{
- traverse_handle_t *th;
- int d, l;
-
- th = kmem_zalloc(sizeof (*th), KM_SLEEP);
-
- th->th_spa = spa;
- th->th_func = func;
- th->th_arg = arg;
- th->th_advance = advance;
- th->th_lastcb.zb_level = ZB_NO_LEVEL;
- th->th_noread.zb_level = ZB_NO_LEVEL;
- th->th_zio_flags = zio_flags;
-
- list_create(&th->th_seglist, sizeof (zseg_t),
- offsetof(zseg_t, seg_node));
-
- for (d = 0; d < ZB_DEPTH; d++) {
- for (l = 0; l < ZB_MAXLEVEL; l++) {
- if ((advance & ADVANCE_DATA) ||
- l != 0 || d != ZB_DN_CACHE)
- th->th_cache[d][l].bc_data =
- zio_buf_alloc(SPA_MAXBLOCKSIZE);
- }
- }
-
- return (th);
-}
-
-void
-traverse_fini(traverse_handle_t *th)
-{
- int d, l;
- zseg_t *zseg;
-
- for (d = 0; d < ZB_DEPTH; d++)
- for (l = 0; l < ZB_MAXLEVEL; l++)
- if (th->th_cache[d][l].bc_data != NULL)
- zio_buf_free(th->th_cache[d][l].bc_data,
- SPA_MAXBLOCKSIZE);
-
- while ((zseg = list_head(&th->th_seglist)) != NULL) {
- list_remove(&th->th_seglist, zseg);
- kmem_free(zseg, sizeof (*zseg));
- }
-
- list_destroy(&th->th_seglist);
-
- dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
- th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
- th->th_syncs, th->th_restarts);
-
- kmem_free(th, sizeof (*th));
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
deleted file mode 100644
index 13fd8d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ /dev/null
@@ -1,992 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dbuf.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
-#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
-#include <sys/dsl_pool.h>
-#include <sys/zap_impl.h> /* for fzap_default_block_shift */
-#include <sys/spa.h>
-#include <sys/zfs_context.h>
-
-typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
- uint64_t arg1, uint64_t arg2);
-
-
-dmu_tx_t *
-dmu_tx_create_dd(dsl_dir_t *dd)
-{
- dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
- tx->tx_dir = dd;
- if (dd)
- tx->tx_pool = dd->dd_pool;
- list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
- offsetof(dmu_tx_hold_t, txh_node));
-#ifdef ZFS_DEBUG
- refcount_create(&tx->tx_space_written);
- refcount_create(&tx->tx_space_freed);
-#endif
- return (tx);
-}
-
-dmu_tx_t *
-dmu_tx_create(objset_t *os)
-{
- dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
- tx->tx_objset = os;
- tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
- return (tx);
-}
-
-dmu_tx_t *
-dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
-{
- dmu_tx_t *tx = dmu_tx_create_dd(NULL);
-
- ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
- tx->tx_pool = dp;
- tx->tx_txg = txg;
- tx->tx_anyobj = TRUE;
-
- return (tx);
-}
-
-int
-dmu_tx_is_syncing(dmu_tx_t *tx)
-{
- return (tx->tx_anyobj);
-}
-
-int
-dmu_tx_private_ok(dmu_tx_t *tx)
-{
- return (tx->tx_anyobj);
-}
-
-static dmu_tx_hold_t *
-dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
- enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
-{
- dmu_tx_hold_t *txh;
- dnode_t *dn = NULL;
- int err;
-
- if (object != DMU_NEW_OBJECT) {
- err = dnode_hold(os->os, object, tx, &dn);
- if (err) {
- tx->tx_err = err;
- return (NULL);
- }
-
- if (err == 0 && tx->tx_txg != 0) {
- mutex_enter(&dn->dn_mtx);
- /*
- * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
- * problem, but there's no way for it to happen (for
- * now, at least).
- */
- ASSERT(dn->dn_assigned_txg == 0);
- dn->dn_assigned_txg = tx->tx_txg;
- (void) refcount_add(&dn->dn_tx_holds, tx);
- mutex_exit(&dn->dn_mtx);
- }
- }
-
- txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
- txh->txh_tx = tx;
- txh->txh_dnode = dn;
-#ifdef ZFS_DEBUG
- txh->txh_type = type;
- txh->txh_arg1 = arg1;
- txh->txh_arg2 = arg2;
-#endif
- list_insert_tail(&tx->tx_holds, txh);
-
- return (txh);
-}
-
-void
-dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
-{
- /*
- * If we're syncing, they can manipulate any object anyhow, and
- * the hold on the dnode_t can cause problems.
- */
- if (!dmu_tx_is_syncing(tx)) {
- (void) dmu_tx_hold_object_impl(tx, os,
- object, THT_NEWOBJECT, 0, 0);
- }
-}
-
-static int
-dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
-{
- int err;
- dmu_buf_impl_t *db;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold_level(dn, level, blkid, FTAG);
- rw_exit(&dn->dn_struct_rwlock);
- if (db == NULL)
- return (EIO);
- err = dbuf_read(db, zio, DB_RF_CANFAIL);
- dbuf_rele(db, FTAG);
- return (err);
-}
-
-/* ARGSUSED */
-static void
-dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
- dnode_t *dn = txh->txh_dnode;
- uint64_t start, end, i;
- int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
- int err = 0;
-
- if (len == 0)
- return;
-
- min_bs = SPA_MINBLOCKSHIFT;
- max_bs = SPA_MAXBLOCKSHIFT;
- min_ibs = DN_MIN_INDBLKSHIFT;
- max_ibs = DN_MAX_INDBLKSHIFT;
-
-
- /*
- * For i/o error checking, read the first and last level-0
- * blocks (if they are not aligned), and all the level-1 blocks.
- */
-
- if (dn) {
- if (dn->dn_maxblkid == 0) {
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err)
- goto out;
- } else {
- zio_t *zio = zio_root(dn->dn_objset->os_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
-
- /* first level-0 block */
- start = off >> dn->dn_datablkshift;
- if (P2PHASE(off, dn->dn_datablksz) ||
- len < dn->dn_datablksz) {
- err = dmu_tx_check_ioerr(zio, dn, 0, start);
- if (err)
- goto out;
- }
-
- /* last level-0 block */
- end = (off+len-1) >> dn->dn_datablkshift;
- if (end != start &&
- P2PHASE(off+len, dn->dn_datablksz)) {
- err = dmu_tx_check_ioerr(zio, dn, 0, end);
- if (err)
- goto out;
- }
-
- /* level-1 blocks */
- if (dn->dn_nlevels > 1) {
- start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (i = start+1; i < end; i++) {
- err = dmu_tx_check_ioerr(zio, dn, 1, i);
- if (err)
- goto out;
- }
- }
-
- err = zio_wait(zio);
- if (err)
- goto out;
- }
- }
-
- /*
- * If there's more than one block, the blocksize can't change,
- * so we can make a more precise estimate. Alternatively,
- * if the dnode's ibs is larger than max_ibs, always use that.
- * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
- * the code will still work correctly on existing pools.
- */
- if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
- min_ibs = max_ibs = dn->dn_indblkshift;
- if (dn->dn_datablkshift != 0)
- min_bs = max_bs = dn->dn_datablkshift;
- }
-
- /*
- * 'end' is the last thing we will access, not one past.
- * This way we won't overflow when accessing the last byte.
- */
- start = P2ALIGN(off, 1ULL << max_bs);
- end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
- txh->txh_space_towrite += end - start + 1;
-
- start >>= min_bs;
- end >>= min_bs;
-
- epbs = min_ibs - SPA_BLKPTRSHIFT;
-
- /*
- * The object contains at most 2^(64 - min_bs) blocks,
- * and each indirect level maps 2^epbs.
- */
- for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
- start >>= epbs;
- end >>= epbs;
- /*
- * If we increase the number of levels of indirection,
- * we'll need new blkid=0 indirect blocks. If start == 0,
- * we're already accounting for that blocks; and if end == 0,
- * we can't increase the number of levels beyond that.
- */
- if (start != 0 && end != 0)
- txh->txh_space_towrite += 1ULL << max_ibs;
- txh->txh_space_towrite += (end - start + 1) << max_ibs;
- }
-
- ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
-
-out:
- if (err)
- txh->txh_tx->tx_err = err;
-}
-
-static void
-dmu_tx_count_dnode(dmu_tx_hold_t *txh)
-{
- dnode_t *dn = txh->txh_dnode;
- dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
- uint64_t space = mdn->dn_datablksz +
- ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
-
- if (dn && dn->dn_dbuf->db_blkptr &&
- dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_dbuf->db_blkptr->blk_birth)) {
- txh->txh_space_tooverwrite += space;
- } else {
- txh->txh_space_towrite += space;
- }
-}
-
-void
-dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
- ASSERT(len < DMU_MAX_ACCESS);
- ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_WRITE, off, len);
- if (txh == NULL)
- return;
-
- dmu_tx_count_write(txh, off, len);
- dmu_tx_count_dnode(txh);
-}
-
-static void
-dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
- uint64_t blkid, nblks;
- uint64_t space = 0;
- dnode_t *dn = txh->txh_dnode;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- int dirty;
-
- /*
- * We don't need to use any locking to check for dirtyness
- * because it's OK if we get stale data -- the dnode may become
- * dirty immediately after our check anyway. This is just a
- * means to avoid the expensive count when we aren't sure we
- * need it. We need to be able to deal with a dirty dnode.
- */
- dirty = list_link_active(&dn->dn_dirty_link[0]) |
- list_link_active(&dn->dn_dirty_link[1]) |
- list_link_active(&dn->dn_dirty_link[2]) |
- list_link_active(&dn->dn_dirty_link[3]);
- if (dirty || dn->dn_assigned_txg || dn->dn_phys->dn_nlevels == 0)
- return;
-
- /*
- * the struct_rwlock protects us against dn_phys->dn_nlevels
- * changing, in case (against all odds) we manage to dirty &
- * sync out the changes after we check for being dirty.
- * also, dbuf_hold_impl() wants us to have the struct_rwlock.
- *
- * It's fine to use dn_datablkshift rather than the dn_phys
- * equivalent because if it is changing, maxblkid==0 and we will
- * bail.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_phys->dn_maxblkid == 0) {
- if (off == 0 && len >= dn->dn_datablksz) {
- blkid = 0;
- nblks = 1;
- } else {
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
- } else {
- blkid = off >> dn->dn_datablkshift;
- nblks = (off + len) >> dn->dn_datablkshift;
-
- if (blkid >= dn->dn_phys->dn_maxblkid) {
- rw_exit(&dn->dn_struct_rwlock);
- return;
- }
- if (blkid + nblks > dn->dn_phys->dn_maxblkid)
- nblks = dn->dn_phys->dn_maxblkid - blkid;
-
- /* don't bother after 128,000 blocks */
- nblks = MIN(nblks, 128*1024);
- }
-
- if (dn->dn_phys->dn_nlevels == 1) {
- int i;
- for (i = 0; i < nblks; i++) {
- blkptr_t *bp = dn->dn_phys->dn_blkptr;
- ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
- bp += blkid + i;
- if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
- dprintf_bp(bp, "can free old%s", "");
- space += bp_get_dasize(spa, bp);
- }
- }
- nblks = 0;
- }
-
- while (nblks) {
- dmu_buf_impl_t *dbuf;
- int err, epbs, blkoff, tochk;
-
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- blkoff = P2PHASE(blkid, 1<<epbs);
- tochk = MIN((1<<epbs) - blkoff, nblks);
-
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
- if (err == 0) {
- int i;
- blkptr_t *bp;
-
- err = dbuf_read(dbuf, NULL,
- DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
- if (err != 0) {
- txh->txh_tx->tx_err = err;
- dbuf_rele(dbuf, FTAG);
- break;
- }
-
- bp = dbuf->db.db_data;
- bp += blkoff;
-
- for (i = 0; i < tochk; i++) {
- if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth)) {
- dprintf_bp(&bp[i],
- "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
- }
- }
- dbuf_rele(dbuf, FTAG);
- }
- if (err && err != ENOENT) {
- txh->txh_tx->tx_err = err;
- break;
- }
-
- blkid += tochk;
- nblks -= tochk;
- }
- rw_exit(&dn->dn_struct_rwlock);
-
- txh->txh_space_tofree += space;
-}
-
-void
-dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
-{
- dmu_tx_hold_t *txh;
- dnode_t *dn;
- uint64_t start, end, i;
- int err, shift;
- zio_t *zio;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_FREE, off, len);
- if (txh == NULL)
- return;
- dn = txh->txh_dnode;
-
- /* first block */
- if (off != 0)
- dmu_tx_count_write(txh, off, 1);
- /* last block */
- if (len != DMU_OBJECT_END)
- dmu_tx_count_write(txh, off+len, 1);
-
- if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
- return;
- if (len == DMU_OBJECT_END)
- len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
-
- /*
- * For i/o error checking, read the first and last level-0
- * blocks, and all the level-1 blocks. The above count_write's
- * will take care of the level-0 blocks.
- */
- if (dn->dn_nlevels > 1) {
- shift = dn->dn_datablkshift + dn->dn_indblkshift -
- SPA_BLKPTRSHIFT;
- start = off >> shift;
- end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
-
- zio = zio_root(tx->tx_pool->dp_spa,
- NULL, NULL, ZIO_FLAG_CANFAIL);
- for (i = start; i <= end; i++) {
- uint64_t ibyte = i << shift;
- err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1, 0);
- i = ibyte >> shift;
- if (err == ESRCH)
- break;
- if (err) {
- tx->tx_err = err;
- return;
- }
-
- err = dmu_tx_check_ioerr(zio, dn, 1, i);
- if (err) {
- tx->tx_err = err;
- return;
- }
- }
- err = zio_wait(zio);
- if (err) {
- tx->tx_err = err;
- return;
- }
- }
-
- dmu_tx_count_dnode(txh);
- dmu_tx_count_free(txh, off, len);
-}
-
-void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
-{
- dmu_tx_hold_t *txh;
- dnode_t *dn;
- uint64_t nblocks;
- int epbs, err;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_ZAP, add, (uintptr_t)name);
- if (txh == NULL)
- return;
- dn = txh->txh_dnode;
-
- dmu_tx_count_dnode(txh);
-
- if (dn == NULL) {
- /*
- * We will be able to fit a new object's entries into one leaf
- * block. So there will be at most 2 blocks total,
- * including the header block.
- */
- dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
- return;
- }
-
- ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
-
- if (dn->dn_maxblkid == 0 && !add) {
- /*
- * If there is only one block (i.e. this is a micro-zap)
- * and we are not adding anything, the accounting is simple.
- */
- err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
- if (err) {
- tx->tx_err = err;
- return;
- }
-
- /*
- * Use max block size here, since we don't know how much
- * the size will change between now and the dbuf dirty call.
- */
- if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_phys->dn_blkptr[0].blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
- else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
- return;
- }
-
- if (dn->dn_maxblkid > 0 && name) {
- /*
- * access the name in this fat-zap so that we'll check
- * for i/o errors to the leaf blocks, etc.
- */
- err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
- 8, 0, NULL);
- if (err == EIO) {
- tx->tx_err = err;
- return;
- }
- }
-
- /*
- * 3 blocks overwritten: target leaf, ptrtbl block, header block
- * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
- */
- dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
- (3 + add ? 3 : 0) << dn->dn_datablkshift);
-
- /*
- * If the modified blocks are scattered to the four winds,
- * we'll have to modify an indirect twig for each.
- */
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
- txh->txh_space_towrite += 3 << dn->dn_indblkshift;
-}
-
-void
-dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- object, THT_BONUS, 0, 0);
- if (txh)
- dmu_tx_count_dnode(txh);
-}
-
-void
-dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
-{
- dmu_tx_hold_t *txh;
- ASSERT(tx->tx_txg == 0);
-
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
- DMU_NEW_OBJECT, THT_SPACE, space, 0);
-
- txh->txh_space_towrite += space;
-}
-
-int
-dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
-{
- dmu_tx_hold_t *txh;
- int holds = 0;
-
- /*
- * By asserting that the tx is assigned, we're counting the
- * number of dn_tx_holds, which is the same as the number of
- * dn_holds. Otherwise, we'd be counting dn_holds, but
- * dn_tx_holds could be 0.
- */
- ASSERT(tx->tx_txg != 0);
-
- /* if (tx->tx_anyobj == TRUE) */
- /* return (0); */
-
- for (txh = list_head(&tx->tx_holds); txh;
- txh = list_next(&tx->tx_holds, txh)) {
- if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
- holds++;
- }
-
- return (holds);
-}
-
-#ifdef ZFS_DEBUG
-void
-dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
-{
- dmu_tx_hold_t *txh;
- int match_object = FALSE, match_offset = FALSE;
- dnode_t *dn = db->db_dnode;
-
- ASSERT(tx->tx_txg != 0);
- ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
- ASSERT3U(dn->dn_object, ==, db->db.db_object);
-
- if (tx->tx_anyobj)
- return;
-
- /* XXX No checking on the meta dnode for now */
- if (db->db.db_object == DMU_META_DNODE_OBJECT)
- return;
-
- for (txh = list_head(&tx->tx_holds); txh;
- txh = list_next(&tx->tx_holds, txh)) {
- ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
- if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
- match_object = TRUE;
- if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
- int datablkshift = dn->dn_datablkshift ?
- dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int shift = datablkshift + epbs * db->db_level;
- uint64_t beginblk = shift >= 64 ? 0 :
- (txh->txh_arg1 >> shift);
- uint64_t endblk = shift >= 64 ? 0 :
- ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
- uint64_t blkid = db->db_blkid;
-
- /* XXX txh_arg2 better not be zero... */
-
- dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
- txh->txh_type, beginblk, endblk);
-
- switch (txh->txh_type) {
- case THT_WRITE:
- if (blkid >= beginblk && blkid <= endblk)
- match_offset = TRUE;
- /*
- * We will let this hold work for the bonus
- * buffer so that we don't need to hold it
- * when creating a new object.
- */
- if (blkid == DB_BONUS_BLKID)
- match_offset = TRUE;
- /*
- * They might have to increase nlevels,
- * thus dirtying the new TLIBs. Or the
- * might have to change the block size,
- * thus dirying the new lvl=0 blk=0.
- */
- if (blkid == 0)
- match_offset = TRUE;
- break;
- case THT_FREE:
- if (blkid == beginblk &&
- (txh->txh_arg1 != 0 ||
- dn->dn_maxblkid == 0))
- match_offset = TRUE;
- if (blkid == endblk &&
- txh->txh_arg2 != DMU_OBJECT_END)
- match_offset = TRUE;
- break;
- case THT_BONUS:
- if (blkid == DB_BONUS_BLKID)
- match_offset = TRUE;
- break;
- case THT_ZAP:
- match_offset = TRUE;
- break;
- case THT_NEWOBJECT:
- match_object = TRUE;
- break;
- default:
- ASSERT(!"bad txh_type");
- }
- }
- if (match_object && match_offset)
- return;
- }
- panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
- (u_longlong_t)db->db.db_object, db->db_level,
- (u_longlong_t)db->db_blkid);
-}
-#endif
-
-static int
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
-{
- dmu_tx_hold_t *txh;
- uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
-
- ASSERT3U(tx->tx_txg, ==, 0);
- if (tx->tx_err)
- return (tx->tx_err);
-
- tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
- tx->tx_needassign_txh = NULL;
-
- /*
- * NB: No error returns are allowed after txg_hold_open, but
- * before processing the dnode holds, due to the
- * dmu_tx_unassign() logic.
- */
-
- towrite = tofree = tooverwrite = 0;
- for (txh = list_head(&tx->tx_holds); txh;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
- if (dn != NULL) {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_assigned_txg == tx->tx_txg - 1) {
- mutex_exit(&dn->dn_mtx);
- tx->tx_needassign_txh = txh;
- return (ERESTART);
- }
- if (dn->dn_assigned_txg == 0)
- dn->dn_assigned_txg = tx->tx_txg;
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
- (void) refcount_add(&dn->dn_tx_holds, tx);
- mutex_exit(&dn->dn_mtx);
- }
- towrite += txh->txh_space_towrite;
- tofree += txh->txh_space_tofree;
- tooverwrite += txh->txh_space_tooverwrite;
- }
-
- /*
- * NB: This check must be after we've held the dnodes, so that
- * the dmu_tx_unassign() logic will work properly
- */
- if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
- return (ERESTART);
-
- /*
- * If a snapshot has been taken since we made our estimates,
- * assume that we won't be able to free or overwrite anything.
- */
- if (tx->tx_objset &&
- dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
- tx->tx_lastsnap_txg) {
- towrite += tooverwrite;
- tooverwrite = tofree = 0;
- }
-
- /*
- * Convert logical size to worst-case allocated size.
- */
- fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
- lsize = towrite + tooverwrite;
- asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
-
-#ifdef ZFS_DEBUG
- tx->tx_space_towrite = asize;
- tx->tx_space_tofree = tofree;
- tx->tx_space_tooverwrite = tooverwrite;
-#endif
-
- if (tx->tx_dir && asize != 0) {
- int err = dsl_dir_tempreserve_space(tx->tx_dir,
- lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
- if (err)
- return (err);
- }
-
- return (0);
-}
-
-static void
-dmu_tx_unassign(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- if (tx->tx_txg == 0)
- return;
-
- txg_rele_to_quiesce(&tx->tx_txgh);
-
- for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
- txh = list_next(&tx->tx_holds, txh)) {
- dnode_t *dn = txh->txh_dnode;
-
- if (dn == NULL)
- continue;
- mutex_enter(&dn->dn_mtx);
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
-
- if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
- dn->dn_assigned_txg = 0;
- cv_broadcast(&dn->dn_notxholds);
- }
- mutex_exit(&dn->dn_mtx);
- }
-
- txg_rele_to_sync(&tx->tx_txgh);
-
- tx->tx_lasttried_txg = tx->tx_txg;
- tx->tx_txg = 0;
-}
-
-/*
- * Assign tx to a transaction group. txg_how can be one of:
- *
- * (1) TXG_WAIT. If the current open txg is full, waits until there's
- * a new one. This should be used when you're not holding locks.
- * If will only fail if we're truly out of space (or over quota).
- *
- * (2) TXG_NOWAIT. If we can't assign into the current open txg without
- * blocking, returns immediately with ERESTART. This should be used
- * whenever you're holding locks. On an ERESTART error, the caller
- * should drop locks, do a dmu_tx_wait(tx), and try again.
- *
- * (3) A specific txg. Use this if you need to ensure that multiple
- * transactions all sync in the same txg. Like TXG_NOWAIT, it
- * returns ERESTART if it can't assign you into the requested txg.
- */
-int
-dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
-{
- int err;
-
- ASSERT(tx->tx_txg == 0);
- ASSERT(txg_how != 0);
- ASSERT(!dsl_pool_sync_context(tx->tx_pool));
-
- while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
- dmu_tx_unassign(tx);
-
- if (err != ERESTART || txg_how != TXG_WAIT)
- return (err);
-
- dmu_tx_wait(tx);
- }
-
- txg_rele_to_quiesce(&tx->tx_txgh);
-
- return (0);
-}
-
-void
-dmu_tx_wait(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg == 0);
- ASSERT(tx->tx_lasttried_txg != 0);
-
- if (tx->tx_needassign_txh) {
- dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
-
- mutex_enter(&dn->dn_mtx);
- while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
- cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
- mutex_exit(&dn->dn_mtx);
- tx->tx_needassign_txh = NULL;
- } else {
- txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
- }
-}
-
-void
-dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
-{
-#ifdef ZFS_DEBUG
- if (tx->tx_dir == NULL || delta == 0)
- return;
-
- if (delta > 0) {
- ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
- tx->tx_space_towrite);
- (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
- } else {
- (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
- }
-#endif
-}
-
-void
-dmu_tx_commit(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg != 0);
-
- while (txh = list_head(&tx->tx_holds)) {
- dnode_t *dn = txh->txh_dnode;
-
- list_remove(&tx->tx_holds, txh);
- kmem_free(txh, sizeof (dmu_tx_hold_t));
- if (dn == NULL)
- continue;
- mutex_enter(&dn->dn_mtx);
- ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
-
- if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
- dn->dn_assigned_txg = 0;
- cv_broadcast(&dn->dn_notxholds);
- }
- mutex_exit(&dn->dn_mtx);
- dnode_rele(dn, tx);
- }
-
- if (tx->tx_tempreserve_cookie)
- dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
-
- if (tx->tx_anyobj == FALSE)
- txg_rele_to_sync(&tx->tx_txgh);
-#ifdef ZFS_DEBUG
- dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
- tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
- tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
- refcount_destroy_many(&tx->tx_space_written,
- refcount_count(&tx->tx_space_written));
- refcount_destroy_many(&tx->tx_space_freed,
- refcount_count(&tx->tx_space_freed));
-#endif
- kmem_free(tx, sizeof (dmu_tx_t));
-}
-
-void
-dmu_tx_abort(dmu_tx_t *tx)
-{
- dmu_tx_hold_t *txh;
-
- ASSERT(tx->tx_txg == 0);
-
- while (txh = list_head(&tx->tx_holds)) {
- dnode_t *dn = txh->txh_dnode;
-
- list_remove(&tx->tx_holds, txh);
- kmem_free(txh, sizeof (dmu_tx_hold_t));
- if (dn != NULL)
- dnode_rele(dn, tx);
- }
-#ifdef ZFS_DEBUG
- refcount_destroy_many(&tx->tx_space_written,
- refcount_count(&tx->tx_space_written));
- refcount_destroy_many(&tx->tx_space_freed,
- refcount_count(&tx->tx_space_freed));
-#endif
- kmem_free(tx, sizeof (dmu_tx_t));
-}
-
-uint64_t
-dmu_tx_get_txg(dmu_tx_t *tx)
-{
- ASSERT(tx->tx_txg != 0);
- return (tx->tx_txg);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
deleted file mode 100644
index 78d625c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ /dev/null
@@ -1,655 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dnode.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_zfetch.h>
-#include <sys/dmu.h>
-#include <sys/dbuf.h>
-
-/*
- * I'm against tune-ables, but these should probably exist as tweakable globals
- * until we can get this working the way we want it to.
- */
-
-int zfs_prefetch_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.prefetch_disable", &zfs_prefetch_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RDTUN,
- &zfs_prefetch_disable, 0, "Disable prefetch");
-
-/* max # of streams per zfetch */
-uint32_t zfetch_max_streams = 8;
-/* min time before stream reclaim */
-uint32_t zfetch_min_sec_reap = 2;
-/* max number of blocks to fetch at a time */
-uint32_t zfetch_block_cap = 256;
-/* number of bytes in a array_read at which we stop prefetching (1Mb) */
-uint64_t zfetch_array_rd_sz = 1024 * 1024;
-
-/* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
-static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
-static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
-static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
-static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
-static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
-static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
-static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
-
-/*
- * Given a zfetch structure and a zstream structure, determine whether the
- * blocks to be read are part of a co-linear pair of existing prefetch
- * streams. If a set is found, coalesce the streams, removing one, and
- * configure the prefetch so it looks for a strided access pattern.
- *
- * In other words: if we find two sequential access streams that are
- * the same length and distance N appart, and this read is N from the
- * last stream, then we are probably in a strided access pattern. So
- * combine the two sequential streams into a single strided stream.
- *
- * If no co-linear streams are found, return NULL.
- */
-static int
-dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
-{
- zstream_t *z_walk;
- zstream_t *z_comp;
-
- if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
- return (0);
-
- if (zh == NULL) {
- rw_exit(&zf->zf_rwlock);
- return (0);
- }
-
- for (z_walk = list_head(&zf->zf_stream); z_walk;
- z_walk = list_next(&zf->zf_stream, z_walk)) {
- for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
- z_comp = list_next(&zf->zf_stream, z_comp)) {
- int64_t diff;
-
- if (z_walk->zst_len != z_walk->zst_stride ||
- z_comp->zst_len != z_comp->zst_stride) {
- continue;
- }
-
- diff = z_comp->zst_offset - z_walk->zst_offset;
- if (z_comp->zst_offset + diff == zh->zst_offset) {
- z_walk->zst_offset = zh->zst_offset;
- z_walk->zst_direction = diff < 0 ? -1 : 1;
- z_walk->zst_stride =
- diff * z_walk->zst_direction;
- z_walk->zst_ph_offset =
- zh->zst_offset + z_walk->zst_stride;
- dmu_zfetch_stream_remove(zf, z_comp);
- mutex_destroy(&z_comp->zst_lock);
- kmem_free(z_comp, sizeof (zstream_t));
-
- dmu_zfetch_dofetch(zf, z_walk);
-
- rw_exit(&zf->zf_rwlock);
- return (1);
- }
-
- diff = z_walk->zst_offset - z_comp->zst_offset;
- if (z_walk->zst_offset + diff == zh->zst_offset) {
- z_walk->zst_offset = zh->zst_offset;
- z_walk->zst_direction = diff < 0 ? -1 : 1;
- z_walk->zst_stride =
- diff * z_walk->zst_direction;
- z_walk->zst_ph_offset =
- zh->zst_offset + z_walk->zst_stride;
- dmu_zfetch_stream_remove(zf, z_comp);
- mutex_destroy(&z_comp->zst_lock);
- kmem_free(z_comp, sizeof (zstream_t));
-
- dmu_zfetch_dofetch(zf, z_walk);
-
- rw_exit(&zf->zf_rwlock);
- return (1);
- }
- }
- }
-
- rw_exit(&zf->zf_rwlock);
- return (0);
-}
-
-/*
- * Given a zstream_t, determine the bounds of the prefetch. Then call the
- * routine that actually prefetches the individual blocks.
- */
-static void
-dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
-{
- uint64_t prefetch_tail;
- uint64_t prefetch_limit;
- uint64_t prefetch_ofst;
- uint64_t prefetch_len;
- uint64_t blocks_fetched;
-
- zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
- zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
-
- prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
- (int64_t)(zs->zst_offset + zs->zst_stride));
- /*
- * XXX: use a faster division method?
- */
- prefetch_limit = zs->zst_offset + zs->zst_len +
- (zs->zst_cap * zs->zst_stride) / zs->zst_len;
-
- while (prefetch_tail < prefetch_limit) {
- prefetch_ofst = zs->zst_offset + zs->zst_direction *
- (prefetch_tail - zs->zst_offset);
-
- prefetch_len = zs->zst_len;
-
- /*
- * Don't prefetch beyond the end of the file, if working
- * backwards.
- */
- if ((zs->zst_direction == ZFETCH_BACKWARD) &&
- (prefetch_ofst > prefetch_tail)) {
- prefetch_len += prefetch_ofst;
- prefetch_ofst = 0;
- }
-
- /* don't prefetch more than we're supposed to */
- if (prefetch_len > zs->zst_len)
- break;
-
- blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
- prefetch_ofst, zs->zst_len);
-
- prefetch_tail += zs->zst_stride;
- /* stop if we've run out of stuff to prefetch */
- if (blocks_fetched < zs->zst_len)
- break;
- }
- zs->zst_ph_offset = prefetch_tail;
- zs->zst_last = lbolt;
-}
-
-/*
- * This takes a pointer to a zfetch structure and a dnode. It performs the
- * necessary setup for the zfetch structure, grokking data from the
- * associated dnode.
- */
-void
-dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
-{
- if (zf == NULL) {
- return;
- }
-
- zf->zf_dnode = dno;
- zf->zf_stream_cnt = 0;
- zf->zf_alloc_fail = 0;
-
- list_create(&zf->zf_stream, sizeof (zstream_t),
- offsetof(zstream_t, zst_node));
-
- rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
-}
-
-/*
- * This function computes the actual size, in blocks, that can be prefetched,
- * and fetches it.
- */
-static uint64_t
-dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
- uint64_t fetchsz;
- uint64_t i;
-
- fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
-
- for (i = 0; i < fetchsz; i++) {
- dbuf_prefetch(dn, blkid + i);
- }
-
- return (fetchsz);
-}
-
-/*
- * this function returns the number of blocks that would be prefetched, based
- * upon the supplied dnode, blockid, and nblks. This is used so that we can
- * update streams in place, and then prefetch with their old value after the
- * fact. This way, we can delay the prefetch, but subsequent accesses to the
- * stream won't result in the same data being prefetched multiple times.
- */
-static uint64_t
-dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
- uint64_t fetchsz;
-
- if (blkid > dn->dn_maxblkid) {
- return (0);
- }
-
- /* compute fetch size */
- if (blkid + nblks + 1 > dn->dn_maxblkid) {
- fetchsz = (dn->dn_maxblkid - blkid) + 1;
- ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
- } else {
- fetchsz = nblks;
- }
-
-
- return (fetchsz);
-}
-
-/*
- * given a zfetch and a zstream describing the read, see if there is an associated zstream
- * for this block read. If so, it starts a prefetch for the stream it
- * located and returns true, otherwise it returns false
- */
-static int
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
-{
- zstream_t *zs;
- int64_t diff;
- int reset = !prefetched;
- int rc = 0;
-
- if (zh == NULL)
- return (0);
-
- /*
- * XXX: This locking strategy is a bit coarse; however, its impact has
- * yet to be tested. If this turns out to be an issue, it can be
- * modified in a number of different ways.
- */
-
- rw_enter(&zf->zf_rwlock, RW_READER);
-top:
-
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
-
- /*
- * XXX - should this be an assert?
- */
- if (zs->zst_len == 0) {
- /* bogus stream */
- continue;
- }
-
- /*
- * We hit this case when we are in a strided prefetch stream:
- * we will read "len" blocks before "striding".
- */
- if (zh->zst_offset >= zs->zst_offset &&
- zh->zst_offset < zs->zst_offset + zs->zst_len) {
- /* already fetched */
- rc = 1;
- goto out;
- }
-
- /*
- * This is the forward sequential read case: we increment
- * len by one each time we hit here, so we will enter this
- * case on every read.
- */
- if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
-
- reset = !prefetched && zs->zst_len > 1;
-
- mutex_enter(&zs->zst_lock);
-
- if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
- zs->zst_len += zh->zst_len;
- diff = zs->zst_len - zfetch_block_cap;
- if (diff > 0) {
- zs->zst_offset += diff;
- zs->zst_len = zs->zst_len > diff ?
- zs->zst_len - diff : 0;
- }
- zs->zst_direction = ZFETCH_FORWARD;
-
- break;
-
- /*
- * Same as above, but reading backwards through the file.
- */
- } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
- /* backwards sequential access */
-
- reset = !prefetched && zs->zst_len > 1;
-
- mutex_enter(&zs->zst_lock);
-
- if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset = zs->zst_offset > zh->zst_len ?
- zs->zst_offset - zh->zst_len : 0;
- zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
- zs->zst_ph_offset - zh->zst_len : 0;
- zs->zst_len += zh->zst_len;
-
- diff = zs->zst_len - zfetch_block_cap;
- if (diff > 0) {
- zs->zst_ph_offset = zs->zst_ph_offset > diff ?
- zs->zst_ph_offset - diff : 0;
- zs->zst_len = zs->zst_len > diff ?
- zs->zst_len - diff : zs->zst_len;
- }
- zs->zst_direction = ZFETCH_BACKWARD;
-
- break;
-
- } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
- zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
- /* strided forward access */
-
- mutex_enter(&zs->zst_lock);
-
- if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
- zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset += zs->zst_stride;
- zs->zst_direction = ZFETCH_FORWARD;
-
- break;
-
- } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
- zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
- /* strided reverse access */
-
- mutex_enter(&zs->zst_lock);
-
- if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
- zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset = zs->zst_offset > zs->zst_stride ?
- zs->zst_offset - zs->zst_stride : 0;
- zs->zst_ph_offset = (zs->zst_ph_offset >
- (2 * zs->zst_stride)) ?
- (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
- zs->zst_direction = ZFETCH_BACKWARD;
-
- break;
- }
- }
-
- if (zs) {
- if (reset) {
- zstream_t *remove = zs;
-
- rc = 0;
- mutex_exit(&zs->zst_lock);
- rw_exit(&zf->zf_rwlock);
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- /*
- * Relocate the stream, in case someone removes
- * it while we were acquiring the WRITER lock.
- */
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
- if (zs == remove) {
- dmu_zfetch_stream_remove(zf, zs);
- mutex_destroy(&zs->zst_lock);
- kmem_free(zs, sizeof (zstream_t));
- break;
- }
- }
- } else {
- rc = 1;
- dmu_zfetch_dofetch(zf, zs);
- mutex_exit(&zs->zst_lock);
- }
- }
-out:
- rw_exit(&zf->zf_rwlock);
- return (rc);
-}
-
-/*
- * Clean-up state associated with a zfetch structure. This frees allocated
- * structure members, empties the zf_stream list, and generally makes things
- * nice. This doesn't free the zfetch_t itself, that's left to the caller.
- */
-void
-dmu_zfetch_rele(zfetch_t *zf)
-{
- zstream_t *zs;
- zstream_t *zs_next;
-
- ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
-
- for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs);
-
- list_remove(&zf->zf_stream, zs);
- mutex_destroy(&zs->zst_lock);
- kmem_free(zs, sizeof (zstream_t));
- }
- list_destroy(&zf->zf_stream);
- rw_destroy(&zf->zf_rwlock);
-
- zf->zf_dnode = NULL;
-}
-
-/*
- * Given a zfetch and zstream structure, insert the zstream structure into the
- * list contained within the zfetch structure. Perform the appropriate
- * book-keeping. It is possible that another thread has inserted a stream which
- * matches one that we are about to insert, so we must be sure to check for this
- * case. If one is found, return failure, and let the caller cleanup the
- * duplicates.
- */
-static int
-dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
-{
- zstream_t *zs_walk;
- zstream_t *zs_next;
-
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
- for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs_walk);
-
- if (dmu_zfetch_streams_equal(zs_walk, zs)) {
- return (0);
- }
- }
-
- list_insert_head(&zf->zf_stream, zs);
- zf->zf_stream_cnt++;
-
- return (1);
-}
-
-
-/*
- * Walk the list of zstreams in the given zfetch, find an old one (by time), and
- * reclaim it for use by the caller.
- */
-static zstream_t *
-dmu_zfetch_stream_reclaim(zfetch_t *zf)
-{
- zstream_t *zs;
-
- if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
- return (0);
-
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
-
- if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
- break;
- }
-
- if (zs) {
- dmu_zfetch_stream_remove(zf, zs);
- mutex_destroy(&zs->zst_lock);
- bzero(zs, sizeof (zstream_t));
- } else {
- zf->zf_alloc_fail++;
- }
- rw_exit(&zf->zf_rwlock);
-
- return (zs);
-}
-
-/*
- * Given a zfetch and zstream structure, remove the zstream structure from its
- * container in the zfetch structure. Perform the appropriate book-keeping.
- */
-static void
-dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
- list_remove(&zf->zf_stream, zs);
- zf->zf_stream_cnt--;
-}
-
-static int
-dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
-{
- if (zs1->zst_offset != zs2->zst_offset)
- return (0);
-
- if (zs1->zst_len != zs2->zst_len)
- return (0);
-
- if (zs1->zst_stride != zs2->zst_stride)
- return (0);
-
- if (zs1->zst_ph_offset != zs2->zst_ph_offset)
- return (0);
-
- if (zs1->zst_cap != zs2->zst_cap)
- return (0);
-
- if (zs1->zst_direction != zs2->zst_direction)
- return (0);
-
- return (1);
-}
-
-/*
- * This is the prefetch entry point. It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
- */
-void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
-{
- zstream_t zst;
- zstream_t *newstream;
- int fetched;
- int inserted;
- unsigned int blkshft;
- uint64_t blksz;
-
- if (zfs_prefetch_disable)
- return;
-
- /* files with a non-power-of-2 blocksz are only one block -- nothing to do */
- if (!zf->zf_dnode->dn_datablkshift)
- return;
-
- /* convert offset and size, into blockid and nblocks */
- blkshft = zf->zf_dnode->dn_datablkshift;
- blksz = (1 << blkshft);
-
- bzero(&zst, sizeof (zstream_t));
- zst.zst_offset = offset >> blkshft;
- zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
- P2ALIGN(offset, blksz)) >> blkshft;
-
- fetched = dmu_zfetch_find(zf, &zst, prefetched);
- if (!fetched) {
- fetched = dmu_zfetch_colinear(zf, &zst);
- }
-
- if (!fetched) {
- newstream = dmu_zfetch_stream_reclaim(zf);
-
- /*
- * we still couldn't find a stream, drop the lock, and allocate
- * one if possible. Otherwise, give up and go home.
- */
- if (newstream == NULL) {
- uint64_t maxblocks;
- uint32_t max_streams;
- uint32_t cur_streams;
-
- cur_streams = zf->zf_stream_cnt;
- maxblocks = zf->zf_dnode->dn_maxblkid;
-
- max_streams = MIN(zfetch_max_streams,
- (maxblocks / zfetch_block_cap));
- if (max_streams == 0) {
- max_streams++;
- }
-
- if (cur_streams >= max_streams) {
- return;
- }
-
- newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
- }
-
- newstream->zst_offset = zst.zst_offset;
- newstream->zst_len = zst.zst_len;
- newstream->zst_stride = zst.zst_len;
- newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
- newstream->zst_cap = zst.zst_len;
- newstream->zst_direction = ZFETCH_FORWARD;
- newstream->zst_last = lbolt;
-
- mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
-
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- inserted = dmu_zfetch_stream_insert(zf, newstream);
- rw_exit(&zf->zf_rwlock);
-
- if (!inserted) {
- mutex_destroy(&newstream->zst_lock);
- kmem_free(newstream, sizeof (zstream_t));
- }
- }
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c
deleted file mode 100644
index ca50285..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ /dev/null
@@ -1,1369 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/dmu.h>
-#include <sys/dmu_impl.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu_zfetch.h>
-
-static int free_range_compar(const void *node1, const void *node2);
-
-static kmem_cache_t *dnode_cache;
-
-static dnode_phys_t dnode_phys_zero;
-
-int zfs_default_bs = SPA_MINBLOCKSHIFT;
-int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
-
-/* ARGSUSED */
-static int
-dnode_cons(void *arg, void *unused, int kmflag)
-{
- int i;
- dnode_t *dn = arg;
- bzero(dn, sizeof (dnode_t));
-
- cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
- rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
- mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
- refcount_create(&dn->dn_holds);
- refcount_create(&dn->dn_tx_holds);
-
- for (i = 0; i < TXG_SIZE; i++) {
- avl_create(&dn->dn_ranges[i], free_range_compar,
- sizeof (free_range_t),
- offsetof(struct free_range, fr_node));
- list_create(&dn->dn_dirty_records[i],
- sizeof (dbuf_dirty_record_t),
- offsetof(dbuf_dirty_record_t, dr_dirty_node));
- }
-
- list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_link));
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dnode_dest(void *arg, void *unused)
-{
- int i;
- dnode_t *dn = arg;
-
- cv_destroy(&dn->dn_notxholds);
- rw_destroy(&dn->dn_struct_rwlock);
- mutex_destroy(&dn->dn_mtx);
- mutex_destroy(&dn->dn_dbufs_mtx);
- refcount_destroy(&dn->dn_holds);
- refcount_destroy(&dn->dn_tx_holds);
-
- for (i = 0; i < TXG_SIZE; i++) {
- avl_destroy(&dn->dn_ranges[i]);
- list_destroy(&dn->dn_dirty_records[i]);
- }
-
- list_destroy(&dn->dn_dbufs);
-}
-
-void
-dnode_init(void)
-{
- dnode_cache = kmem_cache_create("dnode_t",
- sizeof (dnode_t),
- 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
-}
-
-void
-dnode_fini(void)
-{
- kmem_cache_destroy(dnode_cache);
-}
-
-
-#ifdef ZFS_DEBUG
-void
-dnode_verify(dnode_t *dn)
-{
- int drop_struct_lock = FALSE;
-
- ASSERT(dn->dn_phys);
- ASSERT(dn->dn_objset);
-
- ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
-
- if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
- return;
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
- if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
- int i;
- ASSERT3U(dn->dn_indblkshift, >=, 0);
- ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
- if (dn->dn_datablkshift) {
- ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
- ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
- ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
- }
- ASSERT3U(dn->dn_nlevels, <=, 30);
- ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
- ASSERT3U(dn->dn_nblkptr, >=, 1);
- ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- ASSERT3U(dn->dn_datablksz, ==,
- dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
- ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
- }
- }
- if (dn->dn_phys->dn_type != DMU_OT_NONE)
- ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
- if (dn->dn_dbuf != NULL) {
- ASSERT3P(dn->dn_phys, ==,
- (dnode_phys_t *)dn->dn_dbuf->db.db_data +
- (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
- }
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
-}
-#endif
-
-void
-dnode_byteswap(dnode_phys_t *dnp)
-{
- uint64_t *buf64 = (void*)&dnp->dn_blkptr;
- int i;
-
- if (dnp->dn_type == DMU_OT_NONE) {
- bzero(dnp, sizeof (dnode_phys_t));
- return;
- }
-
- dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
- dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
- dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
- dnp->dn_used = BSWAP_64(dnp->dn_used);
-
- /*
- * dn_nblkptr is only one byte, so it's OK to read it in either
- * byte order. We can't read dn_bonuslen.
- */
- ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
- ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
- for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
- buf64[i] = BSWAP_64(buf64[i]);
-
- /*
- * OK to check dn_bonuslen for zero, because it won't matter if
- * we have the wrong byte order. This is necessary because the
- * dnode dnode is smaller than a regular dnode.
- */
- if (dnp->dn_bonuslen != 0) {
- /*
- * Note that the bonus length calculated here may be
- * longer than the actual bonus buffer. This is because
- * we always put the bonus buffer after the last block
- * pointer (instead of packing it against the end of the
- * dnode buffer).
- */
- int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
- ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
- dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
- }
-}
-
-void
-dnode_buf_byteswap(void *vbuf, size_t size)
-{
- dnode_phys_t *buf = vbuf;
- int i;
-
- ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
- ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
-
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
- }
-}
-
-static int
-free_range_compar(const void *node1, const void *node2)
-{
- const free_range_t *rp1 = node1;
- const free_range_t *rp2 = node2;
-
- if (rp1->fr_blkid < rp2->fr_blkid)
- return (-1);
- else if (rp1->fr_blkid > rp2->fr_blkid)
- return (1);
- else return (0);
-}
-
-static void
-dnode_setdblksz(dnode_t *dn, int size)
-{
- ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
- 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
- dn->dn_datablksz = size;
- dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
- dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
-}
-
-static dnode_t *
-dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
- uint64_t object)
-{
- dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
-
- dn->dn_objset = os;
- dn->dn_object = object;
- dn->dn_dbuf = db;
- dn->dn_phys = dnp;
-
- if (dnp->dn_datablkszsec)
- dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_indblkshift = dnp->dn_indblkshift;
- dn->dn_nlevels = dnp->dn_nlevels;
- dn->dn_type = dnp->dn_type;
- dn->dn_nblkptr = dnp->dn_nblkptr;
- dn->dn_checksum = dnp->dn_checksum;
- dn->dn_compress = dnp->dn_compress;
- dn->dn_bonustype = dnp->dn_bonustype;
- dn->dn_bonuslen = dnp->dn_bonuslen;
- dn->dn_maxblkid = dnp->dn_maxblkid;
-
- dmu_zfetch_init(&dn->dn_zfetch, dn);
-
- ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
- mutex_enter(&os->os_lock);
- list_insert_head(&os->os_dnodes, dn);
- mutex_exit(&os->os_lock);
-
- return (dn);
-}
-
-static void
-dnode_destroy(dnode_t *dn)
-{
- objset_impl_t *os = dn->dn_objset;
-
-#ifdef ZFS_DEBUG
- int i;
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
- ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
- }
- ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
-
- mutex_enter(&os->os_lock);
- list_remove(&os->os_dnodes, dn);
- mutex_exit(&os->os_lock);
-
- if (dn->dn_dirtyctx_firstset) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
- dmu_zfetch_rele(&dn->dn_zfetch);
- if (dn->dn_bonus) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
- dn->dn_bonus = NULL;
- }
- kmem_cache_free(dnode_cache, dn);
-}
-
-void
-dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- int i;
-
- if (blocksize == 0)
- blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
- else
- blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
-
- if (ibs == 0)
- ibs = zfs_default_ibs;
-
- ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
-
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
-
- ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
- ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
- ASSERT(ot != DMU_OT_NONE);
- ASSERT3U(ot, <, DMU_OT_NUMTYPES);
- ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
- ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
- ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT3U(dn->dn_maxblkid, ==, 0);
- ASSERT3U(dn->dn_allocated_txg, ==, 0);
- ASSERT3U(dn->dn_assigned_txg, ==, 0);
- ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
- ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
- ASSERT3U(dn->dn_next_blksz[i], ==, 0);
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
- }
-
- dn->dn_type = ot;
- dnode_setdblksz(dn, blocksize);
- dn->dn_indblkshift = ibs;
- dn->dn_nlevels = 1;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
- dn->dn_bonustype = bonustype;
- dn->dn_bonuslen = bonuslen;
- dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
- dn->dn_compress = ZIO_COMPRESS_INHERIT;
- dn->dn_dirtyctx = 0;
-
- dn->dn_free_txg = 0;
- if (dn->dn_dirtyctx_firstset) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
-
- dn->dn_allocated_txg = tx->tx_txg;
-
- dnode_setdirty(dn, tx);
- dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
- dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
-}
-
-void
-dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
- int i;
- dmu_buf_impl_t *db = NULL;
-
- ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- ASSERT(tx->tx_txg != 0);
- ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
- (bonustype != DMU_OT_NONE && bonuslen != 0));
- ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
-
- for (i = 0; i < TXG_SIZE; i++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-
- /* clean up any unreferenced dbufs */
- (void) dnode_evict_dbufs(dn, 0);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- /*
- * XXX I should really have a generation number to tell if we
- * need to do this...
- */
- if (blocksize != dn->dn_datablksz ||
- dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
- /* free all old data */
- dnode_free_range(dn, 0, -1ULL, tx);
- }
-
- /* change blocksize */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (blocksize != dn->dn_datablksz &&
- (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
- list_head(&dn->dn_dbufs) != NULL)) {
- db = dbuf_hold(dn, 0, FTAG);
- dbuf_new_size(db, blocksize, tx);
- }
- dnode_setdblksz(dn, blocksize);
- dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
- rw_exit(&dn->dn_struct_rwlock);
- if (db) {
- dbuf_rele(db, FTAG);
- db = NULL;
- }
-
- /* change type */
- dn->dn_type = ot;
-
- if (dn->dn_bonuslen != bonuslen) {
- /* change bonus size */
- if (bonuslen == 0)
- bonuslen = 1; /* XXX */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- if (refcount_add(&db->db_holds, FTAG) == 1)
- dnode_add_ref(dn, db);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT(db->db.db_data != NULL);
- db->db.db_size = bonuslen;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- }
-
- /* change bonus size and type */
- mutex_enter(&dn->dn_mtx);
- dn->dn_bonustype = bonustype;
- dn->dn_bonuslen = bonuslen;
- dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
- dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
- dn->dn_compress = ZIO_COMPRESS_INHERIT;
- ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-
- /*
- * NB: we have to do the dbuf_rele after we've changed the
- * dn_bonuslen, for the sake of dbuf_verify().
- */
- if (db)
- dbuf_rele(db, FTAG);
-
- dn->dn_allocated_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-}
-
-void
-dnode_special_close(dnode_t *dn)
-{
- /*
- * Wait for final references to the dnode to clear. This can
- * only happen if the arc is asynchronously evicting state that
- * has a hold on this dnode while we are trying to evict this
- * dnode.
- */
- while (refcount_count(&dn->dn_holds) > 0)
- delay(1);
- dnode_destroy(dn);
-}
-
-dnode_t *
-dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
-{
- dnode_t *dn = dnode_create(os, dnp, NULL, object);
- DNODE_VERIFY(dn);
- return (dn);
-}
-
-static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
-{
- dnode_t **children_dnodes = arg;
- int i;
- int epb = db->db_size >> DNODE_SHIFT;
-
- for (i = 0; i < epb; i++) {
- dnode_t *dn = children_dnodes[i];
- int n;
-
- if (dn == NULL)
- continue;
-#ifdef ZFS_DEBUG
- /*
- * If there are holds on this dnode, then there should
- * be holds on the dnode's containing dbuf as well; thus
- * it wouldn't be eligible for eviction and this function
- * would not have been called.
- */
- ASSERT(refcount_is_zero(&dn->dn_holds));
- ASSERT(list_head(&dn->dn_dbufs) == NULL);
- ASSERT(refcount_is_zero(&dn->dn_tx_holds));
-
- for (n = 0; n < TXG_SIZE; n++)
- ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
- children_dnodes[i] = NULL;
- dnode_destroy(dn);
- }
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
-}
-
-/*
- * errors:
- * EINVAL - invalid object number.
- * EIO - i/o error.
- * succeeds even for free dnodes.
- */
-int
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
- void *tag, dnode_t **dnp)
-{
- int epb, idx, err;
- int drop_struct_lock = FALSE;
- int type;
- uint64_t blk;
- dnode_t *mdn, *dn;
- dmu_buf_impl_t *db;
- dnode_t **children_dnodes;
-
- if (object == 0 || object >= DN_MAX_OBJECT)
- return (EINVAL);
-
- mdn = os->os_meta_dnode;
-
- DNODE_VERIFY(mdn);
-
- if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
- rw_enter(&mdn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
-
- db = dbuf_hold(mdn, blk, FTAG);
- if (drop_struct_lock)
- rw_exit(&mdn->dn_struct_rwlock);
- if (db == NULL)
- return (EIO);
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
- if (err) {
- dbuf_rele(db, FTAG);
- return (err);
- }
-
- ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
- epb = db->db.db_size >> DNODE_SHIFT;
-
- idx = object & (epb-1);
-
- children_dnodes = dmu_buf_get_user(&db->db);
- if (children_dnodes == NULL) {
- dnode_t **winner;
- children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
- KM_SLEEP);
- if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
- dnode_buf_pageout)) {
- kmem_free(children_dnodes, epb * sizeof (dnode_t *));
- children_dnodes = winner;
- }
- }
-
- if ((dn = children_dnodes[idx]) == NULL) {
- dnode_t *winner;
- dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
- db, object);
- winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
- if (winner != NULL) {
- dnode_destroy(dn);
- dn = winner;
- }
- }
-
- mutex_enter(&dn->dn_mtx);
- type = dn->dn_type;
- if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
- mutex_exit(&dn->dn_mtx);
- dbuf_rele(db, FTAG);
- return (type == DMU_OT_NONE ? ENOENT : EEXIST);
- }
- mutex_exit(&dn->dn_mtx);
-
- if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dn);
-
- DNODE_VERIFY(dn);
- ASSERT3P(dn->dn_dbuf, ==, db);
- ASSERT3U(dn->dn_object, ==, object);
- dbuf_rele(db, FTAG);
-
- *dnp = dn;
- return (0);
-}
-
-/*
- * Hold an allocated object's dnode: 0 with *dnp set, or an errno on failure.
- */
-int
-dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
-{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
-}
-
-void
-dnode_add_ref(dnode_t *dn, void *tag)
-{
- ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, tag);
-}
-
-void
-dnode_rele(dnode_t *dn, void *tag)
-{
- uint64_t refs;
-
- refs = refcount_remove(&dn->dn_holds, tag);
- /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
- if (refs == 0 && dn->dn_dbuf)
- dbuf_rele(dn->dn_dbuf, dn);
-}
-
-void
-dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
-{
- objset_impl_t *os = dn->dn_objset;
- uint64_t txg = tx->tx_txg;
-
- if (dn->dn_object == DMU_META_DNODE_OBJECT)
- return;
-
- DNODE_VERIFY(dn);
-
-#ifdef ZFS_DEBUG
- mutex_enter(&dn->dn_mtx);
- ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
- /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
- mutex_exit(&dn->dn_mtx);
-#endif
-
- mutex_enter(&os->os_lock);
-
- /*
- * If we are already marked dirty, we're done.
- */
- if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
- mutex_exit(&os->os_lock);
- return;
- }
-
- ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
- ASSERT(dn->dn_datablksz != 0);
- ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
-
- dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
- dn->dn_object, txg);
-
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
- list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
- } else {
- list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
- }
-
- mutex_exit(&os->os_lock);
-
- /*
- * The dnode maintains a hold on its containing dbuf as
- * long as there are holds on it. Each instantiated child
- * dbuf maintains a hold on the dnode. When the last child
- * drops its hold, the dnode will drop its hold on the
- * containing dbuf. We add a "dirty hold" here so that the
- * dnode will hang around after we finish processing its
- * children.
- */
- dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
-
- (void) dbuf_dirty(dn->dn_dbuf, tx);
-
- dsl_dataset_dirty(os->os_dsl_dataset, tx);
-}
-
-void
-dnode_free(dnode_t *dn, dmu_tx_t *tx)
-{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
-
- /* we should be the only holder... hopefully */
- /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
- mutex_exit(&dn->dn_mtx);
- return;
- }
- dn->dn_free_txg = tx->tx_txg;
- mutex_exit(&dn->dn_mtx);
-
- /*
- * If the dnode is already dirty, it needs to be moved from
- * the dirty list to the free list.
- */
- mutex_enter(&dn->dn_objset->os_lock);
- if (list_link_active(&dn->dn_dirty_link[txgoff])) {
- list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
- list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
- mutex_exit(&dn->dn_objset->os_lock);
- } else {
- mutex_exit(&dn->dn_objset->os_lock);
- dnode_setdirty(dn, tx);
- }
-}
-
-/*
- * Try to change the block size for the indicated dnode. This can only
- * succeed if there are no blocks allocated or dirty beyond the first block.
- */
-int
-dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db, *db_next;
- int have_db0 = FALSE;
-
- if (size == 0)
- size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
- else
- size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
-
- if (ibs == dn->dn_indblkshift)
- ibs = 0;
-
- if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
- return (0);
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-
- /* Check for any allocated blocks beyond the first */
- if (dn->dn_phys->dn_maxblkid != 0)
- goto fail;
-
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
-
- if (db->db_blkid == 0) {
- have_db0 = TRUE;
- } else if (db->db_blkid != DB_BONUS_BLKID) {
- mutex_exit(&dn->dn_dbufs_mtx);
- goto fail;
- }
- }
- mutex_exit(&dn->dn_dbufs_mtx);
-
- if (ibs && dn->dn_nlevels != 1)
- goto fail;
-
- db = NULL;
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
- /* obtain the old block */
- db = dbuf_hold(dn, 0, FTAG);
- dbuf_new_size(db, size, tx);
- }
-
- dnode_setdblksz(dn, size);
- dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
- if (ibs) {
- dn->dn_indblkshift = ibs;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- }
-
- if (db)
- dbuf_rele(db, FTAG);
-
- rw_exit(&dn->dn_struct_rwlock);
- return (0);
-
-fail:
- rw_exit(&dn->dn_struct_rwlock);
- return (ENOTSUP);
-}
-
-void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
-{
- uint64_t txgoff = tx->tx_txg & TXG_MASK;
- int drop_struct_lock = FALSE;
- int epbs, new_nlevels;
- uint64_t sz;
-
- ASSERT(blkid != DB_BONUS_BLKID);
-
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- drop_struct_lock = TRUE;
- }
-
- if (blkid <= dn->dn_maxblkid)
- goto out;
-
- dn->dn_maxblkid = blkid;
-
- /*
- * Compute the number of levels necessary to support the new maxblkid.
- */
- new_nlevels = 1;
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (sz = dn->dn_nblkptr;
- sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
- new_nlevels++;
-
- if (new_nlevels > dn->dn_nlevels) {
- int old_nlevels = dn->dn_nlevels;
- dmu_buf_impl_t *db;
- list_t *list;
- dbuf_dirty_record_t *new, *dr, *dr_next;
-
- dn->dn_nlevels = new_nlevels;
-
- ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
- dn->dn_next_nlevels[txgoff] = new_nlevels;
-
- /* dirty the left indirects */
- db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- new = dbuf_dirty(db, tx);
- dbuf_rele(db, FTAG);
-
- /* transfer the dirty records to the new indirect */
- mutex_enter(&dn->dn_mtx);
- mutex_enter(&new->dt.di.dr_mtx);
- list = &dn->dn_dirty_records[txgoff];
- for (dr = list_head(list); dr; dr = dr_next) {
- dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
- list_remove(&dn->dn_dirty_records[txgoff], dr);
- list_insert_tail(&new->dt.di.dr_children, dr);
- dr->dr_parent = new;
- }
- }
- mutex_exit(&new->dt.di.dr_mtx);
- mutex_exit(&dn->dn_mtx);
- }
-
-out:
- if (drop_struct_lock)
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
- avl_index_t where;
- free_range_t *rp;
- free_range_t rp_tofind;
- uint64_t endblk = blkid + nblks;
-
- ASSERT(MUTEX_HELD(&dn->dn_mtx));
- ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- rp_tofind.fr_blkid = blkid;
- rp = avl_find(tree, &rp_tofind, &where);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_BEFORE);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_AFTER);
-
- while (rp && (rp->fr_blkid <= blkid + nblks)) {
- uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
- free_range_t *nrp = AVL_NEXT(tree, rp);
-
- if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
- /* clear this entire range */
- avl_remove(tree, rp);
- kmem_free(rp, sizeof (free_range_t));
- } else if (blkid <= rp->fr_blkid &&
- endblk > rp->fr_blkid && endblk < fr_endblk) {
- /* clear the beginning of this range */
- rp->fr_blkid = endblk;
- rp->fr_nblks = fr_endblk - endblk;
- } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
- endblk >= fr_endblk) {
- /* clear the end of this range */
- rp->fr_nblks = blkid - rp->fr_blkid;
- } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
- /* clear a chunk out of this range */
- free_range_t *new_rp =
- kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
- new_rp->fr_blkid = endblk;
- new_rp->fr_nblks = fr_endblk - endblk;
- avl_insert_here(tree, new_rp, rp, AVL_AFTER);
- rp->fr_nblks = blkid - rp->fr_blkid;
- }
- /* there may be no overlap */
- rp = nrp;
- }
-}
-
-void
-dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- uint64_t blkoff, blkid, nblks;
- int blksz, head;
- int trunc = FALSE;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- blksz = dn->dn_datablksz;
-
- /* If the range is past the end of the file, this is a no-op */
- if (off >= blksz * (dn->dn_maxblkid+1))
- goto out;
- if (len == -1ULL) {
- len = UINT64_MAX - off;
- trunc = TRUE;
- }
-
- /*
- * First, block align the region to free:
- */
- if (ISP2(blksz)) {
- head = P2NPHASE(off, blksz);
- blkoff = P2PHASE(off, blksz);
- } else {
- ASSERT(dn->dn_maxblkid == 0);
- if (off == 0 && len >= blksz) {
- /* Freeing the whole block; don't do any head. */
- head = 0;
- } else {
- /* Freeing part of the block. */
- head = blksz - off;
- ASSERT3U(head, >, 0);
- }
- blkoff = off;
- }
- /* zero out any partial block data at the start of the range */
- if (head) {
- ASSERT3U(blkoff + head, ==, blksz);
- if (len < head)
- head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
- caddr_t data;
-
- /* don't dirty if it isn't on disk and isn't dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- data = db->db.db_data;
- bzero(data + blkoff, head);
- }
- dbuf_rele(db, FTAG);
- }
- off += head;
- len -= head;
- }
-
- /* If the range was less than one block, we're done */
- if (len == 0 || off >= blksz * (dn->dn_maxblkid+1))
- goto out;
-
- if (!ISP2(blksz)) {
- /*
- * They are freeing the whole block of a
- * non-power-of-two blocksize file. Skip all the messy
- * math.
- */
- ASSERT3U(off, ==, 0);
- ASSERT3U(len, >=, blksz);
- blkid = 0;
- nblks = 1;
- } else {
- int tail;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- int blkshift = dn->dn_datablkshift;
-
- /* If the remaining range is past end of file, we're done */
- if (off > dn->dn_maxblkid << blkshift)
- goto out;
-
- if (off + len == UINT64_MAX)
- tail = 0;
- else
- tail = P2PHASE(len, blksz);
-
- ASSERT3U(P2PHASE(off, blksz), ==, 0);
- /* zero out any partial block data at the end of the range */
- if (tail) {
- if (len < tail)
- tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
- /* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr &&
- !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
- rw_enter(&dn->dn_struct_rwlock,
- RW_WRITER);
- bzero(db->db.db_data, tail);
- }
- dbuf_rele(db, FTAG);
- }
- len -= tail;
- }
- /* If the range did not include a full block, we are done */
- if (len == 0)
- goto out;
-
- /* dirty the left indirects */
- if (dn->dn_nlevels > 1 && off != 0) {
- db = dbuf_hold_level(dn, 1,
- (off - head) >> (blkshift + epbs), FTAG);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
-
- /* dirty the right indirects */
- if (dn->dn_nlevels > 1 && !trunc) {
- db = dbuf_hold_level(dn, 1,
- (off + len + tail - 1) >> (blkshift + epbs), FTAG);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
-
- /*
- * Finally, add this range to the dnode range list, we
- * will finish up this free operation in the syncing phase.
- */
- ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
- ASSERT(off + len == UINT64_MAX ||
- IS_P2ALIGNED(len, 1<<blkshift));
- blkid = off >> blkshift;
- nblks = len >> blkshift;
-
- if (trunc)
- dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
- }
-
- mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, blkid, nblks, tx);
- {
- free_range_t *rp, *found;
- avl_index_t where;
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
- /* Add new range to dn_ranges */
- rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
- rp->fr_blkid = blkid;
- rp->fr_nblks = nblks;
- found = avl_find(tree, rp, &where);
- ASSERT(found == NULL);
- avl_insert(tree, rp, where);
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- }
- mutex_exit(&dn->dn_mtx);
-
- dbuf_free_range(dn, blkid, nblks, tx);
- dnode_setdirty(dn, tx);
-out:
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
-uint64_t
-dnode_block_freed(dnode_t *dn, uint64_t blkid)
-{
- free_range_t range_tofind;
- void *dp = spa_get_dsl(dn->dn_objset->os_spa);
- int i;
-
- if (blkid == DB_BONUS_BLKID)
- return (FALSE);
-
- /*
- * If we're in the process of opening the pool, dp will not be
- * set yet, but there shouldn't be anything dirty.
- */
- if (dp == NULL)
- return (FALSE);
-
- if (dn->dn_free_txg)
- return (TRUE);
-
- /*
- * If dn_datablkshift is not set, then there's only a single
- * block, in which case there will never be a free range so it
- * won't matter.
- */
- range_tofind.fr_blkid = blkid;
- mutex_enter(&dn->dn_mtx);
- for (i = 0; i < TXG_SIZE; i++) {
- free_range_t *range_found;
- avl_index_t idx;
-
- range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
- if (range_found) {
- ASSERT(range_found->fr_nblks > 0);
- break;
- }
- range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
- if (range_found &&
- range_found->fr_blkid + range_found->fr_nblks > blkid)
- break;
- }
- mutex_exit(&dn->dn_mtx);
- return (i < TXG_SIZE);
-}
-
-/* call from syncing context when we actually write/free space for this dnode */
-void
-dnode_diduse_space(dnode_t *dn, int64_t delta)
-{
- uint64_t space;
- dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
- dn, dn->dn_phys,
- (u_longlong_t)dn->dn_phys->dn_used,
- (longlong_t)delta);
-
- mutex_enter(&dn->dn_mtx);
- space = DN_USED_BYTES(dn->dn_phys);
- if (delta > 0) {
- ASSERT3U(space + delta, >=, space); /* no overflow */
- } else {
- ASSERT3U(space, >=, -delta); /* no underflow */
- }
- space += delta;
- if (spa_version(dn->dn_objset->os_spa) < ZFS_VERSION_DNODE_BYTES) {
- ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
- ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
- dn->dn_phys->dn_used = space >> DEV_BSHIFT;
- } else {
- dn->dn_phys->dn_used = space;
- dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
- }
- mutex_exit(&dn->dn_mtx);
-}
-
-/*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
- */
-void
-dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
-{
- objset_impl_t *os = dn->dn_objset;
- dsl_dataset_t *ds = os->os_dsl_dataset;
-
- if (space > 0)
- space = spa_get_asize(os->os_spa, space);
-
- if (ds)
- dsl_dir_willuse_space(ds->ds_dir, space, tx);
-
- dmu_tx_willuse_space(tx, space);
-}
-
-static int
-dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
-{
- dmu_buf_impl_t *db = NULL;
- void *data = NULL;
- uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- uint64_t epb = 1ULL << epbs;
- uint64_t minfill, maxfill;
- int i, error, span;
-
- dprintf("probing object %llu offset %llx level %d of %u\n",
- dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
- if (lvl == dn->dn_phys->dn_nlevels) {
- error = 0;
- epb = dn->dn_phys->dn_nblkptr;
- data = dn->dn_phys->dn_blkptr;
- } else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
- if (error) {
- if (error == ENOENT)
- return (hole ? 0 : ESRCH);
- return (error);
- }
- error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
- if (error) {
- dbuf_rele(db, FTAG);
- return (error);
- }
- data = db->db.db_data;
- }
-
- if (db && txg &&
- (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
- error = ESRCH;
- } else if (lvl == 0) {
- dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
- ASSERT(dn->dn_type == DMU_OT_DNODE);
-
- for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
- boolean_t newcontents = B_TRUE;
- if (txg) {
- int j;
- newcontents = B_FALSE;
- for (j = 0; j < dnp[i].dn_nblkptr; j++) {
- if (dnp[i].dn_blkptr[j].blk_birth > txg)
- newcontents = B_TRUE;
- }
- }
- if (!dnp[i].dn_type == hole && newcontents)
- break;
- *offset += 1ULL << span;
- }
- if (i == blkfill)
- error = ESRCH;
- } else {
- blkptr_t *bp = data;
- span = (lvl - 1) * epbs + dn->dn_datablkshift;
- minfill = 0;
- maxfill = blkfill << ((lvl - 1) * epbs);
-
- if (hole)
- maxfill--;
- else
- minfill++;
-
- for (i = (*offset >> span) & ((1ULL << epbs) - 1);
- i < epb; i++) {
- if (bp[i].blk_fill >= minfill &&
- bp[i].blk_fill <= maxfill &&
- bp[i].blk_birth > txg)
- break;
- *offset += 1ULL << span;
- }
- if (i >= epb)
- error = ESRCH;
- }
-
- if (db)
- dbuf_rele(db, FTAG);
-
- return (error);
-}
-
-/*
- * Find the next hole, data, or sparse region at or after *offset.
- * The value 'blkfill' tells us how many items we expect to find
- * in an L0 data block; this value is 1 for normal objects,
- * DNODES_PER_BLOCK for the meta dnode, and some fraction of
- * DNODES_PER_BLOCK when searching for sparse regions thereof.
- *
- * Examples:
- *
- * dnode_next_offset(dn, hole, offset, 1, 1, 0);
- * Finds the next hole/data in a file.
- * Used in dmu_offset_next().
- *
- * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK, txg);
- * Finds the next free/allocated dnode an objset's meta-dnode.
- * Only finds objects that have new contents since txg (ie.
- * bonus buffer changes and content removal are ignored).
- * Used in dmu_object_next().
- *
- * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
- * Finds the next L2 meta-dnode bp that's at most 1/4 full.
- * Used in dmu_object_alloc().
- */
-int
-dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
- int minlvl, uint64_t blkfill, uint64_t txg)
-{
- int lvl, maxlvl;
- int error = 0;
- uint64_t initial_offset = *offset;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
-
- if (dn->dn_phys->dn_nlevels == 0) {
- rw_exit(&dn->dn_struct_rwlock);
- return (ESRCH);
- }
-
- if (dn->dn_datablkshift == 0) {
- if (*offset < dn->dn_datablksz) {
- if (hole)
- *offset = dn->dn_datablksz;
- } else {
- error = ESRCH;
- }
- rw_exit(&dn->dn_struct_rwlock);
- return (error);
- }
-
- maxlvl = dn->dn_phys->dn_nlevels;
-
- for (lvl = minlvl; lvl <= maxlvl; lvl++) {
- error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
- if (error != ESRCH)
- break;
- }
-
- while (--lvl >= minlvl && error == 0) {
- error = dnode_next_offset_level(dn,
- hole, offset, lvl, blkfill, txg);
- }
-
- rw_exit(&dn->dn_struct_rwlock);
-
- if (error == 0 && initial_offset > *offset)
- error = ESRCH;
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
deleted file mode 100644
index 9e8c7ad..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ /dev/null
@@ -1,623 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/dbuf.h>
-#include <sys/dnode.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-
-static void
-dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db;
- int txgoff = tx->tx_txg & TXG_MASK;
- int nblkptr = dn->dn_phys->dn_nblkptr;
- int old_toplvl = dn->dn_phys->dn_nlevels - 1;
- int new_level = dn->dn_next_nlevels[txgoff];
- int i;
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-
- /* this dnode can't be paged out because it's dirty */
- ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
- ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
-
- db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
- ASSERT(db != NULL);
-
- dn->dn_phys->dn_nlevels = new_level;
- dprintf("os=%p obj=%llu, increase to %d\n",
- dn->dn_objset, dn->dn_object,
- dn->dn_phys->dn_nlevels);
-
- /* check for existing blkptrs in the dnode */
- for (i = 0; i < nblkptr; i++)
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
- break;
- if (i != nblkptr) {
- /* transfer dnode's block pointers to new indirect block */
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
- ASSERT(db->db.db_data);
- ASSERT(arc_released(db->db_buf));
- ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
- bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
- sizeof (blkptr_t) * nblkptr);
- arc_buf_freeze(db->db_buf);
- }
-
- /* set dbuf's parent pointers to new indirect buf */
- for (i = 0; i < nblkptr; i++) {
- dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
-
- if (child == NULL)
- continue;
- ASSERT3P(child->db_dnode, ==, dn);
- if (child->db_parent && child->db_parent != dn->dn_dbuf) {
- ASSERT(child->db_parent->db_level == db->db_level);
- ASSERT(child->db_blkptr !=
- &dn->dn_phys->dn_blkptr[child->db_blkid]);
- mutex_exit(&child->db_mtx);
- continue;
- }
- ASSERT(child->db_parent == NULL ||
- child->db_parent == dn->dn_dbuf);
-
- child->db_parent = db;
- dbuf_add_ref(db, child);
- if (db->db.db_data)
- child->db_blkptr = (blkptr_t *)db->db.db_data + i;
- else
- child->db_blkptr = NULL;
- dprintf_dbuf_bp(child, child->db_blkptr,
- "changed db_blkptr to new indirect %s", "");
-
- mutex_exit(&child->db_mtx);
- }
-
- bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
-
- dbuf_rele(db, FTAG);
-
- rw_exit(&dn->dn_struct_rwlock);
-}
-
-static void
-free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
-{
- objset_impl_t *os = dn->dn_objset;
- uint64_t bytesfreed = 0;
- int i;
-
- dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
-
- for (i = 0; i < num; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
-
- bytesfreed += bp_get_dasize(os->os_spa, bp);
- ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
- dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
- bzero(bp, sizeof (blkptr_t));
- }
- dnode_diduse_space(dn, -bytesfreed);
-}
-
-#ifdef ZFS_DEBUG
-static void
-free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
-{
- int off, num;
- int i, err, epbs;
- uint64_t txg = tx->tx_txg;
-
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- off = start - (db->db_blkid * 1<<epbs);
- num = end - start + 1;
-
- ASSERT3U(off, >=, 0);
- ASSERT3U(num, >=, 0);
- ASSERT3U(db->db_level, >, 0);
- ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
- ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
- ASSERT(db->db_blkptr != NULL);
-
- for (i = off; i < off+num; i++) {
- uint64_t *buf;
- dmu_buf_impl_t *child;
- dbuf_dirty_record_t *dr;
- int j;
-
- ASSERT(db->db_level == 1);
-
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
- rw_exit(&db->db_dnode->dn_struct_rwlock);
- if (err == ENOENT)
- continue;
- ASSERT(err == 0);
- ASSERT(child->db_level == 0);
- dr = child->db_last_dirty;
- while (dr && dr->dr_txg > txg)
- dr = dr->dr_next;
- ASSERT(dr == NULL || dr->dr_txg == txg);
-
- /* data_old better be zeroed */
- if (dr) {
- buf = dr->dt.dl.dr_data->b_data;
- for (j = 0; j < child->db.db_size >> 3; j++) {
- if (buf[j] != 0) {
- panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
- }
- }
- }
-
- /*
- * db_data better be zeroed unless it's dirty in a
- * future txg.
- */
- mutex_enter(&child->db_mtx);
- buf = child->db.db_data;
- if (buf != NULL && child->db_state != DB_FILL &&
- child->db_last_dirty == NULL) {
- for (j = 0; j < child->db.db_size >> 3; j++) {
- if (buf[j] != 0) {
- panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- child, i, off, num);
- }
- }
- }
- mutex_exit(&child->db_mtx);
-
- dbuf_rele(child, FTAG);
- }
-}
-#endif
-
-static int
-free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
- dmu_tx_t *tx)
-{
- dnode_t *dn = db->db_dnode;
- blkptr_t *bp;
- dmu_buf_impl_t *subdb;
- uint64_t start, end, dbstart, dbend, i;
- int epbs, shift, err;
- int all = TRUE;
-
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
- arc_release(db->db_buf, db);
- bp = (blkptr_t *)db->db.db_data;
-
- epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- shift = (db->db_level - 1) * epbs;
- dbstart = db->db_blkid << epbs;
- start = blkid >> shift;
- if (dbstart < start) {
- bp += start - dbstart;
- all = FALSE;
- } else {
- start = dbstart;
- }
- dbend = ((db->db_blkid + 1) << epbs) - 1;
- end = (blkid + nblks - 1) >> shift;
- if (dbend <= end)
- end = dbend;
- else if (all)
- all = trunc;
- ASSERT3U(start, <=, end);
-
- if (db->db_level == 1) {
- FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
- arc_buf_freeze(db->db_buf);
- ASSERT(all || db->db_last_dirty);
- return (all);
- }
-
- for (i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
- ASSERT3U(err, ==, 0);
- rw_exit(&dn->dn_struct_rwlock);
-
- if (free_children(subdb, blkid, nblks, trunc, tx)) {
- ASSERT3P(subdb->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
- } else {
- all = FALSE;
- }
- dbuf_rele(subdb, FTAG);
- }
- arc_buf_freeze(db->db_buf);
-#ifdef ZFS_DEBUG
- bp -= (end-start)+1;
- for (i = start; i <= end; i++, bp++) {
- if (i == start && blkid != 0)
- continue;
- else if (i == end && !trunc)
- continue;
- ASSERT3U(bp->blk_birth, ==, 0);
- }
-#endif
- ASSERT(all || db->db_last_dirty);
- return (all);
-}
-
-/*
- * free_range: Traverse the indicated range of the provided file
- * and "free" all the blocks contained there.
- */
-static void
-dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- blkptr_t *bp = dn->dn_phys->dn_blkptr;
- dmu_buf_impl_t *db;
- int trunc, start, end, shift, i, err;
- int dnlevel = dn->dn_phys->dn_nlevels;
-
- if (blkid > dn->dn_phys->dn_maxblkid)
- return;
-
- ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
- trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
- if (trunc)
- nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
-
- /* There are no indirect blocks in the object */
- if (dnlevel == 1) {
- if (blkid >= dn->dn_phys->dn_nblkptr) {
- /* this range was never made persistent */
- return;
- }
- ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- free_blocks(dn, bp + blkid, nblks, tx);
- if (trunc) {
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off,
- 1, 1, 0) != 0);
- }
- return;
- }
-
- shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
- start = blkid >> shift;
- ASSERT(start < dn->dn_phys->dn_nblkptr);
- end = (blkid + nblks - 1) >> shift;
- bp += start;
- for (i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
- ASSERT3U(err, ==, 0);
- rw_exit(&dn->dn_struct_rwlock);
-
- if (free_children(db, blkid, nblks, trunc, tx)) {
- ASSERT3P(db->db_blkptr, ==, bp);
- free_blocks(dn, bp, 1, tx);
- }
- dbuf_rele(db, FTAG);
- }
- if (trunc) {
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
- }
-}
-
-/*
- * Try to kick all the dnodes dbufs out of the cache...
- */
-int
-dnode_evict_dbufs(dnode_t *dn, int try)
-{
- int progress;
- int pass = 0;
-
- do {
- dmu_buf_impl_t *db, marker;
- int evicting = FALSE;
-
- progress = FALSE;
- mutex_enter(&dn->dn_dbufs_mtx);
- list_insert_tail(&dn->dn_dbufs, &marker);
- db = list_head(&dn->dn_dbufs);
- for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
- list_remove(&dn->dn_dbufs, db);
- list_insert_tail(&dn->dn_dbufs, db);
-
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_EVICTING) {
- progress = TRUE;
- evicting = TRUE;
- mutex_exit(&db->db_mtx);
- } else if (refcount_is_zero(&db->db_holds)) {
- progress = TRUE;
- ASSERT(!arc_released(db->db_buf));
- dbuf_clear(db); /* exits db_mtx for us */
- } else {
- mutex_exit(&db->db_mtx);
- }
-
- }
- list_remove(&dn->dn_dbufs, &marker);
- /*
- * NB: we need to drop dn_dbufs_mtx between passes so
- * that any DB_EVICTING dbufs can make progress.
- * Ideally, we would have some cv we could wait on, but
- * since we don't, just wait a bit to give the other
- * thread a chance to run.
- */
- mutex_exit(&dn->dn_dbufs_mtx);
- if (evicting)
- delay(1);
- pass++;
- ASSERT(pass < 100); /* sanity check */
- } while (progress);
-
- /*
- * This function works fine even if it can't evict everything.
- * If were only asked to try to evict everything then
- * return an error if we can't. Otherwise panic as the caller
- * expects total eviction.
- */
- if (list_head(&dn->dn_dbufs) != NULL) {
- if (try) {
- return (1);
- } else {
- panic("dangling dbufs (dn=%p, dbuf=%p)\n",
- dn, list_head(&dn->dn_dbufs));
- }
- }
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
- dn->dn_bonus = NULL;
- }
- rw_exit(&dn->dn_struct_rwlock);
- return (0);
-}
-
-static void
-dnode_undirty_dbufs(list_t *list)
-{
- dbuf_dirty_record_t *dr;
-
- while (dr = list_head(list)) {
- dmu_buf_impl_t *db = dr->dr_dbuf;
- uint64_t txg = dr->dr_txg;
-
- mutex_enter(&db->db_mtx);
- /* XXX - use dbuf_undirty()? */
- list_remove(list, dr);
- ASSERT(db->db_last_dirty == dr);
- db->db_last_dirty = NULL;
- db->db_dirtycnt -= 1;
- if (db->db_level == 0) {
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- dr->dt.dl.dr_data == db->db_buf);
- dbuf_unoverride(dr);
- mutex_exit(&db->db_mtx);
- } else {
- mutex_exit(&db->db_mtx);
- dnode_undirty_dbufs(&dr->dt.di.dr_children);
- list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
- }
- kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele(db, (void *)(uintptr_t)txg);
- }
-}
-
-static void
-dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
-{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- (void) dnode_evict_dbufs(dn, 0);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
- /*
- * XXX - It would be nice to assert this, but we may still
- * have residual holds from async evictions from the arc...
- *
- * zfs_obj_to_path() also depends on this being
- * commented out.
- *
- * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
- */
-
- /* Undirty next bits */
- dn->dn_next_nlevels[txgoff] = 0;
- dn->dn_next_indblkshift[txgoff] = 0;
- dn->dn_next_blksz[txgoff] = 0;
-
- /* free up all the blocks in the file. */
- dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
- ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
-
- /* ASSERT(blkptrs are zero); */
- ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(dn->dn_type != DMU_OT_NONE);
-
- ASSERT(dn->dn_free_txg > 0);
- if (dn->dn_allocated_txg != dn->dn_free_txg)
- dbuf_will_dirty(dn->dn_dbuf, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t));
-
- mutex_enter(&dn->dn_mtx);
- dn->dn_type = DMU_OT_NONE;
- dn->dn_maxblkid = 0;
- dn->dn_allocated_txg = 0;
- mutex_exit(&dn->dn_mtx);
-
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
-
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- /*
- * Now that we've released our hold, the dnode may
- * be evicted, so we musn't access it.
- */
-}
-
-/*
- * Write out the dnode's dirty buffers.
- *
- * NOTE: The dnode is kept in memory by being dirty. Once the
- * dirty bit is cleared, it may be evicted. Beware of this!
- */
-void
-dnode_sync(dnode_t *dn, dmu_tx_t *tx)
-{
- free_range_t *rp;
- dnode_phys_t *dnp = dn->dn_phys;
- int txgoff = tx->tx_txg & TXG_MASK;
- list_t *list = &dn->dn_dirty_records[txgoff];
-
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
- DNODE_VERIFY(dn);
-
- ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_allocated_txg == tx->tx_txg) {
- /* The dnode is newly allocated or reallocated */
- if (dnp->dn_type == DMU_OT_NONE) {
- /* this is a first alloc, not a realloc */
- /* XXX shouldn't the phys already be zeroed? */
- bzero(dnp, DNODE_CORE_SIZE);
- dnp->dn_nlevels = 1;
- }
-
- if (dn->dn_nblkptr > dnp->dn_nblkptr) {
- /* zero the new blkptrs we are gaining */
- bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
- sizeof (blkptr_t) *
- (dn->dn_nblkptr - dnp->dn_nblkptr));
- }
- dnp->dn_type = dn->dn_type;
- dnp->dn_bonustype = dn->dn_bonustype;
- dnp->dn_bonuslen = dn->dn_bonuslen;
- dnp->dn_nblkptr = dn->dn_nblkptr;
- }
-
- ASSERT(dnp->dn_nlevels > 1 ||
- BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-
- if (dn->dn_next_blksz[txgoff]) {
- ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
- SPA_MINBLOCKSIZE) == 0);
- ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- list_head(list) != NULL ||
- dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
- dnp->dn_datablkszsec);
- dnp->dn_datablkszsec =
- dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
- dn->dn_next_blksz[txgoff] = 0;
- }
-
- if (dn->dn_next_indblkshift[txgoff]) {
- ASSERT(dnp->dn_nlevels == 1);
- dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
- dn->dn_next_indblkshift[txgoff] = 0;
- }
-
- /*
- * Just take the live (open-context) values for checksum and compress.
- * Strictly speaking it's a future leak, but nothing bad happens if we
- * start using the new checksum or compress algorithm a little early.
- */
- dnp->dn_checksum = dn->dn_checksum;
- dnp->dn_compress = dn->dn_compress;
-
- mutex_exit(&dn->dn_mtx);
-
- /* process all the "freed" ranges in the file */
- if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
- for (rp = avl_last(&dn->dn_ranges[txgoff]); rp != NULL;
- rp = AVL_PREV(&dn->dn_ranges[txgoff], rp))
- dnode_sync_free_range(dn,
- rp->fr_blkid, rp->fr_nblks, tx);
- }
- mutex_enter(&dn->dn_mtx);
- for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
- free_range_t *last = rp;
- rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
- avl_remove(&dn->dn_ranges[txgoff], last);
- kmem_free(last, sizeof (free_range_t));
- }
- mutex_exit(&dn->dn_mtx);
-
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
- dnode_sync_free(dn, tx);
- return;
- }
-
- if (dn->dn_next_nlevels[txgoff]) {
- dnode_increase_indirection(dn, tx);
- dn->dn_next_nlevels[txgoff] = 0;
- }
-
- dbuf_sync_list(list, tx);
-
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- ASSERT3P(list_head(list), ==, NULL);
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- }
-
- /*
- * Although we have dropped our reference to the dnode, it
- * can't be evicted until its written, and we haven't yet
- * initiated the IO for the dnode's dbuf.
- */
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
deleted file mode 100644
index 7d4689f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
+++ /dev/null
@@ -1,2035 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dmu_tx.h>
-#include <sys/arc.h>
-#include <sys/zio.h>
-#include <sys/zap.h>
-#include <sys/unique.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_ioctl.h>
-
-static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
-static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_checkfunc_t dsl_dataset_rollback_check;
-static dsl_syncfunc_t dsl_dataset_rollback_sync;
-static dsl_checkfunc_t dsl_dataset_destroy_check;
-static dsl_syncfunc_t dsl_dataset_destroy_sync;
-
-#define DS_REF_MAX (1ULL << 62)
-
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
-
-/*
- * We use weighted reference counts to express the various forms of exclusion
- * between different open modes. A STANDARD open is 1 point, an EXCLUSIVE open
- * is DS_REF_MAX, and a PRIMARY open is little more than half of an EXCLUSIVE.
- * This makes the exclusion logic simple: the total refcnt for all opens cannot
- * exceed DS_REF_MAX. For example, EXCLUSIVE opens are exclusive because their
- * weight (DS_REF_MAX) consumes the entire refcnt space. PRIMARY opens consume
- * just over half of the refcnt space, so there can't be more than one, but it
- * can peacefully coexist with any number of STANDARD opens.
- */
-static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
- 0, /* DS_MODE_NONE - invalid */
- 1, /* DS_MODE_STANDARD - unlimited number */
- (DS_REF_MAX >> 1) + 1, /* DS_MODE_PRIMARY - only one of these */
- DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */
-};
-
-
-void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- dprintf_bp(bp, "born, ds=%p\n", ds);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* It could have been compressed away to nothing */
- if (BP_IS_HOLE(bp))
- return;
- ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
- ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
- if (ds == NULL) {
- /*
- * Account for the meta-objset space in its placeholder
- * dsl_dir.
- */
- ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
- used, compressed, uncompressed, tx);
- dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
- }
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- mutex_enter(&ds->ds_lock);
- ds->ds_phys->ds_used_bytes += used;
- ds->ds_phys->ds_compressed_bytes += compressed;
- ds->ds_phys->ds_uncompressed_bytes += uncompressed;
- ds->ds_phys->ds_unique_bytes += used;
- mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- used, compressed, uncompressed, tx);
-}
-
-void
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx)
-{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- ASSERT(dmu_tx_is_syncing(tx));
- /* No block pointer => nothing to free */
- if (BP_IS_HOLE(bp))
- return;
-
- ASSERT(used > 0);
- if (ds == NULL) {
- int err;
- /*
- * Account for the meta-objset space in its placeholder
- * dataset.
- */
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
- ASSERT(err == 0);
-
- dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
- -used, -compressed, -uncompressed, tx);
- dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
- return;
- }
- ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
- int err;
-
- dprintf_bp(bp, "freeing: %s", "");
- err = arc_free(pio, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
- ASSERT(err == 0);
-
- mutex_enter(&ds->ds_lock);
- /* XXX unique_bytes is not accurate for head datasets */
- /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
- ds->ds_phys->ds_unique_bytes -= used;
- mutex_exit(&ds->ds_lock);
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
- } else {
- dprintf_bp(bp, "putting on dead list: %s", "");
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
- /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
- ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
- ds->ds_object && bp->blk_birth >
- ds->ds_prev->ds_phys->ds_prev_snap_txg) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- mutex_enter(&ds->ds_prev->ds_lock);
- ds->ds_prev->ds_phys->ds_unique_bytes +=
- used;
- mutex_exit(&ds->ds_prev->ds_lock);
- }
- }
- }
- mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
- ds->ds_phys->ds_used_bytes -= used;
- ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
- ds->ds_phys->ds_compressed_bytes -= compressed;
- ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
- ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
- mutex_exit(&ds->ds_lock);
-}
-
-uint64_t
-dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
-{
- uint64_t trysnap = 0;
-
- if (ds == NULL)
- return (0);
- /*
- * The snapshot creation could fail, but that would cause an
- * incorrect FALSE return, which would only result in an
- * overestimation of the amount of space that an operation would
- * consume, which is OK.
- *
- * There's also a small window where we could miss a pending
- * snapshot, because we could set the sync task in the quiescing
- * phase. So this should only be used as a guess.
- */
- if (ds->ds_trysnap_txg >
- spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
- trysnap = ds->ds_trysnap_txg;
- return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
-}
-
-int
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
-{
- return (blk_birth > dsl_dataset_prev_snap_txg(ds));
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_evict(dmu_buf_t *db, void *dsv)
-{
- dsl_dataset_t *ds = dsv;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
- /* open_refcount == DS_REF_MAX when deleting */
- ASSERT(ds->ds_open_refcount == 0 ||
- ds->ds_open_refcount == DS_REF_MAX);
-
- dprintf_ds(ds, "evicting %s\n", "");
-
- unique_remove(ds->ds_phys->ds_fsid_guid);
-
- if (ds->ds_user_ptr != NULL)
- ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-
- if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- ds->ds_prev = NULL;
- }
-
- bplist_close(&ds->ds_deadlist);
- dsl_dir_close(ds->ds_dir, ds);
-
- if (list_link_active(&ds->ds_synced_link))
- list_remove(&dp->dp_synced_objsets, ds);
-
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
-
- kmem_free(ds, sizeof (dsl_dataset_t));
-}
-
-static int
-dsl_dataset_get_snapname(dsl_dataset_t *ds)
-{
- dsl_dataset_phys_t *headphys;
- int err;
- dmu_buf_t *headdbuf;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
-
- if (ds->ds_snapname[0])
- return (0);
- if (ds->ds_phys->ds_next_snap_obj == 0)
- return (0);
-
- err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
- FTAG, &headdbuf);
- if (err)
- return (err);
- headphys = headdbuf->db_data;
- err = zap_value_search(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
- dmu_buf_rele(headdbuf, FTAG);
- return (err);
-}
-
-int
-dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
- int mode, void *tag, dsl_dataset_t **dsp)
-{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
- objset_t *mos = dp->dp_meta_objset;
- dmu_buf_t *dbuf;
- dsl_dataset_t *ds;
- int err;
-
- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
- dsl_pool_sync_context(dp));
-
- err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
- if (err)
- return (err);
- ds = dmu_buf_get_user(dbuf);
- if (ds == NULL) {
- dsl_dataset_t *winner;
-
- ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
- ds->ds_dbuf = dbuf;
- ds->ds_object = dsobj;
- ds->ds_phys = dbuf->db_data;
-
- mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
- NULL);
-
- err = bplist_open(&ds->ds_deadlist,
- mos, ds->ds_phys->ds_deadlist_obj);
- if (err == 0) {
- err = dsl_dir_open_obj(dp,
- ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
- }
- if (err) {
- /*
- * we don't really need to close the blist if we
- * just opened it.
- */
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
- kmem_free(ds, sizeof (dsl_dataset_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
-
- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
- ds->ds_snapname[0] = '\0';
- if (ds->ds_phys->ds_prev_snap_obj) {
- err = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds, &ds->ds_prev);
- }
- } else {
- if (snapname) {
-#ifdef ZFS_DEBUG
- dsl_dataset_phys_t *headphys;
- dmu_buf_t *headdbuf;
- err = dmu_bonus_hold(mos,
- ds->ds_dir->dd_phys->dd_head_dataset_obj,
- FTAG, &headdbuf);
- if (err == 0) {
- headphys = headdbuf->db_data;
- uint64_t foundobj;
- err = zap_lookup(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj,
- snapname, sizeof (foundobj), 1,
- &foundobj);
- ASSERT3U(foundobj, ==, dsobj);
- dmu_buf_rele(headdbuf, FTAG);
- }
-#endif
- (void) strcat(ds->ds_snapname, snapname);
- } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- err = dsl_dataset_get_snapname(ds);
- }
- }
-
- if (err == 0) {
- winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
- dsl_dataset_evict);
- }
- if (err || winner) {
- bplist_close(&ds->ds_deadlist);
- if (ds->ds_prev) {
- dsl_dataset_close(ds->ds_prev,
- DS_MODE_NONE, ds);
- }
- dsl_dir_close(ds->ds_dir, ds);
- mutex_destroy(&ds->ds_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
- kmem_free(ds, sizeof (dsl_dataset_t));
- if (err) {
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- ds = winner;
- } else {
- uint64_t new =
- unique_insert(ds->ds_phys->ds_fsid_guid);
- if (new != ds->ds_phys->ds_fsid_guid) {
- /* XXX it won't necessarily be synced... */
- ds->ds_phys->ds_fsid_guid = new;
- }
- }
- }
- ASSERT3P(ds->ds_dbuf, ==, dbuf);
- ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-
- mutex_enter(&ds->ds_lock);
- if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
- (ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) &&
- !DS_MODE_IS_INCONSISTENT(mode)) ||
- (ds->ds_open_refcount + weight > DS_REF_MAX)) {
- mutex_exit(&ds->ds_lock);
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- return (EBUSY);
- }
- ds->ds_open_refcount += weight;
- mutex_exit(&ds->ds_lock);
-
- *dsp = ds;
- return (0);
-}
-
-int
-dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
- void *tag, dsl_dataset_t **dsp)
-{
- dsl_dir_t *dd;
- dsl_pool_t *dp;
- const char *tail;
- uint64_t obj;
- dsl_dataset_t *ds = NULL;
- int err = 0;
-
- err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
- if (err)
- return (err);
-
- dp = dd->dd_pool;
- obj = dd->dd_phys->dd_head_dataset_obj;
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- if (obj == 0) {
- /* A dataset with no associated objset */
- err = ENOENT;
- goto out;
- }
-
- if (tail != NULL) {
- objset_t *mos = dp->dp_meta_objset;
-
- err = dsl_dataset_open_obj(dp, obj, NULL,
- DS_MODE_NONE, tag, &ds);
- if (err)
- goto out;
- obj = ds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(ds, DS_MODE_NONE, tag);
- ds = NULL;
-
- if (tail[0] != '@') {
- err = ENOENT;
- goto out;
- }
- tail++;
-
- /* Look for a snapshot */
- if (!DS_MODE_IS_READONLY(mode)) {
- err = EROFS;
- goto out;
- }
- dprintf("looking for snapshot '%s'\n", tail);
- err = zap_lookup(mos, obj, tail, 8, 1, &obj);
- if (err)
- goto out;
- }
- err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
-
-out:
- rw_exit(&dp->dp_config_rwlock);
- dsl_dir_close(dd, FTAG);
-
- ASSERT3U((err == 0), ==, (ds != NULL));
- /* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
-
- *dsp = ds;
- return (err);
-}
-
-int
-dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
-{
- return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
-}
-
-void
-dsl_dataset_name(dsl_dataset_t *ds, char *name)
-{
- if (ds == NULL) {
- (void) strcpy(name, "mos");
- } else {
- dsl_dir_name(ds->ds_dir, name);
- VERIFY(0 == dsl_dataset_get_snapname(ds));
- if (ds->ds_snapname[0]) {
- (void) strcat(name, "@");
- if (!MUTEX_HELD(&ds->ds_lock)) {
- /*
- * We use a "recursive" mutex so that we
- * can call dprintf_ds() with ds_lock held.
- */
- mutex_enter(&ds->ds_lock);
- (void) strcat(name, ds->ds_snapname);
- mutex_exit(&ds->ds_lock);
- } else {
- (void) strcat(name, ds->ds_snapname);
- }
- }
- }
-}
-
-static int
-dsl_dataset_namelen(dsl_dataset_t *ds)
-{
- int result;
-
- if (ds == NULL) {
- result = 3; /* "mos" */
- } else {
- result = dsl_dir_namelen(ds->ds_dir);
- VERIFY(0 == dsl_dataset_get_snapname(ds));
- if (ds->ds_snapname[0]) {
- ++result; /* adding one for the @-sign */
- if (!MUTEX_HELD(&ds->ds_lock)) {
- /* see dsl_datset_name */
- mutex_enter(&ds->ds_lock);
- result += strlen(ds->ds_snapname);
- mutex_exit(&ds->ds_lock);
- } else {
- result += strlen(ds->ds_snapname);
- }
- }
- }
-
- return (result);
-}
-
-void
-dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
-{
- uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
- mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_open_refcount, >=, weight);
- ds->ds_open_refcount -= weight;
- dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
- mode, ds->ds_open_refcount);
- mutex_exit(&ds->ds_lock);
-
- dmu_buf_rele(ds->ds_dbuf, tag);
-}
-
-void
-dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
-{
- objset_t *mos = dp->dp_meta_objset;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- dsl_dataset_t *ds;
- uint64_t dsobj;
- dsl_dir_t *dd;
-
- dsl_dir_create_root(mos, ddobjp, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
- dsl_dir_close(dd, FTAG);
-
- VERIFY(0 ==
- dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
- (void) dmu_objset_create_impl(dp->dp_spa, ds,
- &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
-}
-
-uint64_t
-dsl_dataset_create_sync(dsl_dir_t *pdd,
- const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = pdd->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- uint64_t dsobj, ddobj;
- objset_t *mos = dp->dp_meta_objset;
- dsl_dir_t *dd;
-
- ASSERT(clone_parent == NULL || clone_parent->ds_dir->dd_pool == dp);
- ASSERT(clone_parent == NULL ||
- clone_parent->ds_phys->ds_num_children > 0);
- ASSERT(lastname[0] != '@');
- ASSERT(dmu_tx_is_syncing(tx));
-
- ddobj = dsl_dir_create_sync(pdd, lastname, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = dd->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_snapnames_zapobj =
- zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- if (clone_parent) {
- dsphys->ds_prev_snap_obj = clone_parent->ds_object;
- dsphys->ds_prev_snap_txg =
- clone_parent->ds_phys->ds_creation_txg;
- dsphys->ds_used_bytes =
- clone_parent->ds_phys->ds_used_bytes;
- dsphys->ds_compressed_bytes =
- clone_parent->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes =
- clone_parent->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
-
- dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
- clone_parent->ds_phys->ds_num_children++;
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
- }
- dmu_buf_rele(dbuf, FTAG);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
- dsl_dir_close(dd, FTAG);
-
- return (dsobj);
-}
-
-struct destroyarg {
- dsl_sync_task_group_t *dstg;
- char *snapname;
- char *failed;
-};
-
-static int
-dsl_snapshot_destroy_one(char *name, void *arg)
-{
- struct destroyarg *da = arg;
- dsl_dataset_t *ds;
- char *cp;
- int err;
-
- (void) strcat(name, "@");
- (void) strcat(name, da->snapname);
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- da->dstg, &ds);
- cp = strchr(name, '@');
- *cp = '\0';
- if (err == ENOENT)
- return (0);
- if (err) {
- (void) strcpy(da->failed, name);
- return (err);
- }
-
- dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, da->dstg, 0);
- return (0);
-}
-
-/*
- * Destroy 'snapname' in all descendants of 'fsname'.
- */
-#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
-int
-dsl_snapshots_destroy(char *fsname, char *snapname)
-{
- int err;
- struct destroyarg da;
- dsl_sync_task_t *dst;
- spa_t *spa;
- char *cp;
-
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
- if (err)
- return (err);
- da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
- da.snapname = snapname;
- da.failed = fsname;
-
- err = dmu_objset_find(fsname,
- dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
-
- if (err == 0)
- err = dsl_sync_task_group_wait(da.dstg);
-
- for (dst = list_head(&da.dstg->dstg_tasks); dst;
- dst = list_next(&da.dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
- if (dst->dst_err) {
- dsl_dataset_name(ds, fsname);
- cp = strchr(fsname, '@');
- *cp = '\0';
- }
- /*
- * If it was successful, destroy_sync would have
- * closed the ds
- */
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, da.dstg);
- }
-
- dsl_sync_task_group_destroy(da.dstg);
- spa_close(spa, FTAG);
- return (err);
-}
-
-int
-dsl_dataset_destroy(const char *name)
-{
- int err;
- dsl_sync_task_group_t *dstg;
- objset_t *os;
- dsl_dataset_t *ds;
- dsl_dir_t *dd;
- uint64_t obj;
-
- if (strchr(name, '@')) {
- /* Destroying a snapshot is simpler */
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err)
- return (err);
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
- ds, FTAG, 0);
- if (err)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
- }
-
- err = dmu_objset_open(name, DMU_OST_ANY,
- DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, &os);
- if (err)
- return (err);
- ds = os->os->os_dsl_dataset;
- dd = ds->ds_dir;
-
- /*
- * Check for errors and mark this ds as inconsistent, in
- * case we crash while freeing the objects.
- */
- err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
- dsl_dataset_destroy_begin_sync, ds, NULL, 0);
- if (err) {
- dmu_objset_close(os);
- return (err);
- }
-
- /*
- * remove the objects in open context, so that we won't
- * have too much to do in syncing context.
- */
- for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
- ds->ds_phys->ds_prev_snap_txg)) {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END);
- dmu_tx_hold_bonus(tx, obj);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- /*
- * Perhaps there is not enough disk
- * space. Just deal with it from
- * dsl_dataset_destroy_sync().
- */
- dmu_tx_abort(tx);
- continue;
- }
- VERIFY(0 == dmu_object_free(os, obj, tx));
- dmu_tx_commit(tx);
- }
- /* Make sure it's not dirty before we finish destroying it. */
- txg_wait_synced(dd->dd_pool, 0);
-
- dmu_objset_close(os);
- if (err != ESRCH)
- return (err);
-
- err = dsl_dataset_open(name,
- DS_MODE_EXCLUSIVE | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
- FTAG, &ds);
- if (err)
- return (err);
-
- err = dsl_dir_open(name, FTAG, &dd, NULL);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- return (err);
- }
-
- /*
- * Blow away the dsl_dir + head dataset.
- */
- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
- dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
- dsl_dataset_destroy_sync, ds, FTAG, 0);
- dsl_sync_task_create(dstg, dsl_dir_destroy_check,
- dsl_dir_destroy_sync, dd, FTAG, 0);
- err = dsl_sync_task_group_wait(dstg);
- dsl_sync_task_group_destroy(dstg);
- /* if it is successful, *destroy_sync will close the ds+dd */
- if (err) {
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- dsl_dir_close(dd, FTAG);
- }
- return (err);
-}
-
-int
-dsl_dataset_rollback(dsl_dataset_t *ds)
-{
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
- return (dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
- ds, NULL, 0));
-}
-
-void *
-dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func)
-{
- void *old;
-
- mutex_enter(&ds->ds_lock);
- old = ds->ds_user_ptr;
- if (old == NULL) {
- ds->ds_user_ptr = p;
- ds->ds_user_evict_func = func;
- }
- mutex_exit(&ds->ds_lock);
- return (old);
-}
-
-void *
-dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
-{
- return (ds->ds_user_ptr);
-}
-
-
-blkptr_t *
-dsl_dataset_get_blkptr(dsl_dataset_t *ds)
-{
- return (&ds->ds_phys->ds_bp);
-}
-
-void
-dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- /* If it's the meta-objset, set dp_meta_rootbp */
- if (ds == NULL) {
- tx->tx_pool->dp_meta_rootbp = *bp;
- } else {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_bp = *bp;
- }
-}
-
-spa_t *
-dsl_dataset_get_spa(dsl_dataset_t *ds)
-{
- return (ds->ds_dir->dd_pool->dp_spa);
-}
-
-void
-dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
- dsl_pool_t *dp;
-
- if (ds == NULL) /* this is the meta-objset */
- return;
-
- ASSERT(ds->ds_user_ptr != NULL);
-
- if (ds->ds_phys->ds_next_snap_obj != 0)
- panic("dirtying snapshot!");
-
- dp = ds->ds_dir->dd_pool;
-
- if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(ds->ds_dbuf, ds);
- }
-}
-
-struct killarg {
- uint64_t *usedp;
- uint64_t *compressedp;
- uint64_t *uncompressedp;
- zio_t *zio;
- dmu_tx_t *tx;
-};
-
-static int
-kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
-{
- struct killarg *ka = arg;
- blkptr_t *bp = &bc->bc_blkptr;
-
- ASSERT3U(bc->bc_errno, ==, 0);
-
- /*
- * Since this callback is not called concurrently, no lock is
- * needed on the accounting values.
- */
- *ka->usedp += bp_get_dasize(spa, bp);
- *ka->compressedp += BP_GET_PSIZE(bp);
- *ka->uncompressedp += BP_GET_UCSIZE(bp);
- /* XXX check for EIO? */
- (void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
- ARC_NOWAIT);
- return (0);
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /*
- * There must be a previous snapshot. I suppose we could roll
- * it back to being empty (and re-initialize the upper (ZPL)
- * layer). But for now there's no way to do this via the user
- * interface.
- */
- if (ds->ds_phys->ds_prev_snap_txg == 0)
- return (EINVAL);
-
- /*
- * This must not be a snapshot.
- */
- if (ds->ds_phys->ds_next_snap_obj != 0)
- return (EINVAL);
-
- /*
- * If we made changes this txg, traverse_dsl_dataset won't find
- * them. Try again.
- */
- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
- return (EAGAIN);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- /* Zero out the deadlist. */
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- {
- /* Free blkptrs that we gave birth to */
- zio_t *zio;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
- struct killarg ka;
-
- zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED);
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
- ka.zio = zio;
- ka.tx = tx;
- (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- ADVANCE_POST, kill_blkptr, &ka);
- (void) zio_wait(zio);
-
- dsl_dir_diduse_space(ds->ds_dir,
- -used, -compressed, -uncompressed, tx);
- }
-
- /* Change our contents to that of the prev snapshot */
- ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
- ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
- ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
- ds->ds_phys->ds_compressed_bytes =
- ds->ds_prev->ds_phys->ds_compressed_bytes;
- ds->ds_phys->ds_uncompressed_bytes =
- ds->ds_prev->ds_phys->ds_uncompressed_bytes;
- ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
- ds->ds_phys->ds_unique_bytes = 0;
-
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_unique_bytes = 0;
- }
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /*
- * Can't delete a head dataset if there are snapshots of it.
- * (Except if the only snapshots are from the branch we cloned
- * from.)
- */
- if (ds->ds_prev != NULL &&
- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
-
- return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /* Mark it as inconsistent on-disk, in case we crash */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
-
- /* Can't delete a branch point. */
- if (ds->ds_phys->ds_num_children > 1)
- return (EEXIST);
-
- /*
- * Can't delete a head dataset if there are snapshots of it.
- * (Except if the only snapshots are from the branch we cloned
- * from.)
- */
- if (ds->ds_prev != NULL &&
- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
- return (EINVAL);
-
- /*
- * If we made changes this txg, traverse_dsl_dataset won't find
- * them. Try again.
- */
- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
- return (EAGAIN);
-
- /* XXX we should do some i/o error checking... */
- return (0);
-}
-
-static void
-dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- uint64_t used = 0, compressed = 0, uncompressed = 0;
- zio_t *zio;
- int err;
- int after_branch_point = FALSE;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- dsl_dataset_t *ds_prev = NULL;
- uint64_t obj;
-
- ASSERT3U(ds->ds_open_refcount, ==, DS_REF_MAX);
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
- ASSERT(ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
- ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
-
- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
- obj = ds->ds_object;
-
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
- if (ds->ds_prev) {
- ds_prev = ds->ds_prev;
- } else {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_prev));
- }
- after_branch_point =
- (ds_prev->ds_phys->ds_next_snap_obj != obj);
-
- dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
- if (after_branch_point &&
- ds->ds_phys->ds_next_snap_obj == 0) {
- /* This clone is toast. */
- ASSERT(ds_prev->ds_phys->ds_num_children > 1);
- ds_prev->ds_phys->ds_num_children--;
- } else if (!after_branch_point) {
- ds_prev->ds_phys->ds_next_snap_obj =
- ds->ds_phys->ds_next_snap_obj;
- }
- }
-
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
- if (ds->ds_phys->ds_next_snap_obj != 0) {
- blkptr_t bp;
- dsl_dataset_t *ds_next;
- uint64_t itor = 0;
-
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_next));
- ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
-
- dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
- ds_next->ds_phys->ds_prev_snap_obj =
- ds->ds_phys->ds_prev_snap_obj;
- ds_next->ds_phys->ds_prev_snap_txg =
- ds->ds_phys->ds_prev_snap_txg;
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
- ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
-
- /*
- * Transfer to our deadlist (which will become next's
- * new deadlist) any entries from next's current
- * deadlist which were born before prev, and free the
- * other entries.
- *
- * XXX we're doing this long task with the config lock held
- */
- while (bplist_iterate(&ds_next->ds_deadlist, &itor,
- &bp) == 0) {
- if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
- &bp, tx));
- if (ds_prev && !after_branch_point &&
- bp.blk_birth >
- ds_prev->ds_phys->ds_prev_snap_txg) {
- ds_prev->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- } else {
- used += bp_get_dasize(dp->dp_spa, &bp);
- compressed += BP_GET_PSIZE(&bp);
- uncompressed += BP_GET_UCSIZE(&bp);
- /* XXX check return value? */
- (void) arc_free(zio, dp->dp_spa, tx->tx_txg,
- &bp, NULL, NULL, ARC_NOWAIT);
- }
- }
-
- /* free next's deadlist */
- bplist_close(&ds_next->ds_deadlist);
- bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
-
- /* set next's deadlist to our deadlist */
- ds_next->ds_phys->ds_deadlist_obj =
- ds->ds_phys->ds_deadlist_obj;
- VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj));
- ds->ds_phys->ds_deadlist_obj = 0;
-
- if (ds_next->ds_phys->ds_next_snap_obj != 0) {
- /*
- * Update next's unique to include blocks which
- * were previously shared by only this snapshot
- * and it. Those blocks will be born after the
- * prev snap and before this snap, and will have
- * died after the next snap and before the one
- * after that (ie. be on the snap after next's
- * deadlist).
- *
- * XXX we're doing this long task with the
- * config lock held
- */
- dsl_dataset_t *ds_after_next;
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds_next->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_after_next));
- itor = 0;
- while (bplist_iterate(&ds_after_next->ds_deadlist,
- &itor, &bp) == 0) {
- if (bp.blk_birth >
- ds->ds_phys->ds_prev_snap_txg &&
- bp.blk_birth <=
- ds->ds_phys->ds_creation_txg) {
- ds_next->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
- }
- }
-
- dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
- ASSERT3P(ds_next->ds_prev, ==, NULL);
- } else {
- /*
- * It would be nice to update the head dataset's
- * unique. To do so we would have to traverse
- * it for blocks born after ds_prev, which is
- * pretty expensive just to maintain something
- * for debugging purposes.
- */
- ASSERT3P(ds_next->ds_prev, ==, ds);
- dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
- ds_next);
- if (ds_prev) {
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds_next, &ds_next->ds_prev));
- } else {
- ds_next->ds_prev = NULL;
- }
- }
- dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
-
- /*
- * NB: unique_bytes is not accurate for head objsets
- * because we don't update it when we delete the most
- * recent snapshot -- see above comment.
- */
- ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
- } else {
- /*
- * There's no next snapshot, so this is a head dataset.
- * Destroy the deadlist. Unless it's a clone, the
- * deadlist should be empty. (If it's a clone, it's
- * safe to ignore the deadlist contents.)
- */
- struct killarg ka;
-
- ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
- ds->ds_phys->ds_deadlist_obj = 0;
-
- /*
- * Free everything that we point to (that's born after
- * the previous snapshot, if we are a clone)
- *
- * XXX we're doing this long task with the config lock held
- */
- ka.usedp = &used;
- ka.compressedp = &compressed;
- ka.uncompressedp = &uncompressed;
- ka.zio = zio;
- ka.tx = tx;
- err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
- ADVANCE_POST, kill_blkptr, &ka);
- ASSERT3U(err, ==, 0);
- }
-
- err = zio_wait(zio);
- ASSERT3U(err, ==, 0);
-
- dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx);
-
- if (ds->ds_phys->ds_snapnames_zapobj) {
- err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
- ASSERT(err == 0);
- }
-
- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
- /* Erase the link in the dataset */
- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
- /*
- * dsl_dir_sync_destroy() called us, they'll destroy
- * the dataset.
- */
- } else {
- /* remove from snapshot namespace */
- dsl_dataset_t *ds_head;
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, NULL,
- DS_MODE_NONE, FTAG, &ds_head));
- VERIFY(0 == dsl_dataset_get_snapname(ds));
-#ifdef ZFS_DEBUG
- {
- uint64_t val;
- err = zap_lookup(mos,
- ds_head->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, 8, 1, &val);
- ASSERT3U(err, ==, 0);
- ASSERT3U(val, ==, obj);
- }
-#endif
- err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, tx);
- ASSERT(err == 0);
- dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
- }
-
- if (ds_prev && ds->ds_prev != ds_prev)
- dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
-
- spa_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, tag);
- VERIFY(0 == dmu_object_free(mos, obj, tx));
-
-}
-
-/* ARGSUSED */
-int
-dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- const char *snapname = arg2;
- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- int err;
- uint64_t value;
-
- /*
- * We don't allow multiple snapshots of the same txg. If there
- * is already one, try again.
- */
- if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
- return (EAGAIN);
-
- /*
- * Check for conflicting name snapshot name.
- */
- err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
- snapname, 8, 1, &value);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
-
- /*
- * Check that the dataset's name is not too long. Name consists
- * of the dataset's length + 1 for the @-sign + snapshot name's length
- */
- if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
- return (ENAMETOOLONG);
-
- ds->ds_trysnap_txg = tx->tx_txg;
- return (0);
-}
-
-void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- objset_t *os = arg1;
- dsl_dataset_t *ds = os->os->os_dsl_dataset;
- const char *snapname = arg2;
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dmu_buf_t *dbuf;
- dsl_dataset_phys_t *dsphys;
- uint64_t dsobj;
- objset_t *mos = dp->dp_meta_objset;
- int err;
-
- spa_scrub_restart(dp->dp_spa, tx->tx_txg);
- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
- dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
- DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
- dsphys->ds_dir_obj = ds->ds_dir->dd_object;
- dsphys->ds_fsid_guid = unique_create();
- unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
- (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
- sizeof (dsphys->ds_guid));
- dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
- dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
- dsphys->ds_next_snap_obj = ds->ds_object;
- dsphys->ds_num_children = 1;
- dsphys->ds_creation_time = gethrestime_sec();
- dsphys->ds_creation_txg = tx->tx_txg;
- dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
- dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
- dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_flags = ds->ds_phys->ds_flags;
- dsphys->ds_bp = ds->ds_phys->ds_bp;
- dmu_buf_rele(dbuf, FTAG);
-
- ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
- if (ds->ds_prev) {
- ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
- ds->ds_object ||
- ds->ds_prev->ds_phys->ds_num_children > 1);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
- ds->ds_prev->ds_phys->ds_creation_txg);
- ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
- }
- }
-
- bplist_close(&ds->ds_deadlist);
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
- ds->ds_phys->ds_prev_snap_obj = dsobj;
- ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
- ds->ds_phys->ds_unique_bytes = 0;
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
- err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
- snapname, 8, 1, &dsobj, tx);
- ASSERT(err == 0);
-
- if (ds->ds_prev)
- dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, snapname,
- DS_MODE_NONE, ds, &ds->ds_prev));
-}
-
-void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(ds->ds_user_ptr != NULL);
- ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
-
- dsl_dir_dirty(ds->ds_dir, tx);
- dmu_objset_sync(ds->ds_user_ptr, zio, tx);
- /* Unneeded? bplist_close(&ds->ds_deadlist); */
-}
-
-void
-dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
-{
- dsl_dir_stats(ds->ds_dir, nv);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
- ds->ds_phys->ds_creation_time);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
- ds->ds_phys->ds_creation_txg);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
- ds->ds_phys->ds_used_bytes);
-
- if (ds->ds_phys->ds_next_snap_obj) {
- /*
- * This is a snapshot; override the dd's space used with
- * our unique space and compression ratio.
- */
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
- ds->ds_phys->ds_unique_bytes);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
- ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
- (ds->ds_phys->ds_uncompressed_bytes * 100 /
- ds->ds_phys->ds_compressed_bytes));
- }
-}
-
-void
-dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
-{
- stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
- stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
- if (ds->ds_phys->ds_next_snap_obj) {
- stat->dds_is_snapshot = B_TRUE;
- stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
- }
-
- /* clone origin is really a dsl_dir thing... */
- if (ds->ds_dir->dd_phys->dd_clone_parent_obj) {
- dsl_dataset_t *ods;
-
- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
- VERIFY(0 == dsl_dataset_open_obj(ds->ds_dir->dd_pool,
- ds->ds_dir->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_NONE, FTAG, &ods));
- dsl_dataset_name(ods, stat->dds_clone_of);
- dsl_dataset_close(ods, DS_MODE_NONE, FTAG);
- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
- }
-}
-
-uint64_t
-dsl_dataset_fsid_guid(dsl_dataset_t *ds)
-{
- return (ds->ds_phys->ds_fsid_guid);
-}
-
-void
-dsl_dataset_space(dsl_dataset_t *ds,
- uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp)
-{
- *refdbytesp = ds->ds_phys->ds_used_bytes;
- *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
- *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
- *availobjsp = DN_MAX_OBJECT - *usedobjsp;
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- char *newsnapname = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- dsl_dataset_t *hds;
- uint64_t val;
- int err;
-
- err = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds);
- if (err)
- return (err);
-
- /* new name better not be in use */
- err = zap_lookup(mos, hds->ds_phys->ds_snapnames_zapobj,
- newsnapname, 8, 1, &val);
- dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
-
- if (err == 0)
- err = EEXIST;
- else if (err == ENOENT)
- err = 0;
-
- /* dataset name + 1 for the "@" + the new snapshot name must fit */
- if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
- err = ENAMETOOLONG;
-
- return (err);
-}
-
-static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *ds = arg1;
- char *newsnapname = arg2;
- dsl_dir_t *dd = ds->ds_dir;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- dsl_dataset_t *hds;
- int err;
-
- ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
-
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &hds));
-
- VERIFY(0 == dsl_dataset_get_snapname(ds));
- err = zap_remove(mos, hds->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, tx);
- ASSERT3U(err, ==, 0);
- mutex_enter(&ds->ds_lock);
- (void) strcpy(ds->ds_snapname, newsnapname);
- mutex_exit(&ds->ds_lock);
- err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
- ds->ds_snapname, 8, 1, &ds->ds_object, tx);
- ASSERT3U(err, ==, 0);
-
- dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
-}
-
-struct renamearg {
- dsl_sync_task_group_t *dstg;
- char failed[MAXPATHLEN];
- char *oldsnap;
- char *newsnap;
-};
-
-static int
-dsl_snapshot_rename_one(char *name, void *arg)
-{
- struct renamearg *ra = arg;
- dsl_dataset_t *ds = NULL;
- char *cp;
- int err;
-
- cp = name + strlen(name);
- *cp = '@';
- (void) strcpy(cp + 1, ra->oldsnap);
- err = dsl_dataset_open(name, DS_MODE_READONLY | DS_MODE_STANDARD,
- ra->dstg, &ds);
- if (err == ENOENT) {
- *cp = '\0';
- return (0);
- }
- if (err) {
- (void) strcpy(ra->failed, name);
- *cp = '\0';
- dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
- return (err);
- }
-
-#ifdef _KERNEL
- /* for all filesystems undergoing rename, we'll need to unmount it */
- (void) zfs_unmount_snap(name, NULL);
-#endif
-
- *cp = '\0';
-
- dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
- dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
-
- return (0);
-}
-
-static int
-dsl_recursive_rename(char *oldname, const char *newname)
-{
- int err;
- struct renamearg *ra;
- dsl_sync_task_t *dst;
- spa_t *spa;
- char *cp, *fsname = spa_strdup(oldname);
- int len = strlen(oldname);
-
- /* truncate the snapshot name to get the fsname */
- cp = strchr(fsname, '@');
- *cp = '\0';
-
- cp = strchr(fsname, '/');
- if (cp) {
- *cp = '\0';
- err = spa_open(fsname, &spa, FTAG);
- *cp = '/';
- } else {
- err = spa_open(fsname, &spa, FTAG);
- }
- if (err) {
- kmem_free(fsname, len + 1);
- return (err);
- }
- ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
- ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-
- ra->oldsnap = strchr(oldname, '@') + 1;
- ra->newsnap = strchr(newname, '@') + 1;
- *ra->failed = '\0';
-
- err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
- DS_FIND_CHILDREN);
- kmem_free(fsname, len + 1);
-
- if (err == 0) {
- err = dsl_sync_task_group_wait(ra->dstg);
- }
-
- for (dst = list_head(&ra->dstg->dstg_tasks); dst;
- dst = list_next(&ra->dstg->dstg_tasks, dst)) {
- dsl_dataset_t *ds = dst->dst_arg1;
- if (dst->dst_err) {
- dsl_dir_name(ds->ds_dir, ra->failed);
- (void) strcat(ra->failed, "@");
- (void) strcat(ra->failed, ra->newsnap);
- }
- dsl_dataset_close(ds, DS_MODE_STANDARD, ra->dstg);
- }
-
- (void) strcpy(oldname, ra->failed);
-
- dsl_sync_task_group_destroy(ra->dstg);
- kmem_free(ra, sizeof (struct renamearg));
- spa_close(spa, FTAG);
- return (err);
-}
-
-#pragma weak dmu_objset_rename = dsl_dataset_rename
-int
-dsl_dataset_rename(char *oldname, const char *newname,
- boolean_t recursive)
-{
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- const char *tail;
- int err;
-
- err = dsl_dir_open(oldname, FTAG, &dd, &tail);
- if (err)
- return (err);
- if (tail == NULL) {
- err = dsl_dir_rename(dd, newname);
- dsl_dir_close(dd, FTAG);
- return (err);
- }
- if (tail[0] != '@') {
- /* the name ended in a nonexistant component */
- dsl_dir_close(dd, FTAG);
- return (ENOENT);
- }
-
- dsl_dir_close(dd, FTAG);
-
- /* new name must be snapshot in same filesystem */
- tail = strchr(newname, '@');
- if (tail == NULL)
- return (EINVAL);
- tail++;
- if (strncmp(oldname, newname, tail - newname) != 0)
- return (EXDEV);
-
- if (recursive) {
- err = dsl_recursive_rename(oldname, newname);
- } else {
- err = dsl_dataset_open(oldname,
- DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &ds);
- if (err)
- return (err);
-
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_snapshot_rename_check,
- dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
-
- dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
- }
-
- return (err);
-}
-
-struct promotearg {
- uint64_t used, comp, uncomp, unique;
- uint64_t newnext_obj, snapnames_obj;
-};
-
-static int
-dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *hds = arg1;
- struct promotearg *pa = arg2;
- dsl_dir_t *dd = hds->ds_dir;
- dsl_pool_t *dp = hds->ds_dir->dd_pool;
- dsl_dir_t *pdd = NULL;
- dsl_dataset_t *ds = NULL;
- dsl_dataset_t *pivot_ds = NULL;
- dsl_dataset_t *newnext_ds = NULL;
- int err;
- char *name = NULL;
- uint64_t itor = 0;
- blkptr_t bp;
-
- bzero(pa, sizeof (*pa));
-
- /* Check that it is a clone */
- if (dd->dd_phys->dd_clone_parent_obj == 0)
- return (EINVAL);
-
- /* Since this is so expensive, don't do the preliminary check */
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- if (err = dsl_dataset_open_obj(dp,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds))
- goto out;
- pdd = pivot_ds->ds_dir;
-
- {
- dsl_dataset_t *phds;
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- pdd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_NONE, FTAG, &phds))
- goto out;
- pa->snapnames_obj = phds->ds_phys->ds_snapnames_zapobj;
- dsl_dataset_close(phds, DS_MODE_NONE, FTAG);
- }
-
- if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
- err = EXDEV;
- goto out;
- }
-
- /* find pivot point's new next ds */
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, hds->ds_object,
- NULL, DS_MODE_NONE, FTAG, &newnext_ds));
- while (newnext_ds->ds_phys->ds_prev_snap_obj != pivot_ds->ds_object) {
- dsl_dataset_t *prev;
-
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- newnext_ds->ds_phys->ds_prev_snap_obj,
- NULL, DS_MODE_NONE, FTAG, &prev))
- goto out;
- dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
- newnext_ds = prev;
- }
- pa->newnext_obj = newnext_ds->ds_object;
-
- /* compute pivot point's new unique space */
- while ((err = bplist_iterate(&newnext_ds->ds_deadlist,
- &itor, &bp)) == 0) {
- if (bp.blk_birth > pivot_ds->ds_phys->ds_prev_snap_txg)
- pa->unique += bp_get_dasize(dd->dd_pool->dp_spa, &bp);
- }
- if (err != ENOENT)
- goto out;
-
- /* Walk the snapshots that we are moving */
- name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- ds = pivot_ds;
- /* CONSTCOND */
- while (TRUE) {
- uint64_t val, dlused, dlcomp, dluncomp;
- dsl_dataset_t *prev;
-
- /* Check that the snapshot name does not conflict */
- dsl_dataset_name(ds, name);
- err = zap_lookup(dd->dd_pool->dp_meta_objset,
- hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
- 8, 1, &val);
- if (err != ENOENT) {
- if (err == 0)
- err = EEXIST;
- goto out;
- }
-
- /*
- * compute space to transfer. Each snapshot gave birth to:
- * (my used) - (prev's used) + (deadlist's used)
- */
- pa->used += ds->ds_phys->ds_used_bytes;
- pa->comp += ds->ds_phys->ds_compressed_bytes;
- pa->uncomp += ds->ds_phys->ds_uncompressed_bytes;
-
- /* If we reach the first snapshot, we're done. */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- break;
-
- if (err = bplist_space(&ds->ds_deadlist,
- &dlused, &dlcomp, &dluncomp))
- goto out;
- if (err = dsl_dataset_open_obj(dd->dd_pool,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
- FTAG, &prev))
- goto out;
- pa->used += dlused - prev->ds_phys->ds_used_bytes;
- pa->comp += dlcomp - prev->ds_phys->ds_compressed_bytes;
- pa->uncomp += dluncomp - prev->ds_phys->ds_uncompressed_bytes;
-
- /*
- * We could be a clone of a clone. If we reach our
- * parent's branch point, we're done.
- */
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
- break;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- ds = prev;
- }
-
- /* Check that there is enough space here */
- err = dsl_dir_transfer_possible(pdd, dd, pa->used);
-
-out:
- if (ds && ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- if (pivot_ds)
- dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
- if (newnext_ds)
- dsl_dataset_close(newnext_ds, DS_MODE_NONE, FTAG);
- if (name)
- kmem_free(name, MAXPATHLEN);
- return (err);
-}
-
-static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dataset_t *hds = arg1;
- struct promotearg *pa = arg2;
- dsl_dir_t *dd = hds->ds_dir;
- dsl_pool_t *dp = hds->ds_dir->dd_pool;
- dsl_dir_t *pdd = NULL;
- dsl_dataset_t *ds, *pivot_ds;
- char *name;
-
- ASSERT(dd->dd_phys->dd_clone_parent_obj != 0);
- ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG, &pivot_ds));
- /*
- * We need to explicitly open pdd, since pivot_ds's pdd will be
- * changing.
- */
- VERIFY(0 == dsl_dir_open_obj(dp, pivot_ds->ds_dir->dd_object,
- NULL, FTAG, &pdd));
-
- /* move snapshots to this dir */
- name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- ds = pivot_ds;
- /* CONSTCOND */
- while (TRUE) {
- dsl_dataset_t *prev;
-
- /* move snap name entry */
- dsl_dataset_name(ds, name);
- VERIFY(0 == zap_remove(dp->dp_meta_objset,
- pa->snapnames_obj, ds->ds_snapname, tx));
- VERIFY(0 == zap_add(dp->dp_meta_objset,
- hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
- 8, 1, &ds->ds_object, tx));
-
- /* change containing dsl_dir */
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_dir_obj, ==, pdd->dd_object);
- ds->ds_phys->ds_dir_obj = dd->dd_object;
- ASSERT3P(ds->ds_dir, ==, pdd);
- dsl_dir_close(ds->ds_dir, ds);
- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
- NULL, ds, &ds->ds_dir));
-
- ASSERT3U(dsl_prop_numcb(ds), ==, 0);
-
- if (ds->ds_phys->ds_prev_snap_obj == 0)
- break;
-
- VERIFY(0 == dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_EXCLUSIVE,
- FTAG, &prev));
-
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
- dsl_dataset_close(prev, DS_MODE_EXCLUSIVE, FTAG);
- break;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
- ds = prev;
- }
- if (ds != pivot_ds)
- dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
-
- /* change pivot point's next snap */
- dmu_buf_will_dirty(pivot_ds->ds_dbuf, tx);
- pivot_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
-
- /* change clone_parent-age */
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- ASSERT3U(dd->dd_phys->dd_clone_parent_obj, ==, pivot_ds->ds_object);
- dd->dd_phys->dd_clone_parent_obj = pdd->dd_phys->dd_clone_parent_obj;
- dmu_buf_will_dirty(pdd->dd_dbuf, tx);
- pdd->dd_phys->dd_clone_parent_obj = pivot_ds->ds_object;
-
- /* change space accounting */
- dsl_dir_diduse_space(pdd, -pa->used, -pa->comp, -pa->uncomp, tx);
- dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx);
- pivot_ds->ds_phys->ds_unique_bytes = pa->unique;
-
- dsl_dir_close(pdd, FTAG);
- dsl_dataset_close(pivot_ds, DS_MODE_EXCLUSIVE, FTAG);
- kmem_free(name, MAXPATHLEN);
-}
-
-int
-dsl_dataset_promote(const char *name)
-{
- dsl_dataset_t *ds;
- int err;
- dmu_object_info_t doi;
- struct promotearg pa;
-
- err = dsl_dataset_open(name, DS_MODE_NONE, FTAG, &ds);
- if (err)
- return (err);
-
- err = dmu_object_info(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, &doi);
- if (err) {
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- return (err);
- }
-
- /*
- * Add in 128x the snapnames zapobj size, since we will be moving
- * a bunch of snapnames to the promoted ds, and dirtying their
- * bonus buffers.
- */
- err = dsl_sync_task_do(ds->ds_dir->dd_pool,
- dsl_dataset_promote_check,
- dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- return (err);
-}
-
-/*
- * Given a pool name and a dataset object number in that pool,
- * return the name of that dataset.
- */
-int
-dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
-{
- spa_t *spa;
- dsl_pool_t *dp;
- dsl_dataset_t *ds = NULL;
- int error;
-
- if ((error = spa_open(pname, &spa, FTAG)) != 0)
- return (error);
- dp = spa_get_dsl(spa);
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- if ((error = dsl_dataset_open_obj(dp, obj,
- NULL, DS_MODE_NONE, FTAG, &ds)) != 0) {
- rw_exit(&dp->dp_config_rwlock);
- spa_close(spa, FTAG);
- return (error);
- }
- dsl_dataset_name(ds, buf);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dp->dp_config_rwlock);
- spa_close(spa, FTAG);
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
deleted file mode 100644
index 5e563b6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ /dev/null
@@ -1,1215 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/arc.h>
-#include "zfs_namecheck.h"
-
-static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);
-
-
-/* ARGSUSED */
-static void
-dsl_dir_evict(dmu_buf_t *db, void *arg)
-{
- dsl_dir_t *dd = arg;
- dsl_pool_t *dp = dd->dd_pool;
- int t;
-
- for (t = 0; t < TXG_SIZE; t++) {
- ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
- ASSERT(dd->dd_tempreserved[t] == 0);
- ASSERT(dd->dd_space_towrite[t] == 0);
- }
-
- ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
-
- if (dd->dd_parent)
- dsl_dir_close(dd->dd_parent, dd);
-
- spa_close(dd->dd_pool->dp_spa, dd);
-
- /*
- * The props callback list should be empty since they hold the
- * dir open.
- */
- list_destroy(&dd->dd_prop_cbs);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
-}
-
-int
-dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **ddp)
-{
- dmu_buf_t *dbuf;
- dsl_dir_t *dd;
- int err;
-
- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
- dsl_pool_sync_context(dp));
-
- err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
- if (err)
- return (err);
- dd = dmu_buf_get_user(dbuf);
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbuf, &doi);
- ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
- }
-#endif
- /* XXX assert bonus buffer size is correct */
- if (dd == NULL) {
- dsl_dir_t *winner;
- int err;
-
- dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
- dd->dd_object = ddobj;
- dd->dd_dbuf = dbuf;
- dd->dd_pool = dp;
- dd->dd_phys = dbuf->db_data;
- dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
- mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
- offsetof(dsl_prop_cb_record_t, cbr_node));
-
- if (dd->dd_phys->dd_parent_obj) {
- err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
- NULL, dd, &dd->dd_parent);
- if (err) {
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- if (tail) {
-#ifdef ZFS_DEBUG
- uint64_t foundobj;
-
- err = zap_lookup(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->
- dd_child_dir_zapobj,
- tail, sizeof (foundobj), 1, &foundobj);
- ASSERT(err || foundobj == ddobj);
-#endif
- (void) strcpy(dd->dd_myname, tail);
- } else {
- err = zap_value_search(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->
- dd_child_dir_zapobj,
- ddobj, dd->dd_myname);
- }
- if (err) {
- dsl_dir_close(dd->dd_parent, dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dmu_buf_rele(dbuf, tag);
- return (err);
- }
- } else {
- (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
- }
-
- winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
- dsl_dir_evict);
- if (winner) {
- if (dd->dd_parent)
- dsl_dir_close(dd->dd_parent, dd);
- mutex_destroy(&dd->dd_lock);
- kmem_free(dd, sizeof (dsl_dir_t));
- dd = winner;
- } else {
- spa_open_ref(dp->dp_spa, dd);
- }
- }
-
- /*
- * The dsl_dir_t has both open-to-close and instantiate-to-evict
- * holds on the spa. We need the open-to-close holds because
- * otherwise the spa_refcnt wouldn't change when we open a
- * dir which the spa also has open, so we could incorrectly
- * think it was OK to unload/export/destroy the pool. We need
- * the instantiate-to-evict hold because the dsl_dir_t has a
- * pointer to the dd_pool, which has a pointer to the spa_t.
- */
- spa_open_ref(dp->dp_spa, tag);
- ASSERT3P(dd->dd_pool, ==, dp);
- ASSERT3U(dd->dd_object, ==, ddobj);
- ASSERT3P(dd->dd_dbuf, ==, dbuf);
- *ddp = dd;
- return (0);
-}
-
-void
-dsl_dir_close(dsl_dir_t *dd, void *tag)
-{
- dprintf_dd(dd, "%s\n", "");
- spa_close(dd->dd_pool->dp_spa, tag);
- dmu_buf_rele(dd->dd_dbuf, tag);
-}
-
-/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
-void
-dsl_dir_name(dsl_dir_t *dd, char *buf)
-{
- if (dd->dd_parent) {
- dsl_dir_name(dd->dd_parent, buf);
- (void) strcat(buf, "/");
- } else {
- buf[0] = '\0';
- }
- if (!MUTEX_HELD(&dd->dd_lock)) {
- /*
- * recursive mutex so that we can use
- * dprintf_dd() with dd_lock held
- */
- mutex_enter(&dd->dd_lock);
- (void) strcat(buf, dd->dd_myname);
- mutex_exit(&dd->dd_lock);
- } else {
- (void) strcat(buf, dd->dd_myname);
- }
-}
-
-/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */
-int
-dsl_dir_namelen(dsl_dir_t *dd)
-{
- int result = 0;
-
- if (dd->dd_parent) {
- /* parent's name + 1 for the "/" */
- result = dsl_dir_namelen(dd->dd_parent) + 1;
- }
-
- if (!MUTEX_HELD(&dd->dd_lock)) {
- /* see dsl_dir_name */
- mutex_enter(&dd->dd_lock);
- result += strlen(dd->dd_myname);
- mutex_exit(&dd->dd_lock);
- } else {
- result += strlen(dd->dd_myname);
- }
-
- return (result);
-}
-
-int
-dsl_dir_is_private(dsl_dir_t *dd)
-{
- int rv = FALSE;
-
- if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
- rv = TRUE;
- if (dataset_name_hidden(dd->dd_myname))
- rv = TRUE;
- return (rv);
-}
-
-
-static int
-getcomponent(const char *path, char *component, const char **nextp)
-{
- char *p;
- if (path == NULL)
- return (ENOENT);
- /* This would be a good place to reserve some namespace... */
- p = strpbrk(path, "/@");
- if (p && (p[1] == '/' || p[1] == '@')) {
- /* two separators in a row */
- return (EINVAL);
- }
- if (p == NULL || p == path) {
- /*
- * if the first thing is an @ or /, it had better be an
- * @ and it had better not have any more ats or slashes,
- * and it had better have something after the @.
- */
- if (p != NULL &&
- (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
- return (EINVAL);
- if (strlen(path) >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strcpy(component, path);
- p = NULL;
- } else if (p[0] == '/') {
- if (p-path >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strncpy(component, path, p - path);
- component[p-path] = '\0';
- p++;
- } else if (p[0] == '@') {
- /*
- * if the next separator is an @, there better not be
- * any more slashes.
- */
- if (strchr(path, '/'))
- return (EINVAL);
- if (p-path >= MAXNAMELEN)
- return (ENAMETOOLONG);
- (void) strncpy(component, path, p - path);
- component[p-path] = '\0';
- } else {
- ASSERT(!"invalid p");
- }
- *nextp = p;
- return (0);
-}
-
-/*
- * same as dsl_open_dir, ignore the first component of name and use the
- * spa instead
- */
-int
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
- dsl_dir_t **ddp, const char **tailp)
-{
- char buf[MAXNAMELEN];
- const char *next, *nextnext = NULL;
- int err;
- dsl_dir_t *dd;
- dsl_pool_t *dp;
- uint64_t ddobj;
- int openedspa = FALSE;
-
- dprintf("%s\n", name);
-
- err = getcomponent(name, buf, &next);
- if (err)
- return (err);
- if (spa == NULL) {
- err = spa_open(buf, &spa, FTAG);
- if (err) {
- dprintf("spa_open(%s) failed\n", buf);
- return (err);
- }
- openedspa = TRUE;
-
- /* XXX this assertion belongs in spa_open */
- ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
- }
-
- dp = spa_get_dsl(spa);
-
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
- if (err) {
- rw_exit(&dp->dp_config_rwlock);
- if (openedspa)
- spa_close(spa, FTAG);
- return (err);
- }
-
- while (next != NULL) {
- dsl_dir_t *child_ds;
- err = getcomponent(next, buf, &nextnext);
- if (err)
- break;
- ASSERT(next[0] != '\0');
- if (next[0] == '@')
- break;
- dprintf("looking up %s in obj%lld\n",
- buf, dd->dd_phys->dd_child_dir_zapobj);
-
- err = zap_lookup(dp->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj,
- buf, sizeof (ddobj), 1, &ddobj);
- if (err) {
- if (err == ENOENT)
- err = 0;
- break;
- }
-
- err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
- if (err)
- break;
- dsl_dir_close(dd, tag);
- dd = child_ds;
- next = nextnext;
- }
- rw_exit(&dp->dp_config_rwlock);
-
- if (err) {
- dsl_dir_close(dd, tag);
- if (openedspa)
- spa_close(spa, FTAG);
- return (err);
- }
-
- /*
- * It's an error if there's more than one component left, or
- * tailp==NULL and there's any component left.
- */
- if (next != NULL &&
- (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
- /* bad path name */
- dsl_dir_close(dd, tag);
- dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
- err = ENOENT;
- }
- if (tailp)
- *tailp = next;
- if (openedspa)
- spa_close(spa, FTAG);
- *ddp = dd;
- return (err);
-}
-
-/*
- * Return the dsl_dir_t, and possibly the last component which couldn't
- * be found in *tail. Return NULL if the path is bogus, or if
- * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
- * means that the last component is a snapshot.
- */
-int
-dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
-{
- return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
-}
-
-uint64_t
-dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
-{
- objset_t *mos = pds->dd_pool->dp_meta_objset;
- uint64_t ddobj;
- dsl_dir_phys_t *dsphys;
- dmu_buf_t *dbuf;
-
- ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
- DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
- VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
- name, sizeof (uint64_t), 1, &ddobj, tx));
- VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
-
- dsphys->dd_creation_time = gethrestime_sec();
- dsphys->dd_parent_obj = pds->dd_object;
- dsphys->dd_props_zapobj = zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsphys->dd_child_dir_zapobj = zap_create(mos,
- DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- dmu_buf_rele(dbuf, FTAG);
-
- return (ddobj);
-}
-
-/* ARGSUSED */
-int
-dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err;
- uint64_t count;
-
- /*
- * There should be exactly two holds, both from
- * dsl_dataset_destroy: one on the dd directory, and one on its
- * head ds. Otherwise, someone is trying to lookup something
- * inside this dir while we want to destroy it. The
- * config_rwlock ensures that nobody else opens it after we
- * check.
- */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
- return (EBUSY);
-
- err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
- if (err)
- return (err);
- if (count != 0)
- return (EEXIST);
-
- return (0);
-}
-
-void
-dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t val, obj;
-
- ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
- ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
-
- /* Remove our reservation. */
- val = 0;
- dsl_dir_set_reservation_sync(dd, &val, tx);
- ASSERT3U(dd->dd_used_bytes, ==, 0);
- ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
-
- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
- VERIFY(0 == zap_remove(mos,
- dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
-
- obj = dd->dd_object;
- dsl_dir_close(dd, tag);
- VERIFY(0 == dmu_object_free(mos, obj, tx));
-}
-
-void
-dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
-{
- dsl_dir_phys_t *dsp;
- dmu_buf_t *dbuf;
- int error;
-
- *ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
- DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
-
- error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
- sizeof (uint64_t), 1, ddobjp, tx);
- ASSERT3U(error, ==, 0);
-
- VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
- dmu_buf_will_dirty(dbuf, tx);
- dsp = dbuf->db_data;
-
- dsp->dd_creation_time = gethrestime_sec();
- dsp->dd_props_zapobj = zap_create(mos,
- DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsp->dd_child_dir_zapobj = zap_create(mos,
- DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
-
- dmu_buf_rele(dbuf, FTAG);
-}
-
-void
-dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
-{
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
- dsl_dir_space_available(dd, NULL, 0, TRUE));
-
- mutex_enter(&dd->dd_lock);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
- dd->dd_phys->dd_quota);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
- dd->dd_phys->dd_reserved);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
- dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
- (dd->dd_phys->dd_uncompressed_bytes * 100 /
- dd->dd_phys->dd_compressed_bytes));
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_phys->dd_clone_parent_obj) {
- dsl_dataset_t *ds;
- char buf[MAXNAMELEN];
-
- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_clone_parent_obj,
- NULL, DS_MODE_NONE, FTAG, &ds));
- dsl_dataset_name(ds, buf);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dd->dd_pool->dp_config_rwlock);
-
- dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
- }
-}
-
-void
-dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- dsl_pool_t *dp = dd->dd_pool;
-
- ASSERT(dd->dd_phys);
-
- if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
- /* up the hold count until we can be written out */
- dmu_buf_add_ref(dd->dd_dbuf, dd);
- }
-}
-
-static int64_t
-parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
-{
- uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
- uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
- return (new_accounted - old_accounted);
-}
-
-void
-dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- mutex_enter(&dd->dd_lock);
- ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
- dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
- dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
- dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
- dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
- mutex_exit(&dd->dd_lock);
-
- /* release the hold from dsl_dir_dirty */
- dmu_buf_rele(dd->dd_dbuf, dd);
-}
-
-static uint64_t
-dsl_dir_estimated_space(dsl_dir_t *dd)
-{
- int64_t space;
- int i;
-
- ASSERT(MUTEX_HELD(&dd->dd_lock));
-
- space = dd->dd_phys->dd_used_bytes;
- ASSERT(space >= 0);
- for (i = 0; i < TXG_SIZE; i++) {
- space += dd->dd_space_towrite[i&TXG_MASK];
- ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
- }
- return (space);
-}
-
-/*
- * How much space would dd have available if ancestor had delta applied
- * to it? If ondiskonly is set, we're only interested in what's
- * on-disk, not estimated pending changes.
- */
-uint64_t
-dsl_dir_space_available(dsl_dir_t *dd,
- dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
-{
- uint64_t parentspace, myspace, quota, used;
-
- /*
- * If there are no restrictions otherwise, assume we have
- * unlimited space available.
- */
- quota = UINT64_MAX;
- parentspace = UINT64_MAX;
-
- if (dd->dd_parent != NULL) {
- parentspace = dsl_dir_space_available(dd->dd_parent,
- ancestor, delta, ondiskonly);
- }
-
- mutex_enter(&dd->dd_lock);
- if (dd->dd_phys->dd_quota != 0)
- quota = dd->dd_phys->dd_quota;
- if (ondiskonly) {
- used = dd->dd_used_bytes;
- } else {
- used = dsl_dir_estimated_space(dd);
- }
- if (dd == ancestor)
- used += delta;
-
- if (dd->dd_parent == NULL) {
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
- quota = MIN(quota, poolsize);
- }
-
- if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
- /*
- * We have some space reserved, in addition to what our
- * parent gave us.
- */
- parentspace += dd->dd_phys->dd_reserved - used;
- }
-
- if (used > quota) {
- /* over quota */
- myspace = 0;
-
- /*
- * While it's OK to be a little over quota, if
- * we think we are using more space than there
- * is in the pool (which is already 1.6% more than
- * dsl_pool_adjustedsize()), something is very
- * wrong.
- */
- ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
- } else {
- /*
- * the lesser of the space provided by our parent and
- * the space left in our quota
- */
- myspace = MIN(parentspace, quota - used);
- }
-
- mutex_exit(&dd->dd_lock);
-
- return (myspace);
-}
-
-struct tempreserve {
- list_node_t tr_node;
- dsl_dir_t *tr_ds;
- uint64_t tr_size;
-};
-
-/*
- * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
- */
-static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd,
- uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
-{
- uint64_t txg = tx->tx_txg;
- uint64_t est_used, quota, parent_rsrv;
- int edquot = EDQUOT;
- int txgidx = txg & TXG_MASK;
- int i;
- struct tempreserve *tr;
-
- ASSERT3U(txg, !=, 0);
- ASSERT3S(asize, >=, 0);
-
- mutex_enter(&dd->dd_lock);
- /*
- * Check against the dsl_dir's quota. We don't add in the delta
- * when checking for over-quota because they get one free hit.
- */
- est_used = dsl_dir_estimated_space(dd);
- for (i = 0; i < TXG_SIZE; i++)
- est_used += dd->dd_tempreserved[i];
-
- quota = UINT64_MAX;
-
- if (dd->dd_phys->dd_quota)
- quota = dd->dd_phys->dd_quota;
-
- /*
- * If this transaction will result in a net free of space, we want
- * to let it through, but we have to be careful: the space that it
- * frees won't become available until *after* this txg syncs.
- * Therefore, to ensure that it's possible to remove files from
- * a full pool without inducing transient overcommits, we throttle
- * netfree transactions against a quota that is slightly larger,
- * but still within the pool's allocation slop. In cases where
- * we're very close to full, this will allow a steady trickle of
- * removes to get through.
- */
- if (dd->dd_parent == NULL) {
- uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
- if (poolsize < quota) {
- quota = poolsize;
- edquot = ENOSPC;
- }
- } else if (netfree) {
- quota = UINT64_MAX;
- }
-
- /*
- * If they are requesting more space, and our current estimate
- * is over quota. They get to try again unless the actual
- * on-disk is over quota and there are no pending changes (which
- * may free up space for us).
- */
- if (asize > 0 && est_used > quota) {
- if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
- dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
- dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
- dd->dd_used_bytes < quota)
- edquot = ERESTART;
- dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
- "quota=%lluK tr=%lluK err=%d\n",
- dd->dd_used_bytes>>10, est_used>>10,
- quota>>10, asize>>10, edquot);
- mutex_exit(&dd->dd_lock);
- return (edquot);
- }
-
- /* We need to up our estimated delta before dropping dd_lock */
- dd->dd_tempreserved[txgidx] += asize;
-
- parent_rsrv = parent_delta(dd, est_used, asize);
- mutex_exit(&dd->dd_lock);
-
- tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_ds = dd;
- tr->tr_size = asize;
- list_insert_tail(tr_list, tr);
-
- /* see if it's OK with our parent */
- if (dd->dd_parent && parent_rsrv) {
- return (dsl_dir_tempreserve_impl(dd->dd_parent,
- parent_rsrv, netfree, tr_list, tx));
- } else {
- return (0);
- }
-}
-
-/*
- * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
- */
-int
-dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
- uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
-{
- int err = 0;
- list_t *tr_list;
-
- tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
- list_create(tr_list, sizeof (struct tempreserve),
- offsetof(struct tempreserve, tr_node));
- ASSERT3S(asize, >=, 0);
- ASSERT3S(fsize, >=, 0);
-
- err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
- tr_list, tx);
-
- if (err == 0) {
- struct tempreserve *tr;
-
- err = arc_tempreserve_space(lsize);
- if (err == 0) {
- tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_ds = NULL;
- tr->tr_size = lsize;
- list_insert_tail(tr_list, tr);
- }
- }
-
- if (err)
- dsl_dir_tempreserve_clear(tr_list, tx);
- else
- *tr_cookiep = tr_list;
- return (err);
-}
-
-/*
- * Clear a temporary reservation that we previously made with
- * dsl_dir_tempreserve_space().
- */
-void
-dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
-{
- int txgidx = tx->tx_txg & TXG_MASK;
- list_t *tr_list = tr_cookie;
- struct tempreserve *tr;
-
- ASSERT3U(tx->tx_txg, !=, 0);
-
- while (tr = list_head(tr_list)) {
- if (tr->tr_ds == NULL) {
- arc_tempreserve_clear(tr->tr_size);
- } else {
- mutex_enter(&tr->tr_ds->dd_lock);
- ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
- tr->tr_size);
- tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
- mutex_exit(&tr->tr_ds->dd_lock);
- }
- list_remove(tr_list, tr);
- kmem_free(tr, sizeof (struct tempreserve));
- }
-
- kmem_free(tr_list, sizeof (list_t));
-}
-
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data. Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
- int64_t parent_space;
- uint64_t est_used;
-
- mutex_enter(&dd->dd_lock);
- if (space > 0)
- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
-
- est_used = dsl_dir_estimated_space(dd);
- parent_space = parent_delta(dd, est_used, space);
- mutex_exit(&dd->dd_lock);
-
- /* Make sure that we clean up dd_space_to* */
- dsl_dir_dirty(dd, tx);
-
- /* XXX this is potentially expensive and unnecessary... */
- if (parent_space && dd->dd_parent)
- dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
-}
-
-/* call from syncing context when we actually write/free space for this dd */
-void
-dsl_dir_diduse_space(dsl_dir_t *dd,
- int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
-{
- int64_t accounted_delta;
-
- ASSERT(dmu_tx_is_syncing(tx));
-
- dsl_dir_dirty(dd, tx);
-
- mutex_enter(&dd->dd_lock);
- accounted_delta = parent_delta(dd, dd->dd_used_bytes, used);
- ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
- ASSERT(compressed >= 0 ||
- dd->dd_phys->dd_compressed_bytes >= -compressed);
- ASSERT(uncompressed >= 0 ||
- dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
- dd->dd_used_bytes += used;
- dd->dd_phys->dd_uncompressed_bytes += uncompressed;
- dd->dd_phys->dd_compressed_bytes += compressed;
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_parent != NULL) {
- dsl_dir_diduse_space(dd->dd_parent,
- accounted_delta, compressed, uncompressed, tx);
- }
-}
-
-/* ARGSUSED */
-static int
-dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
- int err = 0;
- uint64_t towrite;
-
- if (new_quota == 0)
- return (0);
-
- mutex_enter(&dd->dd_lock);
- /*
- * If we are doing the preliminary check in open context, and
- * there are pending changes, then don't fail it, since the
- * pending changes could under-estimat the amount of space to be
- * freed up.
- */
- towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
- dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
- if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
- (new_quota < dd->dd_phys->dd_reserved ||
- new_quota < dsl_dir_estimated_space(dd))) {
- err = ENOSPC;
- }
- mutex_exit(&dd->dd_lock);
- return (err);
-}
-
-static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *quotap = arg2;
- uint64_t new_quota = *quotap;
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- mutex_enter(&dd->dd_lock);
- dd->dd_phys->dd_quota = new_quota;
- mutex_exit(&dd->dd_lock);
-}
-
-int
-dsl_dir_set_quota(const char *ddname, uint64_t quota)
-{
- dsl_dir_t *dd;
- int err;
-
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
- if (err)
- return (err);
- /*
- * If someone removes a file, then tries to set the quota, we
- * want to make sure the file freeing takes effect.
- */
- txg_wait_open(dd->dd_pool, 0);
-
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, dd, &quota, 0);
- dsl_dir_close(dd, FTAG);
- return (err);
-}
-
-/* ARGSUSED */
-static int
-dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
- uint64_t used, avail;
- int64_t delta;
-
- if (new_reservation > INT64_MAX)
- return (EOVERFLOW);
-
- /*
- * If we are doing the preliminary check in open context, the
- * space estimates may be inaccurate.
- */
- if (!dmu_tx_is_syncing(tx))
- return (0);
-
- mutex_enter(&dd->dd_lock);
- used = dd->dd_used_bytes;
- delta = MAX(used, new_reservation) -
- MAX(used, dd->dd_phys->dd_reserved);
- mutex_exit(&dd->dd_lock);
-
- if (dd->dd_parent) {
- avail = dsl_dir_space_available(dd->dd_parent,
- NULL, 0, FALSE);
- } else {
- avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
- }
-
- if (delta > 0 && delta > avail)
- return (ENOSPC);
- if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
- new_reservation > dd->dd_phys->dd_quota)
- return (ENOSPC);
- return (0);
-}
-
-static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- uint64_t *reservationp = arg2;
- uint64_t new_reservation = *reservationp;
- uint64_t used;
- int64_t delta;
-
- mutex_enter(&dd->dd_lock);
- used = dd->dd_used_bytes;
- delta = MAX(used, new_reservation) -
- MAX(used, dd->dd_phys->dd_reserved);
- mutex_exit(&dd->dd_lock);
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_reserved = new_reservation;
-
- if (dd->dd_parent != NULL) {
- /* Roll up this additional usage into our ancestors */
- dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
- }
-}
-
-int
-dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
-{
- dsl_dir_t *dd;
- int err;
-
- err = dsl_dir_open(ddname, FTAG, &dd, NULL);
- if (err)
- return (err);
- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, dd, &reservation, 0);
- dsl_dir_close(dd, FTAG);
- return (err);
-}
-
-static dsl_dir_t *
-closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
-{
- for (; ds1; ds1 = ds1->dd_parent) {
- dsl_dir_t *dd;
- for (dd = ds2; dd; dd = dd->dd_parent) {
- if (ds1 == dd)
- return (dd);
- }
- }
- return (NULL);
-}
-
-/*
- * If delta is applied to dd, how much of that delta would be applied to
- * ancestor? Syncing context only.
- */
-static int64_t
-would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
-{
- if (dd == ancestor)
- return (delta);
-
- mutex_enter(&dd->dd_lock);
- delta = parent_delta(dd, dd->dd_used_bytes, delta);
- mutex_exit(&dd->dd_lock);
- return (would_change(dd->dd_parent, delta, ancestor));
-}
-
-struct renamearg {
- dsl_dir_t *newparent;
- const char *mynewname;
-};
-
-/* ARGSUSED */
-static int
-dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct renamearg *ra = arg2;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err;
- uint64_t val;
-
- /* There should be 2 references: the open and the dirty */
- if (dmu_buf_refcount(dd->dd_dbuf) > 2)
- return (EBUSY);
-
- /* check for existing name */
- err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
- ra->mynewname, 8, 1, &val);
- if (err == 0)
- return (EEXIST);
- if (err != ENOENT)
- return (err);
-
- if (ra->newparent != dd->dd_parent) {
- /* is there enough space? */
- uint64_t myspace =
- MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
-
- /* no rename into our descendant */
- if (closest_common_ancestor(dd, ra->newparent) == dd)
- return (EINVAL);
-
- if (err = dsl_dir_transfer_possible(dd->dd_parent,
- ra->newparent, myspace))
- return (err);
- }
-
- return (0);
-}
-
-static void
-dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct renamearg *ra = arg2;
- dsl_pool_t *dp = dd->dd_pool;
- objset_t *mos = dp->dp_meta_objset;
- int err;
-
- ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
-
- if (ra->newparent != dd->dd_parent) {
- uint64_t myspace =
- MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
-
- dsl_dir_diduse_space(dd->dd_parent, -myspace,
- -dd->dd_phys->dd_compressed_bytes,
- -dd->dd_phys->dd_uncompressed_bytes, tx);
- dsl_dir_diduse_space(ra->newparent, myspace,
- dd->dd_phys->dd_compressed_bytes,
- dd->dd_phys->dd_uncompressed_bytes, tx);
- }
-
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
- /* remove from old parent zapobj */
- err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
- dd->dd_myname, tx);
- ASSERT3U(err, ==, 0);
-
- (void) strcpy(dd->dd_myname, ra->mynewname);
- dsl_dir_close(dd->dd_parent, dd);
- dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
- VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
- ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
-
- /* add to new parent zapobj */
- err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
- dd->dd_myname, 8, 1, &dd->dd_object, tx);
- ASSERT3U(err, ==, 0);
-}
-
-int
-dsl_dir_rename(dsl_dir_t *dd, const char *newname)
-{
- struct renamearg ra;
- int err;
-
- /* new parent should exist */
- err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
- if (err)
- return (err);
-
- /* can't rename to different pool */
- if (dd->dd_pool != ra.newparent->dd_pool) {
- err = ENXIO;
- goto out;
- }
-
- /* new name should not already exist */
- if (ra.mynewname == NULL) {
- err = EEXIST;
- goto out;
- }
-
-
- err = dsl_sync_task_do(dd->dd_pool,
- dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
-
-out:
- dsl_dir_close(ra.newparent, FTAG);
- return (err);
-}
-
-int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
-{
- dsl_dir_t *ancestor;
- int64_t adelta;
- uint64_t avail;
-
- ancestor = closest_common_ancestor(sdd, tdd);
- adelta = would_change(sdd, -space, ancestor);
- avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
- if (avail < space)
- return (ENOSPC);
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
deleted file mode 100644
index 00abf7e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-
-static int
-dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
-{
- uint64_t obj;
- int err;
-
- err = zap_lookup(dp->dp_meta_objset,
- dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
- MOS_DIR_NAME, sizeof (obj), 1, &obj);
- if (err)
- return (err);
-
- return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
-}
-
-static dsl_pool_t *
-dsl_pool_open_impl(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp;
- blkptr_t *bp = spa_get_rootblkptr(spa);
-
- dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
- dp->dp_spa = spa;
- dp->dp_meta_rootbp = *bp;
- rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
- txg_init(dp, txg);
-
- txg_list_create(&dp->dp_dirty_datasets,
- offsetof(dsl_dataset_t, ds_dirty_link));
- txg_list_create(&dp->dp_dirty_dirs,
- offsetof(dsl_dir_t, dd_dirty_link));
- txg_list_create(&dp->dp_sync_tasks,
- offsetof(dsl_sync_task_group_t, dstg_node));
- list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
- offsetof(dsl_dataset_t, ds_synced_link));
-
- return (dp);
-}
-
-int
-dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
-{
- int err;
- dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
- objset_impl_t *osi;
-
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
- if (err)
- goto out;
- dp->dp_meta_objset = &osi->os;
-
- err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
- &dp->dp_root_dir_obj);
- if (err)
- goto out;
-
- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp, &dp->dp_root_dir);
- if (err)
- goto out;
-
- err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
- if (err)
- goto out;
-
-out:
- rw_exit(&dp->dp_config_rwlock);
- if (err)
- dsl_pool_close(dp);
- else
- *dpp = dp;
-
- return (err);
-}
-
-void
-dsl_pool_close(dsl_pool_t *dp)
-{
- /* drop our reference from dsl_pool_open() */
- if (dp->dp_mos_dir)
- dsl_dir_close(dp->dp_mos_dir, dp);
- if (dp->dp_root_dir)
- dsl_dir_close(dp->dp_root_dir, dp);
-
- /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
- if (dp->dp_meta_objset)
- dmu_objset_evict(NULL, dp->dp_meta_objset->os);
-
- txg_list_destroy(&dp->dp_dirty_datasets);
- txg_list_destroy(&dp->dp_dirty_dirs);
- txg_list_destroy(&dp->dp_sync_tasks);
- list_destroy(&dp->dp_synced_objsets);
-
- arc_flush();
- txg_fini(dp);
- rw_destroy(&dp->dp_config_rwlock);
- kmem_free(dp, sizeof (dsl_pool_t));
-}
-
-dsl_pool_t *
-dsl_pool_create(spa_t *spa, uint64_t txg)
-{
- int err;
- dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
- dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
- dp->dp_meta_objset = &dmu_objset_create_impl(spa,
- NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
-
- /* create the pool directory */
- err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
- ASSERT3U(err, ==, 0);
-
- /* create and open the root dir */
- dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
- VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp, &dp->dp_root_dir));
-
- /* create and open the meta-objset dir */
- (void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
- VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
-
- dmu_tx_commit(tx);
-
- return (dp);
-}
-
-void
-dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
-{
- zio_t *zio;
- dmu_tx_t *tx;
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- dsl_sync_task_group_t *dstg;
- objset_impl_t *mosi = dp->dp_meta_objset->os;
- int err;
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
- if (!list_link_active(&ds->ds_synced_link))
- list_insert_tail(&dp->dp_synced_objsets, ds);
- else
- dmu_buf_rele(ds->ds_dbuf, ds);
- dsl_dataset_sync(ds, zio, tx);
- }
- err = zio_wait(zio);
- ASSERT(err == 0);
-
- while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
- dsl_sync_task_group_sync(dstg, tx);
- while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
- dsl_dir_sync(dd, tx);
-
- if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
- list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dmu_objset_sync(mosi, zio, tx);
- err = zio_wait(zio);
- ASSERT(err == 0);
- dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
- }
-
- dmu_tx_commit(tx);
-}
-
-void
-dsl_pool_zil_clean(dsl_pool_t *dp)
-{
- dsl_dataset_t *ds;
-
- while (ds = list_head(&dp->dp_synced_objsets)) {
- list_remove(&dp->dp_synced_objsets, ds);
- ASSERT(ds->ds_user_ptr != NULL);
- zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
- dmu_buf_rele(ds->ds_dbuf, ds);
- }
-}
-
-/*
- * TRUE if the current thread is the tx_sync_thread or if we
- * are being called from SPA context during pool initialization.
- */
-int
-dsl_pool_sync_context(dsl_pool_t *dp)
-{
- return (curthread == dp->dp_tx.tx_sync_thread ||
- spa_get_dsl(dp->dp_spa) == NULL);
-}
-
-uint64_t
-dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
-{
- uint64_t space, resv;
-
- /*
- * Reserve about 1.6% (1/64), or at least 32MB, for allocation
- * efficiency.
- * XXX The intent log is not accounted for, so it must fit
- * within this slop.
- *
- * If we're trying to assess whether it's OK to do a free,
- * cut the reservation in half to allow forward progress
- * (e.g. make it possible to rm(1) files from a full pool).
- */
- space = spa_get_dspace(dp->dp_spa);
- resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
- if (netfree)
- resv >>= 1;
-
- return (space - resv);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
deleted file mode 100644
index 2fff66d..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_objset.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-
-#include "zfs_prop.h"
-
-/*
- * Store the built-in default value of 'propname' into 'buf'.
- *
- * Returns ENOENT if the name does not map to a property or the property
- * is read-only; EOVERFLOW if intsz/numint do not suit the property type
- * (string properties require intsz == 1, all others require intsz == 8
- * and numint >= 1); 0 otherwise.
- */
-static int
-dodefault(const char *propname, int intsz, int numint, void *buf)
-{
-	zfs_prop_t prop = zfs_name_to_prop(propname);
-
-	if (prop == ZFS_PROP_INVAL)
-		return (ENOENT);
-	if (zfs_prop_readonly(prop))
-		return (ENOENT);
-
-	if (zfs_prop_get_type(prop) != prop_type_string) {
-		if (intsz != 8 || numint < 1)
-			return (EOVERFLOW);
-		*(uint64_t *)buf = zfs_prop_default_numeric(prop);
-		return (0);
-	}
-
-	if (intsz != 1)
-		return (EOVERFLOW);
-
-	/*
-	 * NOTE(review): strncpy() leaves buf without a terminating NUL
-	 * when the default string is numint bytes or longer -- confirm
-	 * that callers size the buffer accordingly.
-	 */
-	(void) strncpy(buf, zfs_prop_default_string(prop), numint);
-	return (0);
-}
-
-/*
- * Look up 'propname' in dd's property ZAP, then in each ancestor's in
- * turn, stopping at the first directory whose lookup returns anything
- * other than ENOENT.  If 'setpoint' is non-NULL it is first set to the
- * empty string and, on such a hit, filled in via dsl_dir_name().  When
- * nothing is found the result of dodefault() is returned instead.
- */
-static int
-dsl_prop_get_impl(dsl_dir_t *dd, const char *propname,
-    int intsz, int numint, void *buf, char *setpoint)
-{
-	zfs_prop_t prop;
-	int err = ENOENT;
-
-	if (setpoint != NULL)
-		setpoint[0] = '\0';
-
-	prop = zfs_name_to_prop(propname);
-
-	/*
-	 * Note: dd may be NULL, therefore we shouldn't dereference it
-	 * outside this loop.
-	 */
-	while (dd != NULL) {
-		objset_t *mos = dd->dd_pool->dp_meta_objset;
-
-		ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
-		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
-		    propname, intsz, numint, buf);
-		if (err != ENOENT) {
-			if (setpoint != NULL)
-				dsl_dir_name(dd, setpoint);
-			break;
-		}
-
-		/* A known, non-inheritable property is never looked up in a parent. */
-		if (prop != ZFS_PROP_INVAL && !zfs_prop_inheritable(prop))
-			break;
-
-		dd = dd->dd_parent;
-	}
-
-	if (err != ENOENT)
-		return (err);
-
-	return (dodefault(propname, intsz, numint, buf));
-}
-
-/*
- * Register interest in the named property.  We'll call the callback
- * once to notify it of the current property value, and again each time
- * the property changes, until this callback is unregistered.
- *
- * The pool's config rwlock is taken as reader here unless the caller
- * already holds it as writer; in that case it is left untouched on
- * every return path.
- *
- * Return 0 on success, errno if the prop is not an integer value.
- */
-int
-dsl_prop_register(dsl_dataset_t *ds, const char *propname,
-    dsl_prop_changed_cb_t *callback, void *cbarg)
-{
-	dsl_dir_t *dd = ds->ds_dir;
-	uint64_t value;
-	dsl_prop_cb_record_t *cbr;
-	int err;
-	int need_rwlock;
-
-	need_rwlock = !RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock);
-	if (need_rwlock)
-		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-
-	err = dsl_prop_get_impl(dd, propname, 8, 1, &value, NULL);
-	if (err != 0) {
-		/*
-		 * Only drop the lock if we took it above; otherwise we
-		 * would release the caller's writer hold.
-		 */
-		if (need_rwlock)
-			rw_exit(&dd->dd_pool->dp_config_rwlock);
-		return (err);
-	}
-
-	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
-	cbr->cbr_ds = ds;
-	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
-	(void) strcpy((char *)cbr->cbr_propname, propname);
-	cbr->cbr_func = callback;
-	cbr->cbr_arg = cbarg;
-	mutex_enter(&dd->dd_lock);
-	list_insert_head(&dd->dd_prop_cbs, cbr);
-	mutex_exit(&dd->dd_lock);
-
-	/* Initial notification with the value read above. */
-	cbr->cbr_func(cbr->cbr_arg, value);
-
-	/*
-	 * Open the directory again with the record as tag; the matching
-	 * dsl_dir_close() happens in dsl_prop_unregister().
-	 */
-	VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
-	    NULL, cbr, &dd));
-	if (need_rwlock)
-		rw_exit(&dd->dd_pool->dp_config_rwlock);
-	/* Leave dataset open until this callback is unregistered */
-	return (0);
-}
-
-/*
- * Locked wrapper around dsl_prop_get_impl(): holds the pool's config
- * rwlock as reader for the duration of the lookup and hands back the
- * lookup's return value.
- */
-int
-dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
-    int intsz, int numints, void *buf, char *setpoint)
-{
-	dsl_pool_t *dp = dd->dd_pool;
-	int rv;
-
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	rv = dsl_prop_get_impl(dd, propname, intsz, numints, buf, setpoint);
-	rw_exit(&dp->dp_config_rwlock);
-
-	return (rv);
-}
-
-/*
- * Name-based property lookup.  Opens the directory named by 'ddname',
- * delegates to dsl_prop_get_ds(), and closes the directory again.
- * Fails with the error from dsl_dir_open(), or with ENOENT when the
- * open leaves a trailing name component that does not start with '@'.
- */
-int
-dsl_prop_get(const char *ddname, const char *propname,
-    int intsz, int numints, void *buf, char *setpoint)
-{
-	const char *tail;
-	dsl_dir_t *dd;
-	int err;
-
-	if ((err = dsl_dir_open(ddname, FTAG, &dd, &tail)) != 0)
-		return (err);
-
-	if (tail != NULL && tail[0] != '@') {
-		err = ENOENT;
-	} else {
-		err = dsl_prop_get_ds(dd, propname, intsz, numints,
-		    buf, setpoint);
-	}
-
-	dsl_dir_close(dd, FTAG);
-	return (err);
-}
-
-/*
- * Fetch the current value of an integer property through
- * dsl_prop_get(), asking for a single 8-byte integer.
- *
- * The value may already be stale when this returns, so it is NOT safe
- * to call dsl_prop_register() afterwards and assume nothing changed in
- * between.
- *
- * Returns whatever dsl_prop_get() returns (0 on success).
- */
-int
-dsl_prop_get_integer(const char *ddname, const char *propname,
-    uint64_t *valuep, char *setpoint)
-{
-	int err;
-
-	err = dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint);
-	return (err);
-}
-
-/*
- * Unregister a callback previously added with dsl_prop_register().
- * The record is matched on dataset, callback, callback argument and
- * property name.
- *
- * Return 0 on success, ENOMSG if no matching callback is registered.
- */
-int
-dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
-    dsl_prop_changed_cb_t *callback, void *cbarg)
-{
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_cb_record_t *cbr;
-
-	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs);
-	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		if (cbr->cbr_ds == ds &&
-		    cbr->cbr_func == callback &&
-		    cbr->cbr_arg == cbarg &&
-		    strcmp(cbr->cbr_propname, propname) == 0)
-			break;
-	}
-
-	if (cbr == NULL) {
-		mutex_exit(&dd->dd_lock);
-		return (ENOMSG);
-	}
-
-	list_remove(&dd->dd_prop_cbs, cbr);
-	mutex_exit(&dd->dd_lock);
-
-	/*
-	 * Clean up from dsl_prop_register.  The hold is tagged with the
-	 * record's address, so release it while the record is still
-	 * allocated rather than passing a freed pointer as the tag.
-	 */
-	dsl_dir_close(dd, cbr);
-
-	kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
-	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
-
-	return (0);
-}
-
-/*
- * Count the callback records on ds's directory that belong to ds.
- * The directory's dd_lock is held while the list is walked.
- */
-int
-dsl_prop_numcb(dsl_dataset_t *ds)
-{
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_cb_record_t *rec;
-	int count = 0;
-
-	mutex_enter(&dd->dd_lock);
-	rec = list_head(&dd->dd_prop_cbs);
-	while (rec != NULL) {
-		if (rec->cbr_ds == ds)
-			count++;
-		rec = list_next(&dd->dd_prop_cbs, rec);
-	}
-	mutex_exit(&dd->dd_lock);
-
-	return (count);
-}
-
-/*
- * Deliver a changed integer property value to every callback registered
- * for 'propname' on directory object 'ddobj', then recurse into each
- * child directory. 'first' is nonzero only for the directory where the
- * change was made; for descendants the recursion stops as soon as a
- * directory has its own entry for the property. Silently returns if the
- * directory cannot be opened. The caller must hold the config rwlock as
- * writer (asserted below).
- */
-static void
-dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
- const char *propname, uint64_t value, int first)
-{
- dsl_dir_t *dd;
- dsl_prop_cb_record_t *cbr;
- objset_t *mos = dp->dp_meta_objset;
- zap_cursor_t zc;
- zap_attribute_t za;
- int err;
-
- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
- if (err)
- return;
-
- if (!first) {
- /*
- * If the prop is set here, then this change is not
- * being inherited here or below; stop the recursion.
- */
- /* NOTE(review): the lookup writes into 'value'; this relies on zap_lookup leaving it untouched on ENOENT -- confirm. */
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
- 8, 1, &value);
- if (err == 0) {
- dsl_dir_close(dd, FTAG);
- return;
- }
- ASSERT3U(err, ==, ENOENT);
- }
-
- /* Callbacks are invoked with dd_lock held. */
- mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs);
- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (strcmp(cbr->cbr_propname, propname) == 0) {
- cbr->cbr_func(cbr->cbr_arg, value);
- }
- }
- mutex_exit(&dd->dd_lock);
-
- /* Walk the child-directory ZAP; each entry's first integer is used as the child's object number. */
- for (zap_cursor_init(&zc, mos,
- dd->dd_phys->dd_child_dir_zapobj);
- zap_cursor_retrieve(&zc, &za) == 0;
- zap_cursor_advance(&zc)) {
- /* XXX recursion could blow stack; esp. za! */
- dsl_prop_changed_notify(dp, za.za_first_integer,
- propname, value, FALSE);
- }
- zap_cursor_fini(&zc);
- dsl_dir_close(dd, FTAG);
-}
-
-/*
- * Argument bundle passed as arg2 from dsl_prop_set_dd() to
- * dsl_prop_set_sync().
- */
-struct prop_set_arg {
- const char *name; /* property name, used as the ZAP key */
- int intsz; /* size in bytes of each integer in buf */
- int numints; /* number of integers in buf; 0 means remove the property */
- const void *buf; /* new value */
-};
-
-
-/*
- * Sync task that applies a property change to directory arg1 using the
- * struct prop_set_arg in arg2. With numints == 0 the property is removed
- * from the directory's property ZAP; otherwise it is written with
- * zap_update(). If dodefault() accepts the name as an 8-byte integer
- * property, registered callbacks are then notified of the new effective
- * value through dsl_prop_changed_notify().
- */
-static void
-dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- dsl_dir_t *dd = arg1;
- struct prop_set_arg *psa = arg2;
- objset_t *mos = dd->dd_pool->dp_meta_objset;
- uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
- uint64_t intval;
- int isint;
-
- /* Called only to classify the property; intval is overwritten below. */
- isint = (dodefault(psa->name, 8, 1, &intval) == 0);
-
- if (psa->numints == 0) {
- int err = zap_remove(mos, zapobj, psa->name, tx);
- ASSERT(err == 0 || err == ENOENT);
- if (isint) {
- /* After removal the effective value is looked up starting at the parent. */
- VERIFY(0 == dsl_prop_get_impl(dd->dd_parent,
- psa->name, 8, 1, &intval, NULL));
- }
- } else {
- VERIFY(0 == zap_update(mos, zapobj, psa->name,
- psa->intsz, psa->numints, psa->buf, tx));
- if (isint)
- intval = *(uint64_t *)psa->buf;
- }
-
- if (isint) {
- dsl_prop_changed_notify(dd->dd_pool,
- dd->dd_object, psa->name, intval, TRUE);
- }
-}
-
-/*
- * Set (or, with numints == 0, remove) a property on an already-open
- * directory by running dsl_prop_set_sync() as a sync task with no
- * check function.  Returns the result of dsl_sync_task_do().
- */
-int
-dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
-    int intsz, int numints, const void *buf)
-{
-	struct prop_set_arg args;
-	int err;
-
-	args.buf = buf;
-	args.numints = numints;
-	args.intsz = intsz;
-	args.name = propname;
-
-	err = dsl_sync_task_do(dd->dd_pool,
-	    NULL, dsl_prop_set_sync, dd, &args, 2);
-	return (err);
-}
-
-/*
- * Name-based property set.  Rejects over-long names (ENAMETOOLONG) and
- * over-large values (E2BIG) up front, then opens the directory, calls
- * dsl_prop_set_dd() and closes the directory again.
- */
-int
-dsl_prop_set(const char *ddname, const char *propname,
-    int intsz, int numints, const void *buf)
-{
-	dsl_dir_t *dir;
-	int err;
-
-	/*
-	 * These limits have to be enforced here: the sync function that
-	 * eventually applies the change has no way to report failure.
-	 */
-	if (strlen(propname) >= ZAP_MAXNAMELEN)
-		return (ENAMETOOLONG);
-	if (intsz * numints >= ZAP_MAXVALUELEN)
-		return (E2BIG);
-
-	if ((err = dsl_dir_open(ddname, FTAG, &dir, NULL)) != 0)
-		return (err);
-
-	err = dsl_prop_set_dd(dir, propname, intsz, numints, buf);
-	dsl_dir_close(dir, FTAG);
-
-	return (err);
-}
-
-/*
- * Iterate over all properties for this dataset and return them in an nvlist.
- *
- * *nvp is always allocated.  For a snapshot it is returned empty.
- * Otherwise the dataset's directory and each of its ancestors are
- * walked; every property not already present in the list is added as a
- * nested nvlist carrying ZFS_PROP_VALUE and ZFS_PROP_SOURCE (the name of
- * the directory where it was found).  Non-inheritable properties are
- * only taken from the dataset's own directory.
- *
- * Returns 0 on success or the first error from the ZAP routines; on
- * error *nvp is left allocated (possibly partially filled).
- */
-int
-dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
-{
-	dsl_dataset_t *ds = os->os->os_dsl_dataset;
-	dsl_dir_t *dd = ds->ds_dir;
-	int err = 0;
-	dsl_pool_t *dp;
-	objset_t *mos;
-
-	if (dsl_dataset_is_snapshot(ds)) {
-		VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		return (0);
-	}
-
-	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-	dp = dd->dd_pool;
-	mos = dp->dp_meta_objset;
-
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	for (; dd != NULL; dd = dd->dd_parent) {
-		char setpoint[MAXNAMELEN];
-		zap_cursor_t zc;
-		zap_attribute_t za;
-
-		dsl_dir_name(dd, setpoint);
-
-		for (zap_cursor_init(&zc, mos, dd->dd_phys->dd_props_zapobj);
-		    (err = zap_cursor_retrieve(&zc, &za)) == 0;
-		    zap_cursor_advance(&zc)) {
-			nvlist_t *propval;
-			zfs_prop_t prop;
-			/*
-			 * Skip non-inheritable properties.
-			 */
-			if ((prop = zfs_name_to_prop(za.za_name)) !=
-			    ZFS_PROP_INVAL && !zfs_prop_inheritable(prop) &&
-			    dd != ds->ds_dir)
-				continue;
-
-			/* Already recorded from a directory closer to ds. */
-			if (nvlist_lookup_nvlist(*nvp, za.za_name,
-			    &propval) == 0)
-				continue;
-
-			VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME,
-			    KM_SLEEP) == 0);
-			if (za.za_integer_length == 1) {
-				/*
-				 * String property
-				 */
-				char *tmp = kmem_alloc(za.za_num_integers,
-				    KM_SLEEP);
-				err = zap_lookup(mos,
-				    dd->dd_phys->dd_props_zapobj,
-				    za.za_name, 1, za.za_num_integers,
-				    tmp);
-				if (err != 0) {
-					kmem_free(tmp, za.za_num_integers);
-					/* Don't leak the half-built entry. */
-					nvlist_free(propval);
-					break;
-				}
-				VERIFY(nvlist_add_string(propval,
-				    ZFS_PROP_VALUE, tmp) == 0);
-				kmem_free(tmp, za.za_num_integers);
-			} else {
-				/*
-				 * Integer property
-				 */
-				ASSERT(za.za_integer_length == 8);
-				(void) nvlist_add_uint64(propval,
-				    ZFS_PROP_VALUE, za.za_first_integer);
-			}
-
-			VERIFY(nvlist_add_string(propval,
-			    ZFS_PROP_SOURCE, setpoint) == 0);
-			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
-			    propval) == 0);
-			nvlist_free(propval);
-		}
-		zap_cursor_fini(&zc);
-
-		/* ENOENT just means the cursor ran off the end of this ZAP. */
-		if (err != ENOENT)
-			break;
-		err = 0;
-	}
-	rw_exit(&dp->dp_config_rwlock);
-
-	return (err);
-}
-
-/*
- * Add property 'prop' to 'nv' as a nested nvlist holding a single
- * uint64 under ZFS_PROP_VALUE.  The temporary nested list is freed
- * after it has been added.
- */
-void
-dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
-{
-	nvlist_t *entry;
-
-	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-	VERIFY(nvlist_add_uint64(entry, ZFS_PROP_VALUE, value) == 0);
-	VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), entry) == 0);
-
-	nvlist_free(entry);
-}
-
-/*
- * Add property 'prop' to 'nv' as a nested nvlist holding a single
- * string under ZFS_PROP_VALUE.  The temporary nested list is freed
- * after it has been added.
- */
-void
-dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
-{
-	nvlist_t *entry;
-
-	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-	VERIFY(nvlist_add_string(entry, ZFS_PROP_VALUE, value) == 0);
-	VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), entry) == 0);
-
-	nvlist_free(entry);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c b/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
deleted file mode 100644
index 17deb56..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-
-#define DST_AVG_BLKSHIFT 14
-
-/*
- * Check function that always succeeds; dsl_sync_task_create() installs
- * it when the caller passes a NULL checkfunc.
- */
-/* ARGSUSED */
-static int
-dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- return (0);
-}
-
-/*
- * Allocate a zeroed sync task group bound to pool 'dp' with an empty
- * task list.  Release it with dsl_sync_task_group_destroy().
- */
-dsl_sync_task_group_t *
-dsl_sync_task_group_create(dsl_pool_t *dp)
-{
-	dsl_sync_task_group_t *group =
-	    kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP);
-
-	group->dstg_pool = dp;
-	list_create(&group->dstg_tasks, sizeof (dsl_sync_task_t),
-	    offsetof(dsl_sync_task_t, dst_node));
-
-	return (group);
-}
-
-/*
- * Append a task to 'dstg'.  A NULL 'checkfunc' is replaced by
- * dsl_null_checkfunc.  The group's space estimate grows by
- * blocks_modified << DST_AVG_BLKSHIFT.
- */
-void
-dsl_sync_task_create(dsl_sync_task_group_t *dstg,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified)
-{
-	dsl_sync_task_t *task;
-
-	task = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP);
-	task->dst_checkfunc =
-	    (checkfunc != NULL) ? checkfunc : dsl_null_checkfunc;
-	task->dst_syncfunc = syncfunc;
-	task->dst_arg1 = arg1;
-	task->dst_arg2 = arg2;
-
-	list_insert_tail(&dstg->dstg_tasks, task);
-	dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
-}
-
-/*
- * Queue the task group for execution in sync context and wait for it.
- * Each attempt assigns a tx, runs the check functions once as a
- * preliminary test under the config lock (reader), and, if they pass,
- * adds the group to the pool's dp_sync_tasks list for that txg and waits
- * for the txg to sync. The whole sequence is retried while the group's
- * error is EAGAIN. Returns the group's final error (0 on success).
- */
-int
-dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
-{
- dmu_tx_t *tx;
- uint64_t txg;
- dsl_sync_task_t *dst;
-
-top:
- tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
-
- txg = dmu_tx_get_txg(tx);
-
- /* Do a preliminary error check. */
- dstg->dstg_err = 0;
- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
- for (dst = list_head(&dstg->dstg_tasks); dst;
- dst = list_next(&dstg->dstg_tasks, dst)) {
-#ifdef ZFS_DEBUG
- /*
- * Only check half the time, otherwise, the sync-context
- * check will almost never fail.
- */
- if (spa_get_random(2) == 0)
- continue;
-#endif
- dst->dst_err =
- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
- /* The group keeps the error of the last failing task. */
- if (dst->dst_err)
- dstg->dstg_err = dst->dst_err;
- }
- rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
- /* A failed preliminary check ends the attempt without queueing the group. */
- if (dstg->dstg_err) {
- dmu_tx_commit(tx);
- return (dstg->dstg_err);
- }
-
- VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
-
- dmu_tx_commit(tx);
-
- txg_wait_synced(dstg->dstg_pool, txg);
-
- /* dsl_sync_task_group_sync() reports a failed space reservation as EAGAIN. */
- if (dstg->dstg_err == EAGAIN)
- goto top;
-
- return (dstg->dstg_err);
-}
-
-/*
- * Free every task still on the group's list, then the group itself.
- */
-void
-dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
-{
-	dsl_sync_task_t *task = list_head(&dstg->dstg_tasks);
-
-	while (task != NULL) {
-		list_remove(&dstg->dstg_tasks, task);
-		kmem_free(task, sizeof (dsl_sync_task_t));
-		task = list_head(&dstg->dstg_tasks);
-	}
-
-	kmem_free(dstg, sizeof (dsl_sync_task_group_t));
-}
-
-/*
- * Execute a task group in sync context. Space is temporarily reserved
- * against the MOS directory first; if that fails the error is stored in
- * dstg_err (ERESTART is mapped to EAGAIN) and nothing else happens.
- * Otherwise, with the config lock held as writer, every check function
- * is run again and, only if all of them pass, every sync function is
- * called in list order. The outcome is left in dstg->dstg_err.
- */
-void
-dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
-{
- dsl_sync_task_t *dst;
- void *tr_cookie;
-
- ASSERT3U(dstg->dstg_err, ==, 0);
-
- /*
- * Check for sufficient space.
- */
- dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
- dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx);
- /* don't bother trying again */
- if (dstg->dstg_err == ERESTART)
- dstg->dstg_err = EAGAIN;
- /* On failure we return without calling dsl_dir_tempreserve_clear(). */
- if (dstg->dstg_err)
- return;
-
- /*
- * Check for errors by calling checkfuncs.
- */
- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
- for (dst = list_head(&dstg->dstg_tasks); dst;
- dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_err =
- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
- if (dst->dst_err)
- dstg->dstg_err = dst->dst_err;
- }
-
- if (dstg->dstg_err == 0) {
- /*
- * Execute sync tasks.
- */
- for (dst = list_head(&dstg->dstg_tasks); dst;
- dst = list_next(&dstg->dstg_tasks, dst)) {
- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
- }
- }
- rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
- dsl_dir_tempreserve_clear(tr_cookie, tx);
-}
-
-/*
- * Convenience wrapper for running a single sync task: builds a
- * one-task group, waits for it, tears the group down, and returns the
- * result of dsl_sync_task_group_wait().
- */
-int
-dsl_sync_task_do(dsl_pool_t *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified)
-{
-	dsl_sync_task_group_t *group = dsl_sync_task_group_create(dp);
-	int result;
-
-	dsl_sync_task_create(group, checkfunc, syncfunc,
-	    arg1, arg2, blocks_modified);
-	result = dsl_sync_task_group_wait(group);
-	dsl_sync_task_group_destroy(group);
-
-	return (result);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c b/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
deleted file mode 100644
index edda3c9..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/fletcher.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/byteorder.h>
-#include <sys/spa.h>
-
-/*
- * Fletcher-2 checksum over 'size' bytes of 'buf', read as native-order
- * 64-bit words in two interleaved lanes (even and odd words).  The
- * result is stored in *zcp as (a0, a1, b0, b1).
- *
- * NOTE(review): two words are consumed per step, so an odd word count
- * reads one word past the end -- presumably callers always pass an
- * even number of words; confirm.
- */
-void
-fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
-	const uint64_t *words = buf;
-	uint64_t nwords = size / sizeof (uint64_t);
-	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
-	uint64_t i;
-
-	for (i = 0; i < nwords; i += 2) {
-		a0 += words[i];
-		a1 += words[i + 1];
-		b0 += a0;
-		b1 += a1;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
-}
-
-/*
- * Same computation as fletcher_2_native(), except that every 64-bit
- * word is passed through BSWAP_64() before it is accumulated.
- *
- * NOTE(review): two words are consumed per step, so an odd word count
- * reads one word past the end -- presumably callers always pass an
- * even number of words; confirm.
- */
-void
-fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
-	const uint64_t *words = buf;
-	uint64_t nwords = size / sizeof (uint64_t);
-	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
-	uint64_t i;
-
-	for (i = 0; i < nwords; i += 2) {
-		a0 += BSWAP_64(words[i]);
-		a1 += BSWAP_64(words[i + 1]);
-		b0 += a0;
-		b1 += a1;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
-}
-
-/*
- * Fletcher-4 checksum over 'size' bytes of 'buf', read as native-order
- * 32-bit words and accumulated into four chained 64-bit sums.  Bytes
- * beyond the last whole 32-bit word are ignored.  The result is stored
- * in *zcp as (a, b, c, d).
- */
-void
-fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
-	const uint32_t *words = buf;
-	uint64_t nwords = size / sizeof (uint32_t);
-	uint64_t a = 0, b = 0, c = 0, d = 0;
-	uint64_t i;
-
-	for (i = 0; i < nwords; i++) {
-		a += words[i];
-		b += a;
-		c += b;
-		d += c;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-/*
- * Same computation as fletcher_4_native(), except that every 32-bit
- * word is passed through BSWAP_32() before it is accumulated.
- */
-void
-fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
-	const uint32_t *words = buf;
-	uint64_t nwords = size / sizeof (uint32_t);
-	uint64_t a = 0, b = 0, c = 0, d = 0;
-	uint64_t i;
-
-	for (i = 0; i < nwords; i++) {
-		a += BSWAP_32(words[i]);
-		b += a;
-		c += b;
-		d += c;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-/*
- * Continue a Fletcher-4 checksum: the four running sums are loaded
- * from zcp->zc_word[0..3], advanced over 'size' bytes of 'buf' (native
- * 32-bit words), and written back to *zcp.
- */
-void
-fletcher_4_incremental_native(const void *buf, uint64_t size,
-    zio_cksum_t *zcp)
-{
-	const uint32_t *words = buf;
-	uint64_t nwords = size / sizeof (uint32_t);
-	uint64_t a = zcp->zc_word[0];
-	uint64_t b = zcp->zc_word[1];
-	uint64_t c = zcp->zc_word[2];
-	uint64_t d = zcp->zc_word[3];
-	uint64_t i;
-
-	for (i = 0; i < nwords; i++) {
-		a += words[i];
-		b += a;
-		c += b;
-		d += c;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-/*
- * Continue a Fletcher-4 checksum over byte-swapped input: the running
- * sums are loaded from zcp->zc_word[0..3], advanced over 'size' bytes
- * of 'buf' with each 32-bit word passed through BSWAP_32(), and
- * written back to *zcp.
- */
-void
-fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
-    zio_cksum_t *zcp)
-{
-	const uint32_t *words = buf;
-	uint64_t nwords = size / sizeof (uint32_t);
-	uint64_t a = zcp->zc_word[0];
-	uint64_t b = zcp->zc_word[1];
-	uint64_t c = zcp->zc_word[2];
-	uint64_t d = zcp->zc_word[3];
-	uint64_t i;
-
-	for (i = 0; i < nwords; i++) {
-		a += BSWAP_32(words[i]);
-		b += a;
-		c += b;
-		d += c;
-	}
-
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c b/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c
deleted file mode 100644
index b257d4a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/gzip.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/debug.h>
-#include <sys/types.h>
-#include <sys/zmod.h>
-
-#ifdef _KERNEL
-#include <sys/systm.h>
-#else
-#include <strings.h>
-#endif
-
-/*
- * Compress s_len bytes from s_start into d_start (capacity d_len) at
- * gzip level 'n'.  Returns the compressed length when
- * z_compress_level() reports Z_OK.  Otherwise returns s_len; in that
- * case the source is copied verbatim to the destination only when the
- * two buffers have the same length.
- */
-size_t
-gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
-	size_t out_len = d_len;
-
-	ASSERT(d_len <= s_len);
-
-	if (z_compress_level(d_start, &out_len, s_start, s_len, n) == Z_OK)
-		return (out_len);
-
-	if (d_len == s_len)
-		bcopy(s_start, d_start, s_len);
-
-	return (s_len);
-}
-
-/*
- * Decompress s_len bytes from s_start into d_start (capacity d_len).
- * Returns 0 when z_uncompress() reports Z_OK and -1 otherwise.  The
- * level argument 'n' is unused.
- */
-/*ARGSUSED*/
-int
-gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
-	size_t out_len = d_len;
-
-	ASSERT(d_len >= s_len);
-
-	return (z_uncompress(d_start, &out_len, s_start, s_len) != Z_OK ?
-	    -1 : 0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c b/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
deleted file mode 100644
index a88b85c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * We keep our own copy of this algorithm for 2 main reasons:
- * 1. If we didn't, anyone modifying common/os/compress.c would
- * directly break our on disk format
- * 2. Our version of lzjb does not have a number of checks that the
- * common/os version needs and uses
- * In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return -1 if the data will not
- * compress to d_len or less.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/types.h>
-
-#define MATCH_BITS 6
-#define MATCH_MIN 3
-#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
-#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
-#define LEMPEL_SIZE 256
-
-/*
- * LZJB-compress s_len bytes from s_start into d_start (capacity d_len).
- * The output is a sequence of groups: one "copymap" byte followed by up
- * to NBBY items, where a set bit in the copymap marks a two-byte
- * (length, offset) back-reference and a clear bit marks a literal byte.
- *
- * Returns the number of bytes written. If the output is about to run
- * out of room, returns s_len instead; in that case the source is copied
- * verbatim into the destination only when d_len == s_len. 'n' is unused.
- */
-/*ARGSUSED*/
-size_t
-lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *cpy, *copymap;
- int copymask = 1 << (NBBY - 1);
- int mlen, offset;
- uint16_t *hp;
- uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
-
- while (src < (uchar_t *)s_start + s_len) {
- /* Start a new group once all NBBY bits of the current copymap are used. */
- if ((copymask <<= 1) == (1 << NBBY)) {
- /* Give up if a full group (copymap + NBBY two-byte items) might not fit. */
- if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
- if (d_len != s_len)
- return (s_len);
- mlen = s_len;
- for (src = s_start, dst = d_start; mlen; mlen--)
- *dst++ = *src++;
- return (s_len);
- }
- copymask = 1;
- copymap = dst;
- *dst++ = 0;
- }
- /* Too close to the end of the input to attempt a match: emit a literal. */
- if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
- *dst++ = *src++;
- continue;
- }
- /* Hash the next three bytes to pick a table slot. */
- hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
- (LEMPEL_SIZE - 1)];
- /* The slot keeps only the low 16 bits of a source address; offset is the masked distance to it. */
- offset = (intptr_t)(src - *hp) & OFFSET_MASK;
- *hp = (uint16_t)(uintptr_t)src;
- cpy = src - offset;
- /* Accept the candidate only if it lies inside the source, is not src itself, and its first three bytes match. */
- if (cpy >= (uchar_t *)s_start && cpy != src &&
- src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
- *copymap |= copymask;
- for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
- if (src[mlen] != cpy[mlen])
- break;
- /* Byte 1: match length minus MATCH_MIN in the top MATCH_BITS bits, offset high bits below; byte 2: offset low byte. */
- *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
- (offset >> NBBY);
- *dst++ = (uchar_t)offset;
- src += mlen;
- } else {
- *dst++ = *src++;
- }
- }
- return (dst - (uchar_t *)d_start);
-}
-
-/*
- * Decompress LZJB data from s_start into d_start until exactly d_len
- * bytes have been produced. Returns 0 on success, or -1 if a
- * back-reference points before the start of the destination buffer.
- * 's_len' and 'n' are unused.
- *
- * NOTE(review): src is never bounds-checked against s_len, so this
- * presumably relies on the input being well-formed -- confirm.
- */
-/*ARGSUSED*/
-int
-lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
-{
- uchar_t *src = s_start;
- uchar_t *dst = d_start;
- uchar_t *d_end = (uchar_t *)d_start + d_len;
- uchar_t *cpy, copymap;
- int copymask = 1 << (NBBY - 1);
-
- while (dst < d_end) {
- /* The first pass always takes this branch, so copymap is loaded before it is tested. */
- if ((copymask <<= 1) == (1 << NBBY)) {
- copymask = 1;
- copymap = *src++;
- }
- if (copymap & copymask) {
- /* Two-byte back-reference: length in the top MATCH_BITS bits, offset in the rest. */
- int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
- int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
- src += 2;
- if ((cpy = dst - offset) < (uchar_t *)d_start)
- return (-1);
- /* Byte-by-byte copy, clipped at the end of the destination. */
- while (--mlen >= 0 && dst < d_end)
- *dst++ = *cpy++;
- } else {
- *dst++ = *src++;
- }
- }
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
deleted file mode 100644
index 0dba134..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ /dev/null
@@ -1,1023 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/space_map.h>
-#include <sys/metaslab_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-
-uint64_t metaslab_aliquot = 512ULL << 10;
-
-/*
- * ==========================================================================
- * Metaslab classes
- * ==========================================================================
- */
-metaslab_class_t *
-metaslab_class_create(void)
-{
- metaslab_class_t *mc;
-
- mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
-
- mc->mc_rotor = NULL;
-
- return (mc);
-}
-
-void
-metaslab_class_destroy(metaslab_class_t *mc)
-{
- metaslab_group_t *mg;
-
- while ((mg = mc->mc_rotor) != NULL) {
- metaslab_class_remove(mc, mg);
- metaslab_group_destroy(mg);
- }
-
- kmem_free(mc, sizeof (metaslab_class_t));
-}
-
-void
-metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
-{
- metaslab_group_t *mgprev, *mgnext;
-
- ASSERT(mg->mg_class == NULL);
-
- if ((mgprev = mc->mc_rotor) == NULL) {
- mg->mg_prev = mg;
- mg->mg_next = mg;
- } else {
- mgnext = mgprev->mg_next;
- mg->mg_prev = mgprev;
- mg->mg_next = mgnext;
- mgprev->mg_next = mg;
- mgnext->mg_prev = mg;
- }
- mc->mc_rotor = mg;
- mg->mg_class = mc;
-}
-
-void
-metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
-{
- metaslab_group_t *mgprev, *mgnext;
-
- ASSERT(mg->mg_class == mc);
-
- mgprev = mg->mg_prev;
- mgnext = mg->mg_next;
-
- if (mg == mgnext) {
- mc->mc_rotor = NULL;
- } else {
- mc->mc_rotor = mgnext;
- mgprev->mg_next = mgnext;
- mgnext->mg_prev = mgprev;
- }
-
- mg->mg_prev = NULL;
- mg->mg_next = NULL;
- mg->mg_class = NULL;
-}
-
-/*
- * ==========================================================================
- * Metaslab groups
- * ==========================================================================
- */
-static int
-metaslab_compare(const void *x1, const void *x2)
-{
- const metaslab_t *m1 = x1;
- const metaslab_t *m2 = x2;
-
- if (m1->ms_weight < m2->ms_weight)
- return (1);
- if (m1->ms_weight > m2->ms_weight)
- return (-1);
-
- /*
- * If the weights are identical, use the offset to force uniqueness.
- */
- if (m1->ms_map.sm_start < m2->ms_map.sm_start)
- return (-1);
- if (m1->ms_map.sm_start > m2->ms_map.sm_start)
- return (1);
-
- ASSERT3P(m1, ==, m2);
-
- return (0);
-}
-
-metaslab_group_t *
-metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
-{
- metaslab_group_t *mg;
-
- mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
- mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&mg->mg_metaslab_tree, metaslab_compare,
- sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
- mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children);
- mg->mg_vd = vd;
- metaslab_class_add(mc, mg);
-
- return (mg);
-}
-
-void
-metaslab_group_destroy(metaslab_group_t *mg)
-{
- avl_destroy(&mg->mg_metaslab_tree);
- mutex_destroy(&mg->mg_lock);
- kmem_free(mg, sizeof (metaslab_group_t));
-}
-
-static void
-metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
-{
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == NULL);
- msp->ms_group = mg;
- msp->ms_weight = 0;
- avl_add(&mg->mg_metaslab_tree, msp);
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
-{
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_group = NULL;
- mutex_exit(&mg->mg_lock);
-}
-
-static void
-metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
-{
- /*
- * Although in principle the weight can be any value, in
- * practice we do not use values in the range [1, 510].
- */
- ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_weight = weight;
- avl_add(&mg->mg_metaslab_tree, msp);
- mutex_exit(&mg->mg_lock);
-}
-
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
-static void
-metaslab_ff_load(space_map_t *sm)
-{
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-}
-
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- space_seg_t *ss, ssearch;
- avl_index_t where;
-
- ssearch.ss_start = *cursor;
- ssearch.ss_end = *cursor + size;
-
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
-
- while (ss != NULL) {
- uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
- if (offset + size <= ss->ss_end) {
- *cursor = offset + size;
- return (offset);
- }
- ss = AVL_NEXT(t, ss);
- }
-
- /*
- * If we know we've searched the whole map (*cursor == 0), give up.
- * Otherwise, reset the cursor to the beginning and try again.
- */
- if (*cursor == 0)
- return (-1ULL);
-
- *cursor = 0;
- return (metaslab_ff_alloc(sm, size));
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
-}
-
-static space_map_ops_t metaslab_ff_ops = {
- metaslab_ff_load,
- metaslab_ff_unload,
- metaslab_ff_alloc,
- metaslab_ff_claim,
- metaslab_ff_free
-};
-
-/*
- * ==========================================================================
- * Metaslabs
- * ==========================================================================
- */
-metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg)
-{
- vdev_t *vd = mg->mg_vd;
- metaslab_t *msp;
-
- msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
- mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
-
- msp->ms_smo_syncing = *smo;
-
- /*
- * We create the main space map here, but we don't create the
- * allocmaps and freemaps until metaslab_sync_done(). This serves
- * two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that we'd
- * data fault on any attempt to use this metaslab before it's ready.
- */
- space_map_create(&msp->ms_map, start, size,
- vd->vdev_ashift, &msp->ms_lock);
-
- metaslab_group_add(mg, msp);
-
- /*
- * If we're opening an existing pool (txg == 0) or creating
- * a new one (txg == TXG_INITIAL), all space is available now.
- * If we're adding space to an existing pool, the new space
- * does not become available until after this txg has synced.
- */
- if (txg <= TXG_INITIAL)
- metaslab_sync_done(msp, 0);
-
- if (txg != 0) {
- /*
- * The vdev is dirty, but the metaslab isn't -- it just needs
- * to have metaslab_sync_done() invoked from vdev_sync_done().
- * [We could just dirty the metaslab, but that would cause us
- * to allocate a space map object for it, which is wasteful
- * and would mess up the locality logic in metaslab_weight().]
- */
- ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa));
- vdev_dirty(vd, 0, NULL, txg);
- vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg));
- }
-
- return (msp);
-}
-
-void
-metaslab_fini(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- int t;
-
- vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc);
-
- metaslab_group_remove(mg, msp);
-
- mutex_enter(&msp->ms_lock);
-
- space_map_unload(&msp->ms_map);
- space_map_destroy(&msp->ms_map);
-
- for (t = 0; t < TXG_SIZE; t++) {
- space_map_destroy(&msp->ms_allocmap[t]);
- space_map_destroy(&msp->ms_freemap[t]);
- }
-
- mutex_exit(&msp->ms_lock);
- mutex_destroy(&msp->ms_lock);
-
- kmem_free(msp, sizeof (metaslab_t));
-}
-
-#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
-#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define METASLAB_SMO_BONUS_MULTIPLIER 2
-
-static uint64_t
-metaslab_weight(metaslab_t *msp)
-{
- metaslab_group_t *mg = msp->ms_group;
- space_map_t *sm = &msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo;
- vdev_t *vd = mg->mg_vd;
- uint64_t weight, space;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- /*
- * The baseline weight is the metaslab's free space.
- */
- space = sm->sm_size - smo->smo_alloc;
- weight = space;
-
- /*
- * Modern disks have uniform bit density and constant angular velocity.
- * Therefore, the outer recording zones are faster (higher bandwidth)
- * than the inner zones by the ratio of outer to inner track diameter,
- * which is typically around 2:1. We account for this by assigning
- * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
- * In effect, this means that we'll select the metaslab with the most
- * free bandwidth rather than simply the one with the most free space.
- */
- weight = 2 * weight -
- ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
- ASSERT(weight >= space && weight <= 2 * space);
-
- /*
- * For locality, assign higher weight to metaslabs we've used before.
- */
- if (smo->smo_object != 0)
- weight *= METASLAB_SMO_BONUS_MULTIPLIER;
- ASSERT(weight >= space &&
- weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
-
- /*
- * If this metaslab is one we're actively using, adjust its weight to
- * make it preferable to any inactive metaslab so we'll polish it off.
- */
- weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
-
- return (weight);
-}
-
-static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
-{
- space_map_t *sm = &msp->ms_map;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, &metaslab_ff_ops,
- SM_FREE, &msp->ms_smo,
- msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
- if (error) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
- }
- metaslab_group_sort(msp->ms_group, msp,
- msp->ms_weight | activation_weight);
- }
- ASSERT(sm->sm_loaded);
- ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
-
- return (0);
-}
-
-static void
-metaslab_passivate(metaslab_t *msp, uint64_t size)
-{
- /*
- * If size < SPA_MINBLOCKSIZE, then we will not allocate from
- * this metaslab again. In that case, it had better be empty,
- * or we would be leaving space on the table.
- */
- ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
- metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
- ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
-}
-
-/*
- * Write a metaslab to disk in the context of the specified transaction group.
- */
-void
-metaslab_sync(metaslab_t *msp, uint64_t txg)
-{
- vdev_t *vd = msp->ms_group->mg_vd;
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
- space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
- space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- space_map_t *sm = &msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo_syncing;
- dmu_buf_t *db;
- dmu_tx_t *tx;
- int t;
-
- tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
-
- /*
- * The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_map. No other thread can
- * be modifying this txg's allocmap, freemap, freed_map, or smo.
- * Therefore, we only hold ms_lock to satify space_map ASSERTs.
- * We drop it whenever we call into the DMU, because the DMU
- * can call down to us (e.g. via zio_free()) at any time.
- */
- mutex_enter(&msp->ms_lock);
-
- if (smo->smo_object == 0) {
- ASSERT(smo->smo_objsize == 0);
- ASSERT(smo->smo_alloc == 0);
- mutex_exit(&msp->ms_lock);
- smo->smo_object = dmu_object_alloc(mos,
- DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
- DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
- ASSERT(smo->smo_object != 0);
- dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
- (sm->sm_start >> vd->vdev_ms_shift),
- sizeof (uint64_t), &smo->smo_object, tx);
- mutex_enter(&msp->ms_lock);
- }
-
- space_map_walk(freemap, space_map_add, freed_map);
-
- if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
- 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
- /*
- * The in-core space map representation is twice as compact
- * as the on-disk one, so it's time to condense the latter
- * by generating a pure allocmap from first principles.
- *
- * This metaslab is 100% allocated,
- * minus the content of the in-core map (sm),
- * minus what's been freed this txg (freed_map),
- * minus allocations from txgs in the future
- * (because they haven't been committed yet).
- */
- space_map_vacate(allocmap, NULL, NULL);
- space_map_vacate(freemap, NULL, NULL);
-
- space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
-
- space_map_walk(sm, space_map_remove, allocmap);
- space_map_walk(freed_map, space_map_remove, allocmap);
-
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
- space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
- space_map_remove, allocmap);
-
- mutex_exit(&msp->ms_lock);
- space_map_truncate(smo, mos, tx);
- mutex_enter(&msp->ms_lock);
- }
-
- space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
- space_map_sync(freemap, SM_FREE, smo, mos, tx);
-
- mutex_exit(&msp->ms_lock);
-
- VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db, FTAG);
-
- dmu_tx_commit(tx);
-}
-
-/*
- * Called after a transaction group has completely synced to mark
- * all of the metaslab's free space as usable.
- */
-void
-metaslab_sync_done(metaslab_t *msp, uint64_t txg)
-{
- space_map_obj_t *smo = &msp->ms_smo;
- space_map_obj_t *smosync = &msp->ms_smo_syncing;
- space_map_t *sm = &msp->ms_map;
- space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- int t;
-
- mutex_enter(&msp->ms_lock);
-
- /*
- * If this metaslab is just becoming available, initialize its
- * allocmaps and freemaps and add its capacity to the vdev.
- */
- if (freed_map->sm_size == 0) {
- for (t = 0; t < TXG_SIZE; t++) {
- space_map_create(&msp->ms_allocmap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
- space_map_create(&msp->ms_freemap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
- }
- vdev_space_update(vd, sm->sm_size, 0);
- }
-
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
-
- ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
- ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
-
- /*
- * If there's a space_map_load() in progress, wait for it to complete
- * so that we have a consistent view of the in-core space map.
- * Then, add everything we freed in this txg to the map.
- */
- space_map_load_wait(sm);
- space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm);
-
- *smo = *smosync;
-
- /*
- * If the map is loaded but no longer active, evict it as soon as all
- * future allocations have synced. (If we unloaded it now and then
- * loaded a moment later, the map wouldn't reflect those allocations.)
- */
- if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int evictable = 1;
-
- for (t = 1; t < TXG_CONCURRENT_STATES; t++)
- if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
- evictable = 0;
-
- if (evictable)
- space_map_unload(sm);
- }
-
- metaslab_group_sort(mg, msp, metaslab_weight(msp));
-
- mutex_exit(&msp->ms_lock);
-}
-
-static uint64_t
-metaslab_distance(metaslab_t *msp, dva_t *dva)
-{
- uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
- uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
- uint64_t start = msp->ms_map.sm_start >> ms_shift;
-
- if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
- return (1ULL << 63);
-
- if (offset < start)
- return ((start - offset) << ms_shift);
- if (offset > start)
- return ((offset - start) << ms_shift);
- return (0);
-}
-
-static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
- uint64_t min_distance, dva_t *dva, int d)
-{
- metaslab_t *msp = NULL;
- uint64_t offset = -1ULL;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- uint64_t activation_weight;
- uint64_t target_distance;
- int i;
-
- activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++)
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
- activation_weight = METASLAB_WEIGHT_SECONDARY;
-
- for (;;) {
- mutex_enter(&mg->mg_lock);
- for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
- if (msp->ms_weight < size) {
- mutex_exit(&mg->mg_lock);
- return (-1ULL);
- }
-
- if (activation_weight == METASLAB_WEIGHT_PRIMARY)
- break;
-
- target_distance = min_distance +
- (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
-
- for (i = 0; i < d; i++)
- if (metaslab_distance(msp, &dva[i]) <
- target_distance)
- break;
- if (i == d)
- break;
- }
- mutex_exit(&mg->mg_lock);
- if (msp == NULL)
- return (-1ULL);
-
- mutex_enter(&msp->ms_lock);
-
- /*
- * Ensure that the metaslab we have selected is still
- * capable of handling our request. It's possible that
- * another thread may have changed the weight while we
- * were blocked on the metaslab lock.
- */
- if (msp->ms_weight < size) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
- activation_weight == METASLAB_WEIGHT_PRIMARY) {
- metaslab_passivate(msp,
- msp->ms_weight & ~METASLAB_ACTIVE_MASK);
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if (metaslab_activate(msp, activation_weight) != 0) {
- mutex_exit(&msp->ms_lock);
- continue;
- }
-
- if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
- break;
-
- metaslab_passivate(msp, size - 1);
-
- mutex_exit(&msp->ms_lock);
- }
-
- if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
-
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
-
- mutex_exit(&msp->ms_lock);
-
- return (offset);
-}
-
-/*
- * Allocate a block for the specified i/o.
- */
-static int
-metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
- dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
-{
- metaslab_group_t *mg, *rotor;
- metaslab_class_t *mc;
- vdev_t *vd;
- int dshift = 3;
- int all_zero;
- uint64_t offset = -1ULL;
- uint64_t asize;
- uint64_t distance;
-
- ASSERT(!DVA_IS_VALID(&dva[d]));
-
- mc = spa_metaslab_class_select(spa);
-
- /*
- * Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_allocated because
- * nothing actually breaks if we miss a few updates -- we just won't
- * allocate quite as evenly. It all balances out over time.
- *
- * If we are doing ditto or log blocks, try to spread them across
- * consecutive vdevs. If we're forced to reuse a vdev before we've
- * allocated all of our ditto blocks, then try and spread them out on
- * that vdev as much as possible. If it turns out to not be possible,
- * gradually lower our standards until anything becomes acceptable.
- * Also, allocating on consecutive vdevs (as opposed to random vdevs)
- * gives us hope of containing our fault domains to something we're
- * able to reason about. Otherwise, any two top-level vdev failures
- * will guarantee the loss of data. With consecutive allocation,
- * only two adjacent top-level vdev failures will result in data loss.
- *
- * If we are doing gang blocks (hintdva is non-NULL), try to keep
- * ourselves on the same vdev as our gang block header. That
- * way, we can hope for locality in vdev_cache, plus it makes our
- * fault domains something tractable.
- */
- if (hintdva) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- if (hintdva_avoid)
- mg = vd->vdev_mg->mg_next;
- else
- mg = vd->vdev_mg;
- } else if (d != 0) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
- mg = vd->vdev_mg->mg_next;
- } else {
- mg = mc->mc_rotor;
- }
- rotor = mg;
-
-top:
- all_zero = B_TRUE;
- do {
- vd = mg->mg_vd;
-
- distance = vd->vdev_asize >> dshift;
- if (distance <= (1ULL << vd->vdev_ms_shift))
- distance = 0;
- else
- all_zero = B_FALSE;
-
- asize = vdev_psize_to_asize(vd, psize);
- ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
-
- offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
- if (offset != -1ULL) {
- /*
- * If we've just selected this metaslab group,
- * figure out whether the corresponding vdev is
- * over- or under-used relative to the pool,
- * and set an allocation bias to even it out.
- */
- if (mc->mc_allocated == 0) {
- vdev_stat_t *vs = &vd->vdev_stat;
- uint64_t alloc, space;
- int64_t vu, su;
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
-
- /*
- * Determine percent used in units of 0..1024.
- * (This is just to avoid floating point.)
- */
- vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- su = (alloc << 10) / (space + 1);
-
- /*
- * Bias by at most +/- 25% of the aliquot.
- */
- mg->mg_bias = ((su - vu) *
- (int64_t)mg->mg_aliquot) / (1024 * 4);
- }
-
- if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
- mg->mg_aliquot + mg->mg_bias) {
- mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
- }
-
- DVA_SET_VDEV(&dva[d], vd->vdev_id);
- DVA_SET_OFFSET(&dva[d], offset);
- DVA_SET_GANG(&dva[d], 0);
- DVA_SET_ASIZE(&dva[d], asize);
-
- return (0);
- }
- mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
- } while ((mg = mg->mg_next) != rotor);
-
- if (!all_zero) {
- dshift++;
- ASSERT(dshift < 64);
- goto top;
- }
-
- bzero(&dva[d], sizeof (dva_t));
-
- return (ENOSPC);
-}
-
-/*
- * Free the block represented by DVA in the context of the specified
- * transaction group.
- */
-static void
-metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
-{
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
- metaslab_t *msp;
-
- ASSERT(DVA_IS_VALID(dva));
-
- if (txg > spa_freeze_txg(spa))
- return;
-
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
- cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
- (u_longlong_t)vdev, (u_longlong_t)offset);
- ASSERT(0);
- return;
- }
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- if (DVA_GET_GANG(dva))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- mutex_enter(&msp->ms_lock);
-
- if (now) {
- space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
- offset, size);
- space_map_free(&msp->ms_map, offset, size);
- } else {
- if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
- space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
-
- /*
- * verify that this region is actually allocated in
- * either a ms_allocmap or the ms_map
- */
- if (msp->ms_map.sm_loaded) {
- boolean_t allocd = B_FALSE;
- int i;
-
- if (!space_map_contains(&msp->ms_map, offset, size)) {
- allocd = B_TRUE;
- } else {
- for (i = 0; i < TXG_CONCURRENT_STATES; i++) {
- space_map_t *sm = &msp->ms_allocmap
- [(txg - i) & TXG_MASK];
- if (space_map_contains(sm,
- offset, size)) {
- allocd = B_TRUE;
- break;
- }
- }
- }
-
- if (!allocd) {
- zfs_panic_recover("freeing free segment "
- "(vdev=%llu offset=%llx size=%llx)",
- (longlong_t)vdev, (longlong_t)offset,
- (longlong_t)size);
- }
- }
-
-
- }
-
- mutex_exit(&msp->ms_lock);
-}
-
-/*
- * Intent log support: upon opening the pool after a crash, notify the SPA
- * of blocks that the intent log has allocated for immediate write, but
- * which are still considered free by the SPA because the last transaction
- * group didn't commit yet.
- */
-static int
-metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
-{
- uint64_t vdev = DVA_GET_VDEV(dva);
- uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t size = DVA_GET_ASIZE(dva);
- vdev_t *vd;
- metaslab_t *msp;
- int error;
-
- ASSERT(DVA_IS_VALID(dva));
-
- if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
- (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
- return (ENXIO);
-
- msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-
- if (DVA_GET_GANG(dva))
- size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-
- mutex_enter(&msp->ms_lock);
-
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error) {
- mutex_exit(&msp->ms_lock);
- return (error);
- }
-
- if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
-
- space_map_claim(&msp->ms_map, offset, size);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
-
- mutex_exit(&msp->ms_lock);
-
- return (0);
-}
-
-int
-metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
- uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
-{
- dva_t *dva = bp->blk_dva;
- dva_t *hintdva = hintbp->blk_dva;
- int d;
- int error = 0;
-
- ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
- ASSERT(BP_GET_NDVAS(bp) == 0);
- ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
-
- for (d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
- txg, hintbp_avoid);
- if (error) {
- for (d--; d >= 0; d--) {
- metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
- bzero(&dva[d], sizeof (dva_t));
- }
- return (error);
- }
- }
- ASSERT(error == 0);
- ASSERT(BP_GET_NDVAS(bp) == ndvas);
-
- return (0);
-}
-
-void
-metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- int d;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- for (d = 0; d < ndvas; d++)
- metaslab_free_dva(spa, &dva[d], txg, now);
-}
-
-int
-metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- int d, error;
- int last_error = 0;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- for (d = 0; d < ndvas; d++)
- if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
- last_error = error;
-
- return (last_error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c b/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c
deleted file mode 100644
index 411ed46..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#if defined(DEBUG) || !defined(_KERNEL)
-
-#ifdef _KERNEL
-int reference_tracking_enable = FALSE; /* runs out of memory too easily */
-#else
-int reference_tracking_enable = TRUE;
-#endif
-int reference_history = 4; /* tunable */
-
-static kmem_cache_t *reference_cache;
-static kmem_cache_t *reference_history_cache;
-
-void
-refcount_init(void)
-{
- reference_cache = kmem_cache_create("reference_cache",
- sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- reference_history_cache = kmem_cache_create("reference_history_cache",
- sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-refcount_fini(void)
-{
- kmem_cache_destroy(reference_cache);
- kmem_cache_destroy(reference_history_cache);
-}
-
-void
-refcount_create(refcount_t *rc)
-{
- list_create(&rc->rc_list, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- list_create(&rc->rc_removed, sizeof (reference_t),
- offsetof(reference_t, ref_link));
- mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
-}
-
-void
-refcount_destroy_many(refcount_t *rc, uint64_t number)
-{
- reference_t *ref;
-
- ASSERT(rc->rc_count == number);
- while (ref = list_head(&rc->rc_list)) {
- list_remove(&rc->rc_list, ref);
- kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_list);
-
- while (ref = list_head(&rc->rc_removed)) {
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache, ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_removed);
- mutex_destroy(&rc->rc_mtx);
-}
-
-void
-refcount_destroy(refcount_t *rc)
-{
- refcount_destroy_many(rc, 0);
-}
-
-int
-refcount_is_zero(refcount_t *rc)
-{
- ASSERT(rc->rc_count >= 0);
- return (rc->rc_count == 0);
-}
-
-int64_t
-refcount_count(refcount_t *rc)
-{
- ASSERT(rc->rc_count >= 0);
- return (rc->rc_count);
-}
-
-int64_t
-refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
-{
- reference_t *ref;
- int64_t count;
-
- if (reference_tracking_enable) {
- ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
- ref->ref_holder = holder;
- ref->ref_number = number;
- }
- mutex_enter(&rc->rc_mtx);
- ASSERT(rc->rc_count >= 0);
- if (reference_tracking_enable)
- list_insert_head(&rc->rc_list, ref);
- rc->rc_count += number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
-
- return (count);
-}
-
-int64_t
-refcount_add(refcount_t *rc, void *holder)
-{
- return (refcount_add_many(rc, 1, holder));
-}
-
-int64_t
-refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
-{
- reference_t *ref;
- int64_t count;
-
- mutex_enter(&rc->rc_mtx);
- ASSERT(rc->rc_count >= number);
-
- if (!reference_tracking_enable) {
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
- }
-
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder && ref->ref_number == number) {
- list_remove(&rc->rc_list, ref);
- if (reference_history > 0) {
- ref->ref_removed =
- kmem_cache_alloc(reference_history_cache,
- KM_SLEEP);
- list_insert_head(&rc->rc_removed, ref);
- rc->rc_removed_count++;
- if (rc->rc_removed_count >= reference_history) {
- ref = list_tail(&rc->rc_removed);
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache,
- ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- rc->rc_removed_count--;
- }
- } else {
- kmem_cache_free(reference_cache, ref);
- }
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
- }
- }
- panic("No such hold %p on refcount %llx", holder,
- (u_longlong_t)(uintptr_t)rc);
- return (-1);
-}
-
-int64_t
-refcount_remove(refcount_t *rc, void *holder)
-{
- return (refcount_remove_many(rc, 1, holder));
-}
-
-#endif
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c b/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c
deleted file mode 100644
index ce5c261..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sha256.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * SHA-256 checksum, as specified in FIPS 180-2, available at:
- * http://csrc.nist.gov/cryptval
- *
- * This is a very compact implementation of SHA-256.
- * It is designed to be simple and portable, not to be fast.
- */
-
-/*
- * The literal definitions according to FIPS180-2 would be:
- *
- * Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
- * Maj(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
- *
- * We use logical equivalents which require one less op.
- */
-#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))
-#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s)))
-#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
-#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
-#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
-#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
-
-static const uint32_t SHA256_K[64] = {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-SHA256Transform(uint32_t *H, const uint8_t *cp)
-{
- uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
-
- for (t = 0; t < 16; t++, cp += 4)
- W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];
-
- for (t = 16; t < 64; t++)
- W[t] = sigma1(W[t - 2]) + W[t - 7] +
- sigma0(W[t - 15]) + W[t - 16];
-
- a = H[0]; b = H[1]; c = H[2]; d = H[3];
- e = H[4]; f = H[5]; g = H[6]; h = H[7];
-
- for (t = 0; t < 64; t++) {
- T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
- T2 = SIGMA0(a) + Maj(a, b, c);
- h = g; g = f; f = e; e = d + T1;
- d = c; c = b; b = a; a = T1 + T2;
- }
-
- H[0] += a; H[1] += b; H[2] += c; H[3] += d;
- H[4] += e; H[5] += f; H[6] += g; H[7] += h;
-}
-
-void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
- 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
- uint8_t pad[128];
- int padsize = size & 63;
- int i;
-
- for (i = 0; i < size - padsize; i += 64)
- SHA256Transform(H, (uint8_t *)buf + i);
-
- for (i = 0; i < padsize; i++)
- pad[i] = ((uint8_t *)buf)[i];
-
- for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
- pad[padsize] = 0;
-
- for (i = 0; i < 8; i++)
- pad[padsize++] = (size << 3) >> (56 - 8 * i);
-
- for (i = 0; i < padsize; i += 64)
- SHA256Transform(H, pad + i);
-
- ZIO_SET_CHECKSUM(zcp,
- (uint64_t)H[0] << 32 | H[1],
- (uint64_t)H[2] << 32 | H[3],
- (uint64_t)H[4] << 32 | H[5],
- (uint64_t)H[6] << 32 | H[7]);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c
deleted file mode 100644
index 6a7c525..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ /dev/null
@@ -1,3301 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file contains all the routines used when modifying on-disk SPA state.
- * This includes opening, importing, destroying, exporting a pool, and syncing a
- * pool.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/dmu_traverse.h>
-#include <sys/dmu_objset.h>
-#include <sys/unique.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_synctask.h>
-#include <sys/fs/zfs.h>
-#include <sys/callb.h>
-#include <sys/sunddi.h>
-
-int zio_taskq_threads = 0;
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
-TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads);
-SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW,
- &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type");
-
-
-/*
- * ==========================================================================
- * SPA state manipulation (open/create/destroy/import/export)
- * ==========================================================================
- */
-
-static int
-spa_error_entry_compare(const void *a, const void *b)
-{
- spa_error_entry_t *sa = (spa_error_entry_t *)a;
- spa_error_entry_t *sb = (spa_error_entry_t *)b;
- int ret;
-
- ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
- sizeof (zbookmark_t));
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
-}
-
-/*
- * Utility function which retrieves copies of the current logs and
- * re-initializes them in the process.
- */
-void
-spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
-{
- ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
-
- bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
- bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
-
- avl_create(&spa->spa_errlist_scrub,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
- avl_create(&spa->spa_errlist_last,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
-}
-
-/*
- * Activate an uninitialized pool.
- */
-static void
-spa_activate(spa_t *spa)
-{
- int t;
- int nthreads = zio_taskq_threads;
- char name[32];
-
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
-
- spa->spa_state = POOL_STATE_ACTIVE;
-
- spa->spa_normal_class = metaslab_class_create();
-
- if (nthreads == 0)
- nthreads = max_ncpus;
- for (t = 0; t < ZIO_TYPES; t++) {
- snprintf(name, sizeof(name), "spa_zio_issue %d", t);
- spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads,
- maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
- snprintf(name, sizeof(name), "spa_zio_intr %d", t);
- spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads,
- maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
- }
-
- rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
-
- mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&spa->spa_dirty_list, sizeof (vdev_t),
- offsetof(vdev_t, vdev_dirty_node));
-
- txg_list_create(&spa->spa_vdev_txg_list,
- offsetof(struct vdev, vdev_txg_node));
-
- avl_create(&spa->spa_errlist_scrub,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
- avl_create(&spa->spa_errlist_last,
- spa_error_entry_compare, sizeof (spa_error_entry_t),
- offsetof(spa_error_entry_t, se_avl));
-}
-
-/*
- * Opposite of spa_activate().
- */
-static void
-spa_deactivate(spa_t *spa)
-{
- int t;
-
- ASSERT(spa->spa_sync_on == B_FALSE);
- ASSERT(spa->spa_dsl_pool == NULL);
- ASSERT(spa->spa_root_vdev == NULL);
-
- ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
-
- txg_list_destroy(&spa->spa_vdev_txg_list);
-
- list_destroy(&spa->spa_dirty_list);
-
- for (t = 0; t < ZIO_TYPES; t++) {
- taskq_destroy(spa->spa_zio_issue_taskq[t]);
- taskq_destroy(spa->spa_zio_intr_taskq[t]);
- spa->spa_zio_issue_taskq[t] = NULL;
- spa->spa_zio_intr_taskq[t] = NULL;
- }
-
- metaslab_class_destroy(spa->spa_normal_class);
- spa->spa_normal_class = NULL;
-
- /*
- * If this was part of an import or the open otherwise failed, we may
- * still have errors left in the queues. Empty them just in case.
- */
- spa_errlog_drain(spa);
-
- avl_destroy(&spa->spa_errlist_scrub);
- avl_destroy(&spa->spa_errlist_last);
-
- rw_destroy(&spa->spa_traverse_lock);
- mutex_destroy(&spa->spa_uberblock_lock);
- mutex_destroy(&spa->spa_errlog_lock);
- mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_config_lock.scl_lock);
- cv_destroy(&spa->spa_config_lock.scl_cv);
- mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
- mutex_destroy(&spa->spa_history_lock);
- mutex_destroy(&spa->spa_props_lock);
-
- spa->spa_state = POOL_STATE_UNINITIALIZED;
-}
-
-/*
- * Verify a pool configuration, and construct the vdev tree appropriately. This
- * will create all the necessary vdevs in the appropriate layout, with each vdev
- * in the CLOSED state. This will prep the pool before open/creation/import.
- * All vdev validation is done by the vdev_alloc() routine.
- */
-static int
-spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
- uint_t id, int atype)
-{
- nvlist_t **child;
- uint_t c, children;
- int error;
-
- if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
- return (error);
-
- if ((*vdp)->vdev_ops->vdev_op_leaf)
- return (0);
-
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- &child, &children) != 0) {
- vdev_free(*vdp);
- *vdp = NULL;
- return (EINVAL);
- }
-
- for (c = 0; c < children; c++) {
- vdev_t *vd;
- if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
- atype)) != 0) {
- vdev_free(*vdp);
- *vdp = NULL;
- return (error);
- }
- }
-
- ASSERT(*vdp != NULL);
-
- return (0);
-}
-
-/*
- * Opposite of spa_load().
- */
-static void
-spa_unload(spa_t *spa)
-{
- int i;
-
- /*
- * Stop async tasks.
- */
- spa_async_suspend(spa);
-
- /*
- * Stop syncing.
- */
- if (spa->spa_sync_on) {
- txg_sync_stop(spa->spa_dsl_pool);
- spa->spa_sync_on = B_FALSE;
- }
-
- /*
- * Wait for any outstanding prefetch I/O to complete.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_config_exit(spa, FTAG);
-
- /*
- * Close the dsl pool.
- */
- if (spa->spa_dsl_pool) {
- dsl_pool_close(spa->spa_dsl_pool);
- spa->spa_dsl_pool = NULL;
- }
-
- /*
- * Close all vdevs.
- */
- if (spa->spa_root_vdev)
- vdev_free(spa->spa_root_vdev);
- ASSERT(spa->spa_root_vdev == NULL);
-
- for (i = 0; i < spa->spa_nspares; i++)
- vdev_free(spa->spa_spares[i]);
- if (spa->spa_spares) {
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
- spa->spa_spares = NULL;
- }
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
- }
-
- spa->spa_async_suspended = 0;
-}
-
-/*
- * Load (or re-load) the current list of vdevs describing the active spares for
- * this pool. When this is called, we have some form of basic information in
- * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
- * re-generate a more complete list including status information.
- */
-static void
-spa_load_spares(spa_t *spa)
-{
- nvlist_t **spares;
- uint_t nspares;
- int i;
- vdev_t *vd, *tvd;
-
- /*
- * First, close and free any existing spare vdevs.
- */
- for (i = 0; i < spa->spa_nspares; i++) {
- vd = spa->spa_spares[i];
-
- /* Undo the call to spa_activate() below */
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
- tvd->vdev_isspare)
- spa_spare_remove(tvd);
- vdev_close(vd);
- vdev_free(vd);
- }
-
- if (spa->spa_spares)
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
-
- if (spa->spa_sparelist == NULL)
- nspares = 0;
- else
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
-
- spa->spa_nspares = (int)nspares;
- spa->spa_spares = NULL;
-
- if (nspares == 0)
- return;
-
- /*
- * Construct the array of vdevs, opening them to get status in the
- * process. For each spare, there is potentially two different vdev_t
- * structures associated with it: one in the list of spares (used only
- * for basic validation purposes) and one in the active vdev
- * configuration (if it's spared in). During this phase we open and
- * validate each vdev on the spare list. If the vdev also exists in the
- * active configuration, then we also mark this vdev as an active spare.
- */
- spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++) {
- VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
- VDEV_ALLOC_SPARE) == 0);
- ASSERT(vd != NULL);
-
- spa->spa_spares[i] = vd;
-
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
- if (!tvd->vdev_isspare)
- spa_spare_add(tvd);
-
- /*
- * We only mark the spare active if we were successfully
- * able to load the vdev. Otherwise, importing a pool
- * with a bad active spare would result in strange
- * behavior, because multiple pool would think the spare
- * is actively in use.
- *
- * There is a vulnerability here to an equally bizarre
- * circumstance, where a dead active spare is later
- * brought back to life (onlined or otherwise). Given
- * the rarity of this scenario, and the extra complexity
- * it adds, we ignore the possibility.
- */
- if (!vdev_is_dead(tvd))
- spa_spare_activate(tvd);
- }
-
- if (vdev_open(vd) != 0)
- continue;
-
- vd->vdev_top = vd;
- (void) vdev_validate_spare(vd);
- }
-
- /*
- * Recompute the stashed list of spares, with status information
- * this time.
- */
- VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- DATA_TYPE_NVLIST_ARRAY) == 0);
-
- spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++)
- spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
- B_TRUE, B_TRUE);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- spares, spa->spa_nspares) == 0);
- for (i = 0; i < spa->spa_nspares; i++)
- nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_nspares * sizeof (void *));
-}
-
-static int
-load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
-{
- dmu_buf_t *db;
- char *packed = NULL;
- size_t nvsize = 0;
- int error;
- *value = NULL;
-
- VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
- nvsize = *(uint64_t *)db->db_data;
- dmu_buf_rele(db, FTAG);
-
- packed = kmem_alloc(nvsize, KM_SLEEP);
- error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
- if (error == 0)
- error = nvlist_unpack(packed, nvsize, value, 0);
- kmem_free(packed, nvsize);
-
- return (error);
-}
-
-/*
- * Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information.
- */
-static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
-{
- int error = 0;
- nvlist_t *nvroot = NULL;
- vdev_t *rvd;
- uberblock_t *ub = &spa->spa_uberblock;
- uint64_t config_cache_txg = spa->spa_config_txg;
- uint64_t pool_guid;
- uint64_t version;
- zio_t *zio;
-
- spa->spa_load_state = state;
-
- if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
- error = EINVAL;
- goto out;
- }
-
- /*
- * Versioning wasn't explicitly added to the label until later, so if
- * it's not present treat it as the initial version.
- */
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
- version = ZFS_VERSION_INITIAL;
-
- (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- &spa->spa_config_txg);
-
- if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
- spa_guid_exists(pool_guid, 0)) {
- error = EEXIST;
- goto out;
- }
-
- spa->spa_load_guid = pool_guid;
-
- /*
- * Parse the configuration into a vdev tree. We explicitly set the
- * value that will be returned by spa_version() since parsing the
- * configuration requires knowing the version number.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa->spa_ubsync.ub_version = version;
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
- spa_config_exit(spa, FTAG);
-
- if (error != 0)
- goto out;
-
- ASSERT(spa->spa_root_vdev == rvd);
- ASSERT(spa_guid(spa) == pool_guid);
-
- /*
- * Try to open all vdevs, loading each label in the process.
- */
- error = vdev_open(rvd);
- if (error != 0)
- goto out;
-
- /*
- * Validate the labels for all leaf vdevs. We need to grab the config
- * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
- * flag.
- */
- spa_config_enter(spa, RW_READER, FTAG);
- error = vdev_validate(rvd);
- spa_config_exit(spa, FTAG);
-
- if (error != 0)
- goto out;
-
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
- }
-
- /*
- * Find the best uberblock.
- */
- bzero(ub, sizeof (uberblock_t));
-
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
- vdev_uberblock_load(zio, rvd, ub);
- error = zio_wait(zio);
-
- /*
- * If we weren't able to find a single valid uberblock, return failure.
- */
- if (ub->ub_txg == 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = ENXIO;
- goto out;
- }
-
- /*
- * If the pool is newer than the code, we can't open it.
- */
- if (ub->ub_version > ZFS_VERSION) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_VERSION_NEWER);
- error = ENOTSUP;
- goto out;
- }
-
- /*
- * If the vdev guid sum doesn't match the uberblock, we have an
- * incomplete configuration.
- */
- if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_GUID_SUM);
- error = ENXIO;
- goto out;
- }
-
- /*
- * Initialize internal SPA structures.
- */
- spa->spa_state = POOL_STATE_ACTIVE;
- spa->spa_ubsync = spa->spa_uberblock;
- spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
- error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
- if (error) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- goto out;
- }
- spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
-
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (!mosconfig) {
- nvlist_t *newconfig;
- uint64_t hostid;
-
- if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * hostid is set after the root file system is mounted, so
- * ignore the check until it's done.
- */
- if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
- &hostid) == 0 && root_mounted()) {
- char *hostname;
- unsigned long myhostid = 0;
-
- VERIFY(nvlist_lookup_string(newconfig,
- ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
-
- (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
- if ((unsigned long)hostid != myhostid) {
- cmn_err(CE_WARN, "pool '%s' could not be "
- "loaded as it was last accessed by "
- "another system (host: %s hostid: 0x%lx). "
- "See: http://www.sun.com/msg/ZFS-8000-EY",
- spa->spa_name, hostname,
- (unsigned long)hostid);
- error = EBADF;
- goto out;
- }
- }
-
- spa_config_set(spa, newconfig);
- spa_unload(spa);
- spa_deactivate(spa);
- spa_activate(spa);
-
- return (spa_load(spa, newconfig, state, B_TRUE));
- }
-
- if (zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load the bit that tells us to use the new accounting function
- * (raid-z deflation). If we have an older pool, this will not
- * be present.
- */
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load the persistent error log. If we have an older pool, this will
- * not be present.
- */
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
- sizeof (uint64_t), 1, &spa->spa_errlog_last);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
- sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load the history object. If we have an older pool, this
- * will not be present.
- */
- error = zap_lookup(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
- sizeof (uint64_t), 1, &spa->spa_history);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- /*
- * Load any hot spares for this pool.
- */
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
- if (error != 0 && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
- if (error == 0) {
- ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
- if (load_nvlist(spa, spa->spa_spares_object,
- &spa->spa_sparelist) != 0) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_load_spares(spa);
- spa_config_exit(spa, FTAG);
- }
-
- error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
-
- if (error && error != ENOENT) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- error = EIO;
- goto out;
- }
-
- if (error == 0) {
- (void) zap_lookup(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZFS_PROP_BOOTFS),
- sizeof (uint64_t), 1, &spa->spa_bootfs);
- }
-
- /*
- * Load the vdev state for all toplevel vdevs.
- */
- vdev_load(rvd);
-
- /*
- * Propagate the leaf DTLs we just loaded all the way up the tree.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
- vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
- spa_config_exit(spa, FTAG);
-
- /*
- * Check the state of the root vdev. If it can't be opened, it
- * indicates one or more toplevel vdevs are faulted.
- */
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
- error = ENXIO;
- goto out;
- }
-
- if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
- dmu_tx_t *tx;
- int need_update = B_FALSE;
- int c;
-
- /*
- * Claim log blocks that haven't been committed yet.
- * This must all happen in a single txg.
- */
- tx = dmu_tx_create_assigned(spa_get_dsl(spa),
- spa_first_txg(spa));
- (void) dmu_objset_find(spa->spa_name,
- zil_claim, tx, DS_FIND_CHILDREN);
- dmu_tx_commit(tx);
-
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
-
- /*
- * Wait for all claims to sync.
- */
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * If the config cache is stale, or we have uninitialized
- * metaslabs (see spa_vdev_add()), then update the config.
- */
- if (config_cache_txg != spa->spa_config_txg ||
- state == SPA_LOAD_IMPORT)
- need_update = B_TRUE;
-
- for (c = 0; c < rvd->vdev_children; c++)
- if (rvd->vdev_child[c]->vdev_ms_array == 0)
- need_update = B_TRUE;
-
- /*
- * Update the config cache asychronously in case we're the
- * root pool, in which case the config cache isn't writable yet.
- */
- if (need_update)
- spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
- }
-
- error = 0;
-out:
- if (error && error != EBADF)
- zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
- spa->spa_load_state = SPA_LOAD_NONE;
- spa->spa_ena = 0;
-
- return (error);
-}
-
-/*
- * Pool Open/Import
- *
- * The import case is identical to an open except that the configuration is sent
- * down from userland, instead of grabbed from the configuration cache. For the
- * case of an open, the pool configuration will exist in the
- * POOL_STATE_UNITIALIZED state.
- *
- * The stats information (gen/count/ustats) is used to gather vdev statistics at
- * the same time open the pool, without having to keep around the spa_t in some
- * ambiguous state.
- */
-static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
-{
- spa_t *spa;
- int error;
- int loaded = B_FALSE;
- int locked = B_FALSE;
-
- *spapp = NULL;
-
- /*
- * As disgusting as this is, we need to support recursive calls to this
- * function because dsl_dir_open() is called during spa_load(), and ends
- * up calling spa_open() again. The real fix is to figure out how to
- * avoid dsl_dir_open() calling this in the first place.
- */
- if (mutex_owner(&spa_namespace_lock) != curthread) {
- mutex_enter(&spa_namespace_lock);
- locked = B_TRUE;
- }
-
- if ((spa = spa_lookup(pool)) == NULL) {
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (ENOENT);
- }
- if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
-
- spa_activate(spa);
-
- error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
-
- if (error == EBADF) {
- /*
- * If vdev_validate() returns failure (indicated by
- * EBADF), it indicates that one of the vdevs indicates
- * that the pool has been exported or destroyed. If
- * this is the case, the config cache is out of sync and
- * we should remove the pool from the namespace.
- */
- zfs_post_ok(spa, NULL);
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- spa_config_sync();
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (ENOENT);
- }
-
- if (error) {
- /*
- * We can't open the pool, but we still have useful
- * information: the state of each vdev after the
- * attempted vdev_open(). Return this to the user.
- */
- if (config != NULL && spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, RW_READER, FTAG);
- *config = spa_config_generate(spa, NULL, -1ULL,
- B_TRUE);
- spa_config_exit(spa, FTAG);
- }
- spa_unload(spa);
- spa_deactivate(spa);
- spa->spa_last_open_failed = B_TRUE;
- if (locked)
- mutex_exit(&spa_namespace_lock);
- *spapp = NULL;
- return (error);
- } else {
- zfs_post_ok(spa, NULL);
- spa->spa_last_open_failed = B_FALSE;
- }
-
- loaded = B_TRUE;
- }
-
- spa_open_ref(spa, tag);
- if (locked)
- mutex_exit(&spa_namespace_lock);
-
- *spapp = spa;
-
- if (config != NULL) {
- spa_config_enter(spa, RW_READER, FTAG);
- *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- spa_config_exit(spa, FTAG);
- }
-
- /*
- * If we just loaded the pool, resilver anything that's out of date.
- */
- if (loaded && (spa_mode & FWRITE))
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-int
-spa_open(const char *name, spa_t **spapp, void *tag)
-{
- return (spa_open_common(name, spapp, tag, NULL));
-}
-
-/*
- * Lookup the given spa_t, incrementing the inject count in the process,
- * preventing it from being exported or destroyed.
- */
-spa_t *
-spa_inject_addref(char *name)
-{
- spa_t *spa;
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(name)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (NULL);
- }
- spa->spa_inject_ref++;
- mutex_exit(&spa_namespace_lock);
-
- return (spa);
-}
-
-void
-spa_inject_delref(spa_t *spa)
-{
- mutex_enter(&spa_namespace_lock);
- spa->spa_inject_ref--;
- mutex_exit(&spa_namespace_lock);
-}
-
-static void
-spa_add_spares(spa_t *spa, nvlist_t *config)
-{
- nvlist_t **spares;
- uint_t i, nspares;
- nvlist_t *nvroot;
- uint64_t guid;
- vdev_stat_t *vs;
- uint_t vsc;
- uint64_t pool;
-
- if (spa->spa_nspares == 0)
- return;
-
- VERIFY(nvlist_lookup_nvlist(config,
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
- if (nspares != 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- VERIFY(nvlist_lookup_nvlist_array(nvroot,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
-
- /*
- * Go through and find any spares which have since been
- * repurposed as an active spare. If this is the case, update
- * their status appropriately.
- */
- for (i = 0; i < nspares; i++) {
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &guid) == 0);
- if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
- VERIFY(nvlist_lookup_uint64_array(
- spares[i], ZPOOL_CONFIG_STATS,
- (uint64_t **)&vs, &vsc) == 0);
- vs->vs_state = VDEV_STATE_CANT_OPEN;
- vs->vs_aux = VDEV_AUX_SPARED;
- }
- }
- }
-}
-
-int
-spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
-{
- int error;
- spa_t *spa;
-
- *config = NULL;
- error = spa_open_common(name, &spa, FTAG, config);
-
- if (spa && *config != NULL) {
- VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
- spa_get_errlog_size(spa)) == 0);
-
- spa_add_spares(spa, *config);
- }
-
- /*
- * We want to get the alternate root even for faulted pools, so we cheat
- * and call spa_lookup() directly.
- */
- if (altroot) {
- if (spa == NULL) {
- mutex_enter(&spa_namespace_lock);
- spa = spa_lookup(name);
- if (spa)
- spa_altroot(spa, altroot, buflen);
- else
- altroot[0] = '\0';
- spa = NULL;
- mutex_exit(&spa_namespace_lock);
- } else {
- spa_altroot(spa, altroot, buflen);
- }
- }
-
- if (spa != NULL)
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-/*
- * Validate that the 'spares' array is well formed. We must have an array of
- * nvlists, each which describes a valid leaf vdev. If this is an import (mode
- * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
- * as they are well-formed.
- */
-static int
-spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
-{
- nvlist_t **spares;
- uint_t i, nspares;
- vdev_t *vd;
- int error;
-
- /*
- * It's acceptable to have no spares specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) != 0)
- return (0);
-
- if (nspares == 0)
- return (EINVAL);
-
- /*
- * Make sure the pool is formatted with a version that supports hot
- * spares.
- */
- if (spa_version(spa) < ZFS_VERSION_SPARES)
- return (ENOTSUP);
-
- /*
- * Set the pending spare list so we correctly handle device in-use
- * checking.
- */
- spa->spa_pending_spares = spares;
- spa->spa_pending_nspares = nspares;
-
- for (i = 0; i < nspares; i++) {
- if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
- mode)) != 0)
- goto out;
-
- if (!vd->vdev_ops->vdev_op_leaf) {
- vdev_free(vd);
- error = EINVAL;
- goto out;
- }
-
- vd->vdev_top = vd;
-
- if ((error = vdev_open(vd)) == 0 &&
- (error = vdev_label_init(vd, crtxg,
- VDEV_LABEL_SPARE)) == 0) {
- VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- }
-
- vdev_free(vd);
-
- if (error && mode != VDEV_ALLOC_SPARE)
- goto out;
- else
- error = 0;
- }
-
-out:
- spa->spa_pending_spares = NULL;
- spa->spa_pending_nspares = 0;
- return (error);
-}
-
-/*
- * Pool Creation
- */
-int
-spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
-{
- spa_t *spa;
- vdev_t *rvd;
- dsl_pool_t *dp;
- dmu_tx_t *tx;
- int c, error = 0;
- uint64_t txg = TXG_INITIAL;
- nvlist_t **spares;
- uint_t nspares;
-
- /*
- * If this pool already exists, return failure.
- */
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (EEXIST);
- }
-
- /*
- * Allocate a new spa_t structure.
- */
- spa = spa_add(pool, altroot);
- spa_activate(spa);
-
- spa->spa_uberblock.ub_txg = txg - 1;
- spa->spa_uberblock.ub_version = ZFS_VERSION;
- spa->spa_ubsync = spa->spa_uberblock;
-
- /*
- * Create the root vdev.
- */
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
-
- ASSERT(error != 0 || rvd != NULL);
- ASSERT(error != 0 || spa->spa_root_vdev == rvd);
-
- if (error == 0 && rvd->vdev_children == 0)
- error = EINVAL;
-
- if (error == 0 &&
- (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
- (error = spa_validate_spares(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) == 0) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_init(rvd->vdev_child[c], txg);
- vdev_config_dirty(rvd);
- }
-
- spa_config_exit(spa, FTAG);
-
- if (error != 0) {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- /*
- * Get the list of spares, if specified.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
- KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_load_spares(spa);
- spa_config_exit(spa, FTAG);
- spa->spa_sync_spares = B_TRUE;
- }
-
- spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
- spa->spa_meta_objset = dp->dp_meta_objset;
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- /*
- * Create the pool config object.
- */
- spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, 1 << 14,
- DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
-
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add pool config");
- }
-
- /* Newly created pools are always deflated. */
- spa->spa_deflate = TRUE;
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add deflate");
- }
-
- /*
- * Create the deferred-free bplist object. Turn off compression
- * because sync-to-convergence takes longer if the blocksize
- * keeps changing.
- */
- spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
- 1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
- ZIO_COMPRESS_OFF, tx);
-
- if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add bplist");
- }
-
- /*
- * Create the pool's history object.
- */
- spa_history_create_obj(spa, tx);
-
- dmu_tx_commit(tx);
-
- spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
- spa->spa_sync_on = B_TRUE;
- txg_sync_start(spa->spa_dsl_pool);
-
- /*
- * We explicitly wait for the first transaction to complete so that our
- * bean counters are appropriately updated.
- */
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- spa_config_sync();
-
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Import the given pool into the system. We set up the necessary spa_t and
- * then call spa_load() to do the dirty work.
- */
-int
-spa_import(const char *pool, nvlist_t *config, const char *altroot)
-{
- spa_t *spa;
- int error;
- nvlist_t *nvroot;
- nvlist_t **spares;
- uint_t nspares;
-
- if (!(spa_mode & FWRITE))
- return (EROFS);
-
- /*
- * If a pool with this name exists, return failure.
- */
- mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
- mutex_exit(&spa_namespace_lock);
- return (EEXIST);
- }
-
- /*
- * Create and initialize the spa structure.
- */
- spa = spa_add(pool, altroot);
- spa_activate(spa);
-
- /*
- * Pass off the heavy lifting to spa_load().
- * Pass TRUE for mosconfig because the user-supplied config
- * is actually the one to trust when doing an import.
- */
- error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- /*
- * Toss any existing sparelist, as it doesn't have any validity anymore,
- * and conflicts with spa_has_spare().
- */
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
- spa_load_spares(spa);
- }
-
- VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- if (error == 0)
- error = spa_validate_spares(spa, nvroot, -1ULL,
- VDEV_ALLOC_SPARE);
- spa_config_exit(spa, FTAG);
-
- if (error != 0) {
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
-
- /*
- * Override any spares as specified by the user, as these may have
- * correct device names/devids, etc.
- */
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- if (spa->spa_sparelist)
- VERIFY(nvlist_remove(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
- else
- VERIFY(nvlist_alloc(&spa->spa_sparelist,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa_load_spares(spa);
- spa_config_exit(spa, FTAG);
- spa->spa_sync_spares = B_TRUE;
- }
-
- /*
- * Update the config cache to include the newly-imported pool.
- */
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
-
- mutex_exit(&spa_namespace_lock);
-
- /*
- * Resilver anything that's out of date.
- */
- if (spa_mode & FWRITE)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define TRYIMPORT_NAME "$import"
-
-nvlist_t *
-spa_tryimport(nvlist_t *tryconfig)
-{
- nvlist_t *config = NULL;
- char *poolname;
- spa_t *spa;
- uint64_t state;
-
- if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
- return (NULL);
-
- if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
- return (NULL);
-
- /*
- * Create and initialize the spa structure.
- */
- mutex_enter(&spa_namespace_lock);
- spa = spa_add(TRYIMPORT_NAME, NULL);
- spa_activate(spa);
-
- /*
- * Pass off the heavy lifting to spa_load().
- * Pass TRUE for mosconfig because the user-supplied config
- * is actually the one to trust when doing an import.
- */
- (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
-
- /*
- * If 'tryconfig' was at least parsable, return the current config.
- */
- if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, RW_READER, FTAG);
- config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- spa_config_exit(spa, FTAG);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
- poolname) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- state) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
- spa->spa_uberblock.ub_timestamp) == 0);
-
- /*
- * Add the list of hot spares.
- */
- spa_add_spares(spa, config);
- }
-
- spa_unload(spa);
- spa_deactivate(spa);
- spa_remove(spa);
- mutex_exit(&spa_namespace_lock);
-
- return (config);
-}
-
-/*
- * Pool export/destroy
- *
- * The act of destroying or exporting a pool is very simple. We make sure there
- * is no more pending I/O and any references to the pool are gone. Then, we
- * update the pool state and sync all the labels to disk, removing the
- * configuration from the cache afterwards.
- */
-static int
-spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
-{
- spa_t *spa;
-
- if (oldconfig)
- *oldconfig = NULL;
-
- if (!(spa_mode & FWRITE))
- return (EROFS);
-
- mutex_enter(&spa_namespace_lock);
- if ((spa = spa_lookup(pool)) == NULL) {
- mutex_exit(&spa_namespace_lock);
- return (ENOENT);
- }
-
- /*
- * Put a hold on the pool, drop the namespace lock, stop async tasks,
- * reacquire the namespace lock, and see if we can export.
- */
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- spa_async_suspend(spa);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
-
- /*
- * The pool will be in core if it's openable,
- * in which case we can modify its state.
- */
- if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
- /*
- * Objsets may be open only because they're dirty, so we
- * have to force it to sync before checking spa_refcnt.
- */
- spa_scrub_suspend(spa);
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * A pool cannot be exported or destroyed if there are active
- * references. If we are resetting a pool, allow references by
- * fault injection handlers.
- */
- if (!spa_refcount_zero(spa) ||
- (spa->spa_inject_ref != 0 &&
- new_state != POOL_STATE_UNINITIALIZED)) {
- spa_scrub_resume(spa);
- spa_async_resume(spa);
- mutex_exit(&spa_namespace_lock);
- return (EBUSY);
- }
-
- spa_scrub_resume(spa);
- VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
-
- /*
- * We want this to be reflected on every label,
- * so mark them all dirty. spa_unload() will do the
- * final sync that pushes these changes out.
- */
- if (new_state != POOL_STATE_UNINITIALIZED) {
- spa_config_enter(spa, RW_WRITER, FTAG);
- spa->spa_state = new_state;
- spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
- vdev_config_dirty(spa->spa_root_vdev);
- spa_config_exit(spa, FTAG);
- }
- }
-
- if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
- spa_unload(spa);
- spa_deactivate(spa);
- }
-
- if (oldconfig && spa->spa_config)
- VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
-
- if (new_state != POOL_STATE_UNINITIALIZED) {
- spa_remove(spa);
- spa_config_sync();
- }
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Destroy a storage pool.
- */
-int
-spa_destroy(char *pool)
-{
- return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
-}
-
-/*
- * Export a storage pool.
- */
-int
-spa_export(char *pool, nvlist_t **oldconfig)
-{
- return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
-}
-
-/*
- * Similar to spa_export(), this unloads the spa_t without actually removing it
- * from the namespace in any way.
- */
-int
-spa_reset(char *pool)
-{
- return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
-}
-
-
-/*
- * ==========================================================================
- * Device manipulation
- * ==========================================================================
- */
-
-/*
- * Add capacity to a storage pool.
- */
-int
-spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
-{
- uint64_t txg;
- int c, error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd, *tvd;
- nvlist_t **spares;
- uint_t i, nspares;
-
- txg = spa_vdev_enter(spa);
-
- if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
- VDEV_ALLOC_ADD)) != 0)
- return (spa_vdev_exit(spa, NULL, txg, error));
-
- spa->spa_pending_vdev = vd;
-
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) != 0)
- nspares = 0;
-
- if (vd->vdev_children == 0 && nspares == 0) {
- spa->spa_pending_vdev = NULL;
- return (spa_vdev_exit(spa, vd, txg, EINVAL));
- }
-
- if (vd->vdev_children != 0) {
- if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
- spa->spa_pending_vdev = NULL;
- return (spa_vdev_exit(spa, vd, txg, error));
- }
- }
-
- /*
- * We must validate the spares after checking the children. Otherwise,
- * vdev_inuse() will blindly overwrite the spare.
- */
- if ((error = spa_validate_spares(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) != 0) {
- spa->spa_pending_vdev = NULL;
- return (spa_vdev_exit(spa, vd, txg, error));
- }
-
- spa->spa_pending_vdev = NULL;
-
- /*
- * Transfer each new top-level vdev from vd to rvd.
- */
- for (c = 0; c < vd->vdev_children; c++) {
- tvd = vd->vdev_child[c];
- vdev_remove_child(vd, tvd);
- tvd->vdev_id = rvd->vdev_children;
- vdev_add_child(rvd, tvd);
- vdev_config_dirty(tvd);
- }
-
- if (nspares != 0) {
- if (spa->spa_sparelist != NULL) {
- nvlist_t **oldspares;
- uint_t oldnspares;
- nvlist_t **newspares;
-
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
-
- newspares = kmem_alloc(sizeof (void *) *
- (nspares + oldnspares), KM_SLEEP);
- for (i = 0; i < oldnspares; i++)
- VERIFY(nvlist_dup(oldspares[i],
- &newspares[i], KM_SLEEP) == 0);
- for (i = 0; i < nspares; i++)
- VERIFY(nvlist_dup(spares[i],
- &newspares[i + oldnspares],
- KM_SLEEP) == 0);
-
- VERIFY(nvlist_remove(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
-
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, newspares,
- nspares + oldnspares) == 0);
- for (i = 0; i < oldnspares + nspares; i++)
- nvlist_free(newspares[i]);
- kmem_free(newspares, (oldnspares + nspares) *
- sizeof (void *));
- } else {
- VERIFY(nvlist_alloc(&spa->spa_sparelist,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- }
-
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
- }
-
- /*
- * We have to be careful when adding new vdevs to an existing pool.
- * If other threads start allocating from these vdevs before we
- * sync the config cache, and we lose power, then upon reboot we may
- * fail to open the pool because there are DVAs that the config cache
- * can't translate. Therefore, we first add the vdevs without
- * initializing metaslabs; sync the config cache (via spa_vdev_exit());
- * and then let spa_config_update() initialize the new metaslabs.
- *
- * spa_load() checks for added-but-not-initialized vdevs, so that
- * if we lose power at any point in this sequence, the remaining
- * steps will be completed the next time we load the pool.
- */
- (void) spa_vdev_exit(spa, vd, txg, 0);
-
- mutex_enter(&spa_namespace_lock);
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-/*
- * Attach a device to a mirror. The arguments are the path to any device
- * in the mirror, and the nvroot for the new device. If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
- *
- * If 'replacing' is specified, the new device is intended to replace the
- * existing device; in this case the two devices are made into their own
- * mirror using the 'replacing' vdev, which is functionally idendical to
- * the mirror vdev (it actually reuses all the same ops) but has a few
- * extra rules: you can't attach to it after it's been created, and upon
- * completion of resilvering, the first disk (the one being replaced)
- * is automatically detached.
- */
-int
-spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
-{
- uint64_t txg, open_txg;
- int error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
- vdev_ops_t *pvops;
-
- txg = spa_vdev_enter(spa);
-
- oldvd = vdev_lookup_by_guid(rvd, guid);
-
- if (oldvd == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!oldvd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- pvd = oldvd->vdev_parent;
-
- if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
- VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
- return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
-
- newvd = newrootvd->vdev_child[0];
-
- if (!newvd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
-
- if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
- return (spa_vdev_exit(spa, newrootvd, txg, error));
-
- if (!replacing) {
- /*
- * For attach, the only allowable parent is a mirror or the root
- * vdev.
- */
- if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- pvops = &vdev_mirror_ops;
- } else {
- /*
- * Active hot spares can only be replaced by inactive hot
- * spares.
- */
- if (pvd->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_child[1] == oldvd &&
- !spa_has_spare(spa, newvd->vdev_guid))
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-
- /*
- * If the source is a hot spare, and the parent isn't already a
- * spare, then we want to create a new hot spare. Otherwise, we
- * want to create a replacing vdev. The user is not allowed to
- * attach to a spared vdev child unless the 'isspare' state is
- * the same (spare replaces spare, non-spare replaces
- * non-spare).
- */
- if (pvd->vdev_ops == &vdev_replacing_ops)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops == &vdev_spare_ops &&
- newvd->vdev_isspare != oldvd->vdev_isspare)
- return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
- else if (pvd->vdev_ops != &vdev_spare_ops &&
- newvd->vdev_isspare)
- pvops = &vdev_spare_ops;
- else
- pvops = &vdev_replacing_ops;
- }
-
- /*
- * Compare the new device size with the replaceable/attachable
- * device size.
- */
- if (newvd->vdev_psize < vdev_get_rsize(oldvd))
- return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
-
- /*
- * The new device cannot have a higher alignment requirement
- * than the top-level vdev.
- */
- if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
- return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
-
- /*
- * If this is an in-place replacement, update oldvd's path and devid
- * to make it distinguishable from newvd, and unopenable from now on.
- */
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
- spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
- KM_SLEEP);
- (void) sprintf(oldvd->vdev_path, "%s/%s",
- newvd->vdev_path, "old");
- if (oldvd->vdev_devid != NULL) {
- spa_strfree(oldvd->vdev_devid);
- oldvd->vdev_devid = NULL;
- }
- }
-
- /*
- * If the parent is not a mirror, or if we're replacing, insert the new
- * mirror/replacing/spare vdev above oldvd.
- */
- if (pvd->vdev_ops != pvops)
- pvd = vdev_add_parent(oldvd, pvops);
-
- ASSERT(pvd->vdev_top->vdev_parent == rvd);
- ASSERT(pvd->vdev_ops == pvops);
- ASSERT(oldvd->vdev_parent == pvd);
-
- /*
- * Extract the new device from its root and add it to pvd.
- */
- vdev_remove_child(newrootvd, newvd);
- newvd->vdev_id = pvd->vdev_children;
- vdev_add_child(pvd, newvd);
-
- /*
- * If newvd is smaller than oldvd, but larger than its rsize,
- * the addition of newvd may have decreased our parent's asize.
- */
- pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
-
- tvd = newvd->vdev_top;
- ASSERT(pvd->vdev_top == tvd);
- ASSERT(tvd->vdev_parent == rvd);
-
- vdev_config_dirty(tvd);
-
- /*
- * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
- * upward when spa_vdev_exit() calls vdev_dtl_reassess().
- */
- open_txg = txg + TXG_CONCURRENT_STATES - 1;
-
- mutex_enter(&newvd->vdev_dtl_lock);
- space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
- open_txg - TXG_INITIAL + 1);
- mutex_exit(&newvd->vdev_dtl_lock);
-
- if (newvd->vdev_isspare)
- spa_spare_activate(newvd);
-
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
-
- (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
-
- /*
- * Kick off a resilver to update newvd.
- */
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-/*
- * Detach a device from a mirror or replacing vdev.
- * If 'replace_done' is specified, only detach if the parent
- * is a replacing vdev.
- */
-int
-spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
-{
- uint64_t txg;
- int c, t, error;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd, *pvd, *cvd, *tvd;
- boolean_t unspare = B_FALSE;
- uint64_t unspare_guid;
-
- txg = spa_vdev_enter(spa);
-
- vd = vdev_lookup_by_guid(rvd, guid);
-
- if (vd == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- pvd = vd->vdev_parent;
-
- /*
- * If replace_done is specified, only remove this device if it's
- * the first child of a replacing vdev. For the 'spare' vdev, either
- * disk can be removed.
- */
- if (replace_done) {
- if (pvd->vdev_ops == &vdev_replacing_ops) {
- if (vd->vdev_id != 0)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- } else if (pvd->vdev_ops != &vdev_spare_ops) {
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- }
- }
-
- ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
- spa_version(spa) >= ZFS_VERSION_SPARES);
-
- /*
- * Only mirror, replacing, and spare vdevs support detach.
- */
- if (pvd->vdev_ops != &vdev_replacing_ops &&
- pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_spare_ops)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- /*
- * If there's only one replica, you can't detach it.
- */
- if (pvd->vdev_children <= 1)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * If all siblings have non-empty DTLs, this device may have the only
- * valid copy of the data, which means we cannot safely detach it.
- *
- * XXX -- as in the vdev_offline() case, we really want a more
- * precise DTL check.
- */
- for (c = 0; c < pvd->vdev_children; c++) {
- uint64_t dirty;
-
- cvd = pvd->vdev_child[c];
- if (cvd == vd)
- continue;
- if (vdev_is_dead(cvd))
- continue;
- mutex_enter(&cvd->vdev_dtl_lock);
- dirty = cvd->vdev_dtl_map.sm_space |
- cvd->vdev_dtl_scrub.sm_space;
- mutex_exit(&cvd->vdev_dtl_lock);
- if (!dirty)
- break;
- }
-
- /*
- * If we are a replacing or spare vdev, then we can always detach the
- * latter child, as that is how one cancels the operation.
- */
- if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
- c == pvd->vdev_children)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * If we are detaching the original disk from a spare, then it implies
- * that the spare should become a real disk, and be removed from the
- * active spare list for the pool.
- */
- if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0)
- unspare = B_TRUE;
-
- /*
- * Erase the disk labels so the disk can be used for other things.
- * This must be done after all other error cases are handled,
- * but before we disembowel vd (so we can still do I/O to it).
- * But if we can't do it, don't treat the error as fatal --
- * it may be that the unwritability of the disk is the reason
- * it's being detached!
- */
- error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
-
- /*
- * Remove vd from its parent and compact the parent's children.
- */
- vdev_remove_child(pvd, vd);
- vdev_compact_children(pvd);
-
- /*
- * Remember one of the remaining children so we can get tvd below.
- */
- cvd = pvd->vdev_child[0];
-
- /*
- * If we need to remove the remaining child from the list of hot spares,
- * do it now, marking the vdev as no longer a spare in the process. We
- * must do this before vdev_remove_parent(), because that can change the
- * GUID if it creates a new toplevel GUID.
- */
- if (unspare) {
- ASSERT(cvd->vdev_isspare);
- spa_spare_remove(cvd);
- unspare_guid = cvd->vdev_guid;
- }
-
- /*
- * If the parent mirror/replacing vdev only has one child,
- * the parent is no longer needed. Remove it from the tree.
- */
- if (pvd->vdev_children == 1)
- vdev_remove_parent(cvd);
-
- /*
- * We don't set tvd until now because the parent we just removed
- * may have been the previous top-level vdev.
- */
- tvd = cvd->vdev_top;
- ASSERT(tvd->vdev_parent == rvd);
-
- /*
- * Reevaluate the parent vdev state.
- */
- vdev_propagate_state(cvd->vdev_parent);
-
- /*
- * If the device we just detached was smaller than the others, it may be
- * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init()
- * can't fail because the existing metaslabs are already in core, so
- * there's nothing to read from disk.
- */
- VERIFY(vdev_metaslab_init(tvd, txg) == 0);
-
- vdev_config_dirty(tvd);
-
- /*
- * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
- * vd->vdev_detached is set and free vd's DTL object in syncing context.
- * But first make sure we're not on any *other* txg's DTL list, to
- * prevent vd from being accessed after it's freed.
- */
- for (t = 0; t < TXG_SIZE; t++)
- (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
- vd->vdev_detached = B_TRUE;
- vdev_dirty(tvd, VDD_DTL, vd, txg);
-
- error = spa_vdev_exit(spa, vd, txg, 0);
-
- /*
- * If this was the removal of the original device in a hot spare vdev,
- * then we want to go through and remove the device from the hot spare
- * list of every other pool.
- */
- if (unspare) {
- spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa->spa_state != POOL_STATE_ACTIVE)
- continue;
-
- (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
- }
- mutex_exit(&spa_namespace_lock);
- }
-
- return (error);
-}
-
-/*
- * Remove a device from the pool. Currently, this supports removing only hot
- * spares.
- */
-int
-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
-{
- vdev_t *vd;
- nvlist_t **spares, *nv, **newspares;
- uint_t i, j, nspares;
- int ret = 0;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- vd = spa_lookup_by_guid(spa, guid);
-
- nv = NULL;
- if (spa->spa_spares != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- uint64_t theguid;
-
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid) {
- nv = spares[i];
- break;
- }
- }
- }
-
- /*
- * We only support removing a hot spare, and only if it's not currently
- * in use in this pool.
- */
- if (nv == NULL && vd == NULL) {
- ret = ENOENT;
- goto out;
- }
-
- if (nv == NULL && vd != NULL) {
- ret = ENOTSUP;
- goto out;
- }
-
- if (!unspare && nv != NULL && vd != NULL) {
- ret = EBUSY;
- goto out;
- }
-
- if (nspares == 1) {
- newspares = NULL;
- } else {
- newspares = kmem_alloc((nspares - 1) * sizeof (void *),
- KM_SLEEP);
- for (i = 0, j = 0; i < nspares; i++) {
- if (spares[i] != nv)
- VERIFY(nvlist_dup(spares[i],
- &newspares[j++], KM_SLEEP) == 0);
- }
- }
-
- VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- DATA_TYPE_NVLIST_ARRAY) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- newspares, nspares - 1) == 0);
- for (i = 0; i < nspares - 1; i++)
- nvlist_free(newspares[i]);
- kmem_free(newspares, (nspares - 1) * sizeof (void *));
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
-
-out:
- spa_config_exit(spa, FTAG);
-
- return (ret);
-}
-
-/*
- * Find any device that's done replacing, so we can detach it.
- */
-static vdev_t *
-spa_vdev_replace_done_hunt(vdev_t *vd)
-{
- vdev_t *newvd, *oldvd;
- int c;
-
- for (c = 0; c < vd->vdev_children; c++) {
- oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
- if (oldvd != NULL)
- return (oldvd);
- }
-
- if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
- oldvd = vd->vdev_child[0];
- newvd = vd->vdev_child[1];
-
- mutex_enter(&newvd->vdev_dtl_lock);
- if (newvd->vdev_dtl_map.sm_space == 0 &&
- newvd->vdev_dtl_scrub.sm_space == 0) {
- mutex_exit(&newvd->vdev_dtl_lock);
- return (oldvd);
- }
- mutex_exit(&newvd->vdev_dtl_lock);
- }
-
- return (NULL);
-}
-
-static void
-spa_vdev_replace_done(spa_t *spa)
-{
- vdev_t *vd;
- vdev_t *pvd;
- uint64_t guid;
- uint64_t pguid = 0;
-
- spa_config_enter(spa, RW_READER, FTAG);
-
- while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
- guid = vd->vdev_guid;
- /*
- * If we have just finished replacing a hot spared device, then
- * we need to detach the parent's first child (the original hot
- * spare) as well.
- */
- pvd = vd->vdev_parent;
- if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_id == 0) {
- ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- ASSERT(pvd->vdev_parent->vdev_children == 2);
- pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
- }
- spa_config_exit(spa, FTAG);
- if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
- return;
- if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
- return;
- spa_config_enter(spa, RW_READER, FTAG);
- }
-
- spa_config_exit(spa, FTAG);
-}
-
-/*
- * Update the stored path for this vdev. Dirty the vdev configuration, relying
- * on spa_vdev_enter/exit() to synchronize the labels and cache.
- */
-int
-spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
-{
- vdev_t *rvd, *vd;
- uint64_t txg;
-
- rvd = spa->spa_root_vdev;
-
- txg = spa_vdev_enter(spa);
-
- if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
- /*
- * Determine if this is a reference to a hot spare. In that
- * case, update the path as stored in the spare list.
- */
- nvlist_t **spares;
- uint_t i, nspares;
- if (spa->spa_sparelist != NULL) {
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
- for (i = 0; i < nspares; i++) {
- uint64_t theguid;
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid)
- break;
- }
-
- if (i == nspares)
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
-
- VERIFY(nvlist_add_string(spares[i],
- ZPOOL_CONFIG_PATH, newpath) == 0);
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
- return (spa_vdev_exit(spa, NULL, txg, 0));
- } else {
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
- }
- }
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- spa_strfree(vd->vdev_path);
- vd->vdev_path = spa_strdup(newpath);
-
- vdev_config_dirty(vd->vdev_top);
-
- return (spa_vdev_exit(spa, NULL, txg, 0));
-}
-
-/*
- * ==========================================================================
- * SPA Scrubbing
- * ==========================================================================
- */
-
-static void
-spa_scrub_io_done(zio_t *zio)
-{
- spa_t *spa = zio->io_spa;
-
- zio_data_buf_free(zio->io_data, zio->io_size);
-
- mutex_enter(&spa->spa_scrub_lock);
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
- spa->spa_scrub_errors++;
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-
- if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
- cv_broadcast(&spa->spa_scrub_io_cv);
-
- ASSERT(spa->spa_scrub_inflight >= 0);
-
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static void
-spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
- zbookmark_t *zb)
-{
- size_t size = BP_GET_LSIZE(bp);
- void *data;
-
- mutex_enter(&spa->spa_scrub_lock);
- /*
- * Do not give too much work to vdev(s).
- */
- while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- }
- spa->spa_scrub_inflight++;
- mutex_exit(&spa->spa_scrub_lock);
-
- data = zio_data_buf_alloc(size);
-
- if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
- flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
-
- flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
-
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- spa_scrub_io_done, NULL, priority, flags, zb));
-}
-
-/* ARGSUSED */
-static int
-spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
-{
- blkptr_t *bp = &bc->bc_blkptr;
- vdev_t *vd = spa->spa_root_vdev;
- dva_t *dva = bp->blk_dva;
- int needs_resilver = B_FALSE;
- int d;
-
- if (bc->bc_errno) {
- /*
- * We can't scrub this block, but we can continue to scrub
- * the rest of the pool. Note the error and move along.
- */
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_errors++;
- mutex_exit(&spa->spa_scrub_lock);
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_errors++;
- mutex_exit(&vd->vdev_stat_lock);
-
- return (ERESTART);
- }
-
- ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
-
- ASSERT(vd != NULL);
-
- /*
- * Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
- */
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
- mutex_exit(&vd->vdev_stat_lock);
-
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
- if (DVA_GET_GANG(&dva[d])) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best we can do is look at the
- * pool-wide DTL.
- * XXX -- it would be better to change our
- * allocation policy to ensure that this can't
- * happen.
- */
- vd = spa->spa_root_vdev;
- }
- if (vdev_dtl_contains(&vd->vdev_dtl_map,
- bp->blk_birth, 1))
- needs_resilver = B_TRUE;
- }
- }
-
- if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
- spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_SCRUB, &bc->bc_bookmark);
- else if (needs_resilver)
- spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
- ZIO_FLAG_RESILVER, &bc->bc_bookmark);
-
- return (0);
-}
-
-static void
-spa_scrub_thread(void *arg)
-{
- spa_t *spa = arg;
- callb_cpr_t cprinfo;
- traverse_handle_t *th = spa->spa_scrub_th;
- vdev_t *rvd = spa->spa_root_vdev;
- pool_scrub_type_t scrub_type = spa->spa_scrub_type;
- int error = 0;
- boolean_t complete;
-
- CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
-
- /*
- * If we're restarting due to a snapshot create/delete,
- * wait for that to complete.
- */
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- dprintf("start %s mintxg=%llu maxtxg=%llu\n",
- scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
- spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- vdev_reopen(rvd); /* purge all vdev caches */
- vdev_config_dirty(rvd); /* rewrite all disk labels */
- vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
- spa_config_exit(spa, FTAG);
-
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_errors = 0;
- spa->spa_scrub_active = 1;
- ASSERT(spa->spa_scrub_inflight == 0);
-
- while (!spa->spa_scrub_stop) {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- while (spa->spa_scrub_suspended) {
- spa->spa_scrub_active = 0;
- cv_broadcast(&spa->spa_scrub_cv);
- cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_active = 1;
- }
- CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
-
- if (spa->spa_scrub_restart_txg != 0)
- break;
-
- mutex_exit(&spa->spa_scrub_lock);
- error = traverse_more(th);
- mutex_enter(&spa->spa_scrub_lock);
- if (error != EAGAIN)
- break;
- }
-
- while (spa->spa_scrub_inflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-
- spa->spa_scrub_active = 0;
- cv_broadcast(&spa->spa_scrub_cv);
-
- mutex_exit(&spa->spa_scrub_lock);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
- * AND the spa config lock to synchronize with any config changes
- * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
- */
- if (spa->spa_scrub_restart_txg != 0)
- error = ERESTART;
-
- if (spa->spa_scrub_stop)
- error = EINTR;
-
- /*
- * Even if there were uncorrectable errors, we consider the scrub
- * completed. The downside is that if there is a transient error during
- * a resilver, we won't resilver the data properly to the target. But
- * if the damage is permanent (more likely) we will resilver forever,
- * which isn't really acceptable. Since there is enough information for
- * the user to know what has failed and why, this seems like a more
- * tractable approach.
- */
- complete = (error == 0);
-
- dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
- scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
- spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
- error, spa->spa_scrub_errors, spa->spa_scrub_stop);
-
- mutex_exit(&spa->spa_scrub_lock);
-
- /*
- * If the scrub/resilver completed, update all DTLs to reflect this.
- * Whether it succeeded or not, vacate all temporary scrub DTLs.
- */
- vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
- complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
- vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
- spa_errlog_rotate(spa);
-
- spa_config_exit(spa, FTAG);
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * We may have finished replacing a device.
- * Let the async thread assess this and handle the detach.
- */
- spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
-
- /*
- * If we were told to restart, our final act is to start a new scrub.
- */
- if (error == ERESTART)
- spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
- SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
-
- spa->spa_scrub_type = POOL_SCRUB_NONE;
- spa->spa_scrub_active = 0;
- spa->spa_scrub_thread = NULL;
- cv_broadcast(&spa->spa_scrub_cv);
- CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
- thread_exit();
-}
-
-void
-spa_scrub_suspend(spa_t *spa)
-{
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_suspended++;
- while (spa->spa_scrub_active) {
- cv_broadcast(&spa->spa_scrub_cv);
- cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
- }
- while (spa->spa_scrub_inflight)
- cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-spa_scrub_resume(spa_t *spa)
-{
- mutex_enter(&spa->spa_scrub_lock);
- ASSERT(spa->spa_scrub_suspended != 0);
- if (--spa->spa_scrub_suspended == 0)
- cv_broadcast(&spa->spa_scrub_cv);
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-spa_scrub_restart(spa_t *spa, uint64_t txg)
-{
- /*
- * Something happened (e.g. snapshot create/delete) that means
- * we must restart any in-progress scrubs. The itinerary will
- * fix this properly.
- */
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_restart_txg = txg;
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-int
-spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
-{
- space_seg_t *ss;
- uint64_t mintxg, maxtxg;
- vdev_t *rvd = spa->spa_root_vdev;
-
- if ((uint_t)type >= POOL_SCRUB_TYPES)
- return (ENOTSUP);
-
- mutex_enter(&spa->spa_scrub_lock);
-
- /*
- * If there's a scrub or resilver already in progress, stop it.
- */
- while (spa->spa_scrub_thread != NULL) {
- /*
- * Don't stop a resilver unless forced.
- */
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
- mutex_exit(&spa->spa_scrub_lock);
- return (EBUSY);
- }
- spa->spa_scrub_stop = 1;
- cv_broadcast(&spa->spa_scrub_cv);
- cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
- }
-
- /*
- * Terminate the previous traverse.
- */
- if (spa->spa_scrub_th != NULL) {
- traverse_fini(spa->spa_scrub_th);
- spa->spa_scrub_th = NULL;
- }
-
- if (rvd == NULL) {
- ASSERT(spa->spa_scrub_stop == 0);
- ASSERT(spa->spa_scrub_type == type);
- ASSERT(spa->spa_scrub_restart_txg == 0);
- mutex_exit(&spa->spa_scrub_lock);
- return (0);
- }
-
- mintxg = TXG_INITIAL - 1;
- maxtxg = spa_last_synced_txg(spa) + 1;
-
- mutex_enter(&rvd->vdev_dtl_lock);
-
- if (rvd->vdev_dtl_map.sm_space == 0) {
- /*
- * The pool-wide DTL is empty.
- * If this is a resilver, there's nothing to do except
- * check whether any in-progress replacements have completed.
- */
- if (type == POOL_SCRUB_RESILVER) {
- type = POOL_SCRUB_NONE;
- spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
- }
- } else {
- /*
- * The pool-wide DTL is non-empty.
- * If this is a normal scrub, upgrade to a resilver instead.
- */
- if (type == POOL_SCRUB_EVERYTHING)
- type = POOL_SCRUB_RESILVER;
- }
-
- if (type == POOL_SCRUB_RESILVER) {
- /*
- * Determine the resilvering boundaries.
- *
- * Note: (mintxg, maxtxg) is an open interval,
- * i.e. mintxg and maxtxg themselves are not included.
- *
- * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
- * so we don't claim to resilver a txg that's still changing.
- */
- ss = avl_first(&rvd->vdev_dtl_map.sm_root);
- mintxg = ss->ss_start - 1;
- ss = avl_last(&rvd->vdev_dtl_map.sm_root);
- maxtxg = MIN(ss->ss_end, maxtxg);
- }
-
- mutex_exit(&rvd->vdev_dtl_lock);
-
- spa->spa_scrub_stop = 0;
- spa->spa_scrub_type = type;
- spa->spa_scrub_restart_txg = 0;
-
- if (type != POOL_SCRUB_NONE) {
- spa->spa_scrub_mintxg = mintxg;
- spa->spa_scrub_maxtxg = maxtxg;
- spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
- ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
- ZIO_FLAG_CANFAIL);
- traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
- spa->spa_scrub_thread = thread_create(NULL, 0,
- spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
- }
-
- mutex_exit(&spa->spa_scrub_lock);
-
- return (0);
-}
-
-/*
- * ==========================================================================
- * SPA async task processing
- * ==========================================================================
- */
-
-static void
-spa_async_reopen(spa_t *spa)
-{
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *tvd;
- int c;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- for (c = 0; c < rvd->vdev_children; c++) {
- tvd = rvd->vdev_child[c];
- if (tvd->vdev_reopen_wanted) {
- tvd->vdev_reopen_wanted = 0;
- vdev_reopen(tvd);
- }
- }
-
- spa_config_exit(spa, FTAG);
-}
-
-static void
-spa_async_thread(void *arg)
-{
- spa_t *spa = arg;
- int tasks;
-
- ASSERT(spa->spa_sync_on);
-
- mutex_enter(&spa->spa_async_lock);
- tasks = spa->spa_async_tasks;
- spa->spa_async_tasks = 0;
- mutex_exit(&spa->spa_async_lock);
-
- /*
- * See if the config needs to be updated.
- */
- if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
- mutex_enter(&spa_namespace_lock);
- spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- mutex_exit(&spa_namespace_lock);
- }
-
- /*
- * See if any devices need to be reopened.
- */
- if (tasks & SPA_ASYNC_REOPEN)
- spa_async_reopen(spa);
-
- /*
- * If any devices are done replacing, detach them.
- */
- if (tasks & SPA_ASYNC_REPLACE_DONE)
- spa_vdev_replace_done(spa);
-
- /*
- * Kick off a scrub.
- */
- if (tasks & SPA_ASYNC_SCRUB)
- VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
-
- /*
- * Kick off a resilver.
- */
- if (tasks & SPA_ASYNC_RESILVER)
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- /*
- * Let the world know that we're done.
- */
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_thread = NULL;
- cv_broadcast(&spa->spa_async_cv);
- mutex_exit(&spa->spa_async_lock);
- thread_exit();
-}
-
-void
-spa_async_suspend(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_suspended++;
- while (spa->spa_async_thread != NULL)
- cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
- mutex_exit(&spa->spa_async_lock);
-}
-
-void
-spa_async_resume(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- ASSERT(spa->spa_async_suspended != 0);
- spa->spa_async_suspended--;
- mutex_exit(&spa->spa_async_lock);
-}
-
-static void
-spa_async_dispatch(spa_t *spa)
-{
- mutex_enter(&spa->spa_async_lock);
- if (spa->spa_async_tasks && !spa->spa_async_suspended &&
- spa->spa_async_thread == NULL &&
- rootdir != NULL && !vn_is_readonly(rootdir))
- spa->spa_async_thread = thread_create(NULL, 0,
- spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
- mutex_exit(&spa->spa_async_lock);
-}
-
-void
-spa_async_request(spa_t *spa, int task)
-{
- mutex_enter(&spa->spa_async_lock);
- spa->spa_async_tasks |= task;
- mutex_exit(&spa->spa_async_lock);
-}
-
-/*
- * ==========================================================================
- * SPA syncing routines
- * ==========================================================================
- */
-
-static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
-{
- bplist_t *bpl = &spa->spa_sync_bplist;
- dmu_tx_t *tx;
- blkptr_t blk;
- uint64_t itor = 0;
- zio_t *zio;
- int error;
- uint8_t c = 1;
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
-
- while (bplist_iterate(bpl, &itor, &blk) == 0)
- zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
-
- error = zio_wait(zio);
- ASSERT3U(error, ==, 0);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- bplist_vacate(bpl, tx);
-
- /*
- * Pre-dirty the first block so we sync to convergence faster.
- * (Usually only the first block is needed.)
- */
- dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
- dmu_tx_commit(tx);
-}
-
-static void
-spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
-{
- char *packed = NULL;
- size_t nvsize = 0;
- dmu_buf_t *db;
-
- VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
-
- packed = kmem_alloc(nvsize, KM_SLEEP);
-
- VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
- KM_SLEEP) == 0);
-
- dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
-
- kmem_free(packed, nvsize);
-
- VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- *(uint64_t *)db->db_data = nvsize;
- dmu_buf_rele(db, FTAG);
-}
-
-static void
-spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
-{
- nvlist_t *nvroot;
- nvlist_t **spares;
- int i;
-
- if (!spa->spa_sync_spares)
- return;
-
- /*
- * Update the MOS nvlist describing the list of available spares.
- * spa_validate_spares() will have already made sure this nvlist is
- * valid and the vdevs are labelled appropriately.
- */
- if (spa->spa_spares_object == 0) {
- spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, 1 << 14,
- DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
- VERIFY(zap_update(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
- sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
- }
-
- VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (spa->spa_nspares == 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- NULL, 0) == 0);
- } else {
- spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
- KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++)
- spares[i] = vdev_config_generate(spa,
- spa->spa_spares[i], B_FALSE, B_TRUE);
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- spares, spa->spa_nspares) == 0);
- for (i = 0; i < spa->spa_nspares; i++)
- nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_nspares * sizeof (void *));
- }
-
- spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
- nvlist_free(nvroot);
-
- spa->spa_sync_spares = B_FALSE;
-}
-
-static void
-spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
-{
- nvlist_t *config;
-
- if (list_is_empty(&spa->spa_dirty_list))
- return;
-
- config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
-
- if (spa->spa_config_syncing)
- nvlist_free(spa->spa_config_syncing);
- spa->spa_config_syncing = config;
-
- spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
-}
-
-static void
-spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- spa_t *spa = arg1;
- nvlist_t *nvp = arg2;
- nvpair_t *nvpair;
- objset_t *mos = spa->spa_meta_objset;
- uint64_t zapobj;
-
- mutex_enter(&spa->spa_props_lock);
- if (spa->spa_pool_props_object == 0) {
- zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
- VERIFY(zapobj > 0);
-
- spa->spa_pool_props_object = zapobj;
-
- VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_PROPS, 8, 1,
- &spa->spa_pool_props_object, tx) == 0);
- }
- mutex_exit(&spa->spa_props_lock);
-
- nvpair = NULL;
- while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
- switch (zpool_name_to_prop(nvpair_name(nvpair))) {
- case ZFS_PROP_BOOTFS:
- VERIFY(nvlist_lookup_uint64(nvp,
- nvpair_name(nvpair), &spa->spa_bootfs) == 0);
- VERIFY(zap_update(mos,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
- &spa->spa_bootfs, tx) == 0);
- break;
- }
- }
-}
-
-/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
- */
-void
-spa_sync(spa_t *spa, uint64_t txg)
-{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *bpl = &spa->spa_sync_bplist;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int dirty_vdevs;
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, RW_READER, FTAG);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- /*
- * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * If anything has changed in this txg, push the deferred frees
- * from the previous txg. If not, leave them alone so that we
- * don't generate work on an otherwise idle system.
- */
- if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
- !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
- !txg_list_empty(&dp->dp_sync_tasks, txg))
- spa_sync_deferred_frees(spa, txg);
-
- /*
- * Iterate to convergence.
- */
- do {
- spa->spa_sync_pass++;
-
- spa_sync_config_object(spa, tx);
- spa_sync_spares(spa, tx);
- spa_errlog_sync(spa, txg);
- dsl_pool_sync(dp, txg);
-
- dirty_vdevs = 0;
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
- vdev_sync(vd, txg);
- dirty_vdevs++;
- }
-
- bplist_sync(bpl, tx);
- } while (dirty_vdevs);
-
- bplist_close(bpl);
-
- dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
-
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are any dirty vdevs, sync the uberblock to all vdevs.
- * Otherwise, pick a random top-level vdev that's known to be
- * visible in the config cache (see spa_vdev_add() for details).
- * If the write fails, try the next vdev until we're tried them all.
- */
- if (!list_is_empty(&spa->spa_dirty_list)) {
- VERIFY(vdev_config_sync(rvd, txg) == 0);
- } else {
- int children = rvd->vdev_children;
- int c0 = spa_get_random(children);
- int c;
-
- for (c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
- if (vd->vdev_ms_array == 0)
- continue;
- if (vdev_config_sync(vd, txg) == 0)
- break;
- }
- if (c == children)
- VERIFY(vdev_config_sync(rvd, txg) == 0);
- }
-
- dmu_tx_commit(tx);
-
- /*
- * Clear the dirty config list.
- */
- while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
- vdev_config_clean(vd);
-
- /*
- * Now that the new config has synced transactionally,
- * let it become visible to the config cache.
- */
- if (spa->spa_config_syncing != NULL) {
- spa_config_set(spa, spa->spa_config_syncing);
- spa->spa_config_txg = txg;
- spa->spa_config_syncing = NULL;
- }
-
- /*
- * Make a stable copy of the fully synced uberblock.
- * We use this as the root for pool traversals.
- */
- spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
-
- spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
-
- rw_enter(&spa->spa_traverse_lock, RW_WRITER);
- spa->spa_traverse_wanted = 0;
- spa->spa_ubsync = spa->spa_uberblock;
- rw_exit(&spa->spa_traverse_lock);
-
- spa_scrub_resume(spa); /* resume scrub with new ubsync */
-
- /*
- * Clean up the ZIL records for the synced txg.
- */
- dsl_pool_zil_clean(dp);
-
- /*
- * Update usable space statistics.
- */
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
- vdev_sync_done(vd, txg);
-
- /*
- * It had better be the case that we didn't dirty anything
- * since vdev_config_sync().
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
- ASSERT(bpl->bpl_queue == NULL);
-
- spa_config_exit(spa, FTAG);
-
- /*
- * If any async tasks have been requested, kick them off.
- */
- spa_async_dispatch(spa);
-}
-
-/*
- * Sync all pools. We don't want to hold the namespace lock across these
- * operations, so we take a reference on the spa_t and drop the lock during the
- * sync.
- */
-void
-spa_sync_allpools(void)
-{
- spa_t *spa = NULL;
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE)
- continue;
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- txg_wait_synced(spa_get_dsl(spa), 0);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-/*
- * ==========================================================================
- * Miscellaneous routines
- * ==========================================================================
- */
-
-/*
- * Remove all pools in the system.
- */
-void
-spa_evict_all(void)
-{
- spa_t *spa;
-
- /*
- * Remove all cached state. All pools should be closed now,
- * so every spa in the AVL tree should be unreferenced.
- */
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(NULL)) != NULL) {
- /*
- * Stop async tasks. The async thread may need to detach
- * a device that's been replaced, which requires grabbing
- * spa_namespace_lock, so we must drop it here.
- */
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- spa_async_suspend(spa);
- VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
- mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
-
- if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
- spa_unload(spa);
- spa_deactivate(spa);
- }
- spa_remove(spa);
- }
- mutex_exit(&spa_namespace_lock);
-}
-
-vdev_t *
-spa_lookup_by_guid(spa_t *spa, uint64_t guid)
-{
- return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
-}
-
-void
-spa_upgrade(spa_t *spa)
-{
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- /*
- * This should only be called for a non-faulted pool, and since a
- * future version would result in an unopenable pool, this shouldn't be
- * possible.
- */
- ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
-
- spa->spa_uberblock.ub_version = ZFS_VERSION;
- vdev_config_dirty(spa->spa_root_vdev);
-
- spa_config_exit(spa, FTAG);
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-}
-
-boolean_t
-spa_has_spare(spa_t *spa, uint64_t guid)
-{
- int i;
- uint64_t spareguid;
-
- for (i = 0; i < spa->spa_nspares; i++)
- if (spa->spa_spares[i]->vdev_guid == guid)
- return (B_TRUE);
-
- for (i = 0; i < spa->spa_pending_nspares; i++) {
- if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
- ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
- spareguid == guid)
- return (B_TRUE);
- }
-
- return (B_FALSE);
-}
-
-int
-spa_set_props(spa_t *spa, nvlist_t *nvp)
-{
- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
- spa, nvp, 3));
-}
-
-int
-spa_get_props(spa_t *spa, nvlist_t **nvp)
-{
- zap_cursor_t zc;
- zap_attribute_t za;
- objset_t *mos = spa->spa_meta_objset;
- zfs_source_t src;
- zfs_prop_t prop;
- nvlist_t *propval;
- uint64_t value;
- int err;
-
- VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- mutex_enter(&spa->spa_props_lock);
- /* If no props object, then just return empty nvlist */
- if (spa->spa_pool_props_object == 0) {
- mutex_exit(&spa->spa_props_lock);
- return (0);
- }
-
- for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
- (err = zap_cursor_retrieve(&zc, &za)) == 0;
- zap_cursor_advance(&zc)) {
-
- if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
- continue;
-
- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- switch (za.za_integer_length) {
- case 8:
- if (zfs_prop_default_numeric(prop) ==
- za.za_first_integer)
- src = ZFS_SRC_DEFAULT;
- else
- src = ZFS_SRC_LOCAL;
- value = za.za_first_integer;
-
- if (prop == ZFS_PROP_BOOTFS) {
- dsl_pool_t *dp;
- dsl_dataset_t *ds = NULL;
- char strval[MAXPATHLEN];
-
- dp = spa_get_dsl(spa);
- rw_enter(&dp->dp_config_rwlock, RW_READER);
- if ((err = dsl_dataset_open_obj(dp,
- za.za_first_integer, NULL, DS_MODE_NONE,
- FTAG, &ds)) != 0) {
- rw_exit(&dp->dp_config_rwlock);
- break;
- }
- dsl_dataset_name(ds, strval);
- dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- rw_exit(&dp->dp_config_rwlock);
-
- VERIFY(nvlist_add_uint64(propval,
- ZFS_PROP_SOURCE, src) == 0);
- VERIFY(nvlist_add_string(propval,
- ZFS_PROP_VALUE, strval) == 0);
- } else {
- VERIFY(nvlist_add_uint64(propval,
- ZFS_PROP_SOURCE, src) == 0);
- VERIFY(nvlist_add_uint64(propval,
- ZFS_PROP_VALUE, value) == 0);
- }
- VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
- propval) == 0);
- break;
- }
- nvlist_free(propval);
- }
- zap_cursor_fini(&zc);
- mutex_exit(&spa->spa_props_lock);
- if (err && err != ENOENT) {
- nvlist_free(*nvp);
- return (err);
- }
-
- return (0);
-}
-
-/*
- * If the bootfs property value is dsobj, clear it.
- */
-void
-spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
-{
- if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
- VERIFY(zap_remove(spa->spa_meta_objset,
- spa->spa_pool_props_object,
- zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
- spa->spa_bootfs = 0;
- }
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
deleted file mode 100644
index 9e8bcf3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/nvpair.h>
-#include <sys/uio.h>
-#include <sys/fs/zfs.h>
-#include <sys/vdev_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/utsname.h>
-#include <sys/sunddi.h>
-#ifdef _KERNEL
-#include <sys/kobj.h>
-#endif
-
-/*
- * Pool configuration repository.
- *
- * The configuration for all pools, in addition to being stored on disk, is
- * stored in /etc/zfs/zpool.cache as a packed nvlist. The kernel maintains
- * this list as pools are created, destroyed, or modified.
- *
- * We have a single nvlist which holds all the configuration information. When
- * the module loads, we read this information from the cache and populate the
- * SPA namespace. This namespace is maintained independently in spa.c.
- * Whenever the namespace is modified, or the configuration of a pool is
- * changed, we call spa_config_sync(), which walks through all the active pools
- * and writes the configuration to disk.
- */
-
-static uint64_t spa_config_generation = 1; /* incremented by spa_config_sync(); compared by spa_all_configs() */
-
-/*
- * This can be overridden in userland to preserve an alternate namespace for
- * userland pools when doing testing.
- */
-const char *spa_config_dir = ZPOOL_CACHE_DIR;
-
-/*
- * Called when the module is first loaded, this routine loads the configuration
- * file into the SPA namespace.  It does not actually open or load the pools; it
- * only populates the namespace.
- *
- * Takes no arguments and returns nothing.  A failure to open, size, read or
- * unpack the cache file is reported through ZFS_LOG and leaves the namespace
- * unchanged.  Entries that are not nvlists, and pools whose names are already
- * present in the namespace, are skipped.
- */
-void
-spa_config_load(void)
-{
-	void *buf = NULL;
-	nvlist_t *nvlist, *child;
-	nvpair_t *nvpair;
-	spa_t *spa;
-	char pathname[128];
-	struct _buf *file;
-	uint64_t fsize;
-
-	/*
-	 * Open the configuration file.
-	 */
-	(void) snprintf(pathname, sizeof (pathname), "%s/%s",
-	    spa_config_dir, ZPOOL_CACHE_FILE);
-
-	file = kobj_open_file(pathname);
-	if (file == (struct _buf *)-1) {
-		ZFS_LOG(1, "Cannot open %s.", pathname);
-		return;
-	}
-
-	if (kobj_get_filesize(file, &fsize) != 0) {
-		ZFS_LOG(1, "Cannot get size of %s.", pathname);
-		goto out;
-	}
-
-	buf = kmem_alloc(fsize, KM_SLEEP);
-
-	/*
-	 * Read the nvlist from the file.
-	 */
-	if (kobj_read_file(file, buf, fsize, 0) < 0) {
-		ZFS_LOG(1, "Cannot read %s.", pathname);
-		goto out;
-	}
-
-	/*
-	 * Unpack the nvlist.  Log the failure like every other error path in
-	 * this function so that an unusable cache file does not go unnoticed.
-	 */
-	if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) {
-		ZFS_LOG(1, "Cannot unpack %s.", pathname);
-		goto out;
-	}
-
-	ZFS_LOG(1, "File %s loaded.", pathname);
-
-	/*
-	 * Iterate over all elements in the nvlist, creating a new spa_t for
-	 * each one with the specified configuration.
-	 */
-	mutex_enter(&spa_namespace_lock);
-	nvpair = NULL;
-	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
-
-		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
-			continue;
-
-		VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
-
-		if (spa_lookup(nvpair_name(nvpair)) != NULL)
-			continue;
-		spa = spa_add(nvpair_name(nvpair), NULL);
-
-		/*
-		 * We blindly duplicate the configuration here.  If it's
-		 * invalid, we will catch it when the pool is first opened.
-		 */
-		VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
-	}
-	mutex_exit(&spa_namespace_lock);
-
-	nvlist_free(nvlist);
-
-out:
-	/* buf is still NULL if we bailed out before the allocation */
-	if (buf != NULL)
-		kmem_free(buf, fsize);
-
-	kobj_close_file(file);
-}
-
-/*
- * Rewrite the on-disk pool cache from the in-core namespace.  The caller must
- * hold spa_namespace_lock (asserted).
- *
- * Every pool that has a config, a name and no alternate root is added to one
- * nvlist, which is packed with XDR encoding and written to the temporary cache
- * file.  Only when both the write and the fsync succeed is the temporary file
- * renamed over the real cache file.  The generation counter is bumped whether
- * or not the write succeeded.
- */
-void
-spa_config_sync(void)
-{
-	int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
-	char tmppath[128];
-	char cachepath[128];
-	nvlist_t *all;
-	size_t packedlen;
-	char *packed;
-	vnode_t *vp;
-	spa_t *spa;
-
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
-	VERIFY(nvlist_alloc(&all, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-	/* Gather the cacheable pools, ignoring those with alternate roots. */
-	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
-		mutex_enter(&spa->spa_config_cache_lock);
-		if (spa->spa_config && spa->spa_name &&
-		    spa->spa_root == NULL) {
-			VERIFY(nvlist_add_nvlist(all, spa->spa_name,
-			    spa->spa_config) == 0);
-		}
-		mutex_exit(&spa->spa_config_cache_lock);
-	}
-
-	/* Serialize the collected configs into a single buffer. */
-	VERIFY(nvlist_size(all, &packedlen, NV_ENCODE_XDR) == 0);
-	packed = kmem_alloc(packedlen, KM_SLEEP);
-	VERIFY(nvlist_pack(all, &packed, &packedlen, NV_ENCODE_XDR,
-	    KM_SLEEP) == 0);
-
-	/*
-	 * Traditional 'write to temporary file, sync, move over original' so
-	 * that the cache file is always consistent.
-	 */
-	(void) snprintf(tmppath, sizeof (tmppath), "%s/%s", spa_config_dir,
-	    ZPOOL_CACHE_TMP);
-
-	if (vn_open(tmppath, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT,
-	    0) == 0) {
-		if (vn_rdwr(UIO_WRITE, vp, packed, packedlen, 0, UIO_SYSSPACE,
-		    0, RLIM64_INFINITY, kcred, NULL) == 0) {
-			if (VOP_FSYNC(vp, FSYNC, kcred) == 0) {
-				(void) snprintf(cachepath, sizeof (cachepath),
-				    "%s/%s", spa_config_dir, ZPOOL_CACHE_FILE);
-				(void) vn_rename(tmppath, cachepath,
-				    UIO_SYSSPACE);
-			}
-		}
-
-		(void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
-		VN_RELE(vp);
-	}
-
-	/* Reached on every path, including a failed open. */
-	(void) vn_remove(tmppath, UIO_SYSSPACE, RMFILE);
-	spa_config_generation++;
-
-	kmem_free(packed, packedlen);
-	nvlist_free(all);
-}
-
-/*
- * Sigh.  Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
- * and we don't want to allow the local zone to see all the pools anyway.
- * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
- * information for all pools visible within the zone.
- *
- * '*generation' is the caller's last-seen generation.  If it still matches
- * the current one, NULL is returned and nothing is built.  Otherwise a new
- * nvlist keyed by pool name is returned and '*generation' is updated.
- */
-nvlist_t *
-spa_all_configs(uint64_t *generation)
-{
-	nvlist_t *visible;
-	spa_t *spa;
-
-	if (*generation == spa_config_generation)
-		return (NULL);
-
-	VERIFY(nvlist_alloc(&visible, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-	mutex_enter(&spa_namespace_lock);
-	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
-		/* skip pools this zone is not allowed to see */
-		if (!INGLOBALZONE(curproc) &&
-		    !zone_dataset_visible(spa_name(spa), NULL))
-			continue;
-
-		mutex_enter(&spa->spa_config_cache_lock);
-		VERIFY(nvlist_add_nvlist(visible, spa_name(spa),
-		    spa->spa_config) == 0);
-		mutex_exit(&spa->spa_config_cache_lock);
-	}
-	mutex_exit(&spa_namespace_lock);
-
-	*generation = spa_config_generation;
-
-	return (visible);
-}
-
-/*
- * Install 'config' as the pool's cached configuration, freeing whatever
- * configuration was cached before.  Both steps happen under
- * spa_config_cache_lock.
- */
-void
-spa_config_set(spa_t *spa, nvlist_t *config)
-{
-	nvlist_t *stale;
-
-	mutex_enter(&spa->spa_config_cache_lock);
-	stale = spa->spa_config;
-	if (stale != NULL)
-		nvlist_free(stale);
-	spa->spa_config = config;
-	mutex_exit(&spa->spa_config_cache_lock);
-}
-
-/*
- * Generate the pool's configuration based on the current in-core state.
- * We infer whether to generate a complete config or just one top-level config
- * based on whether vd is the root vdev.
- *
- * A NULL 'vd' is treated as the root vdev, and a 'txg' of -1ULL selects
- * spa->spa_config_txg.  'getstats' is passed through to
- * vdev_config_generate().  The function asserts
- * spa_config_held(spa, RW_READER) and returns the nvlist it allocates.
- *
- * For a non-root 'vd' the config additionally carries the top-level guid and
- * the vdev's own guid (plus the is-spare flag when set), and the vdev tree
- * is generated from vd's top-level vdev.
- */
-nvlist_t *
-spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
-{
- nvlist_t *config, *nvroot;
- vdev_t *rvd = spa->spa_root_vdev;
- unsigned long hostid = 0;
-
- ASSERT(spa_config_held(spa, RW_READER));
-
- if (vd == NULL)
- vd = rvd;
-
- /*
- * If txg is -1, report the current value of spa->spa_config_txg.
- */
- if (txg == -1ULL)
- txg = spa->spa_config_txg;
-
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
- spa_name(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- spa_state(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- txg) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- spa_guid(spa)) == 0);
- (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); /* result ignored; hostid was initialized to 0 */
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
- hostid) == 0);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
- utsname.nodename) == 0);
-
- if (vd != rvd) {
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
- vd->vdev_top->vdev_guid) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
- 1ULL) == 0);
- vd = vd->vdev_top; /* label contains top config */
- }
-
- nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE);
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
- nvlist_free(nvroot);
-
- return (config);
-}
-
-/*
- * Update all disk labels, generate a fresh config based on the current
- * in-core state, and sync the global config cache.
- *
- * 'what' selects the pass: SPA_CONFIG_UPDATE_POOL dirties the root vdev's
- * config and is then followed by a second pass that behaves as if 'what'
- * were SPA_CONFIG_UPDATE_VDEVS.  Any other value runs a single pass that
- * initializes top-level vdevs whose vdev_ms_array is still 0.  The caller
- * must hold spa_namespace_lock (asserted on every pass).
- */
-void
-spa_config_update(spa_t *spa, int what)
-{
-	for (;;) {
-		vdev_t *rvd = spa->spa_root_vdev;
-		uint64_t txg;
-		int c;
-
-		ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
-		spa_config_enter(spa, RW_WRITER, FTAG);
-		txg = spa_last_synced_txg(spa) + 1;
-		if (what == SPA_CONFIG_UPDATE_POOL) {
-			vdev_config_dirty(rvd);
-		} else {
-			/*
-			 * If we have top-level vdevs that were added but have
-			 * not yet been prepared for allocation, do that now.
-			 * (It's safe now because the config cache is up to
-			 * date, so it will be able to translate the new DVAs.)
-			 * See comments in spa_vdev_add() for full details.
-			 */
-			for (c = 0; c < rvd->vdev_children; c++) {
-				vdev_t *tvd = rvd->vdev_child[c];
-
-				if (tvd->vdev_ms_array != 0)
-					continue;
-				vdev_init(tvd, txg);
-				vdev_config_dirty(tvd);
-			}
-		}
-		spa_config_exit(spa, FTAG);
-
-		/*
-		 * Wait for the mosconfig to be regenerated and synced.
-		 */
-		txg_wait_synced(spa->spa_dsl_pool, txg);
-
-		/*
-		 * Update the global config cache to reflect the new mosconfig.
-		 */
-		spa_config_sync();
-
-		/* A pool-level update is followed by one vdev-level pass. */
-		if (what != SPA_CONFIG_UPDATE_POOL)
-			break;
-		what = SPA_CONFIG_UPDATE_VDEVS;
-	}
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
deleted file mode 100644
index c52acaf..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Routines to manage the on-disk persistent error log.
- *
- * Each pool stores a log of all logical data errors seen during normal
- * operation. This is actually the union of two distinct logs: the last log,
- * and the current log. All errors seen are logged to the current log. When a
- * scrub completes, the current log becomes the last log, the last log is thrown
- * out, and the current log is reinitialized. This way, if an error is somehow
- * corrected, a new scrub will show that it no longer exists, and will be
- * deleted from the log when the scrub completes.
- *
- * The log is stored using a ZAP object whose key is a string form of the
- * zbookmark tuple (objset, object, level, blkid), and whose contents is an
- * optional 'objset:object' human-readable string describing the data. When an
- * error is first logged, this string will be empty, indicating that no name is
- * known. This prevents us from having to issue a potentially large amount of
- * I/O to discover the object name during an error path. Instead, we do the
- * calculation when the data is requested, storing the result so future queries
- * will be faster.
- *
- * This log is then shipped into an nvlist where the key is the dataset name and
- * the value is the object name. Userland is then responsible for uniquifying
- * this list and displaying it to the user.
- */
-
-#include <sys/dmu_tx.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexadecimal numbers that don't overflow.
- *
- * Parsing stops at the first character that is not in [0-9a-f] (or at the
- * terminating NUL); '*nptr' is set to point at that character.  Returns the
- * accumulated value, which is 0 when no digit was consumed.
- */
-#ifdef _KERNEL
-static uint64_t
-_strtonum(char *str, char **nptr)
-{
-	uint64_t result = 0;
-	char *p;
-
-	for (p = str; *p != '\0'; p++) {
-		int nibble;
-
-		if (*p >= '0' && *p <= '9')
-			nibble = *p - '0';
-		else if (*p >= 'a' && *p <= 'f')
-			nibble = 10 + *p - 'a';
-		else
-			break;
-
-		result = result * 16 + nibble;
-	}
-
-	*nptr = p;
-
-	return (result);
-}
-#endif
-
-/*
- * Format bookmark 'zb' into 'buf' (at most 'len' bytes) as four
- * colon-separated lowercase hex fields: objset:object:level:blkid.
- */
-static void
-bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
-{
-	u_longlong_t objset = (u_longlong_t)zb->zb_objset;
-	u_longlong_t object = (u_longlong_t)zb->zb_object;
-	u_longlong_t level = (u_longlong_t)zb->zb_level;
-	u_longlong_t blkid = (u_longlong_t)zb->zb_blkid;
-
-	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
-	    objset, object, level, blkid);
-}
-
-/*
- * Parse a string of the form produced by bookmark_to_name() back into 'zb'.
- * The three ':' separators and the terminating NUL are only checked by
- * ASSERT.
- */
-#ifdef _KERNEL
-static void
-name_to_bookmark(char *buf, zbookmark_t *zb)
-{
-	char *end;
-
-	zb->zb_objset = _strtonum(buf, &end);
-	ASSERT(*end == ':');
-	zb->zb_object = _strtonum(end + 1, &end);
-	ASSERT(*end == ':');
-	zb->zb_level = (int)_strtonum(end + 1, &end);
-	ASSERT(*end == ':');
-	zb->zb_blkid = _strtonum(end + 1, &end);
-	ASSERT(*end == '\0');
-}
-#endif
-
-/*
- * Log an uncorrectable error to the persistent error log.  The bookmark of
- * the zio's logical parent is added to one of the spa's in-core pending
- * lists; spa_errlog_sync() later writes those lists to disk.
- *
- * Nothing is recorded while the pool is being try-imported, and a bookmark
- * that is already present in the chosen list is not added a second time.
- */
-void
-spa_log_error(spa_t *spa, zio_t *zio)
-{
-	zbookmark_t *zb = &zio->io_logical->io_bookmark;
-	spa_error_entry_t key;
-	spa_error_entry_t *entry;
-	avl_tree_t *tree;
-	avl_index_t where;
-
-	/*
-	 * If we are trying to import a pool, ignore any errors, as we won't be
-	 * writing to the pool any time soon.
-	 */
-	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
-		return;
-
-	mutex_enter(&spa->spa_errlist_lock);
-
-	/*
-	 * While a scrub is active, or has finished but not yet been rotated,
-	 * errors go to the scrub list rather than the last list.
-	 */
-	tree = (spa->spa_scrub_active || spa->spa_scrub_finished) ?
-	    &spa->spa_errlist_scrub : &spa->spa_errlist_last;
-
-	key.se_bookmark = *zb;
-	if (avl_find(tree, &key, &where) == NULL) {
-		entry = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
-		entry->se_bookmark = *zb;
-		avl_insert(tree, entry, where);
-	}
-
-	mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Return the number of errors currently in the error log.  This is the sum
- * of the entries in both on-disk logs and both in-core pending lists, since
- * the union of these is not known until we reach userland.  The on-disk last
- * log is left out while a finished scrub is waiting to be rotated, and a log
- * whose zap_count() fails contributes nothing.
- */
-uint64_t
-spa_get_errlog_size(spa_t *spa)
-{
-	uint64_t nerrors = 0, n;
-
-	mutex_enter(&spa->spa_errlog_lock);
-	if (spa->spa_errlog_scrub != 0) {
-		if (zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
-		    &n) == 0)
-			nerrors += n;
-	}
-
-	if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished) {
-		if (zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
-		    &n) == 0)
-			nerrors += n;
-	}
-	mutex_exit(&spa->spa_errlog_lock);
-
-	mutex_enter(&spa->spa_errlist_lock);
-	nerrors += avl_numnodes(&spa->spa_errlist_last);
-	nerrors += avl_numnodes(&spa->spa_errlist_scrub);
-	mutex_exit(&spa->spa_errlist_lock);
-
-	return (nerrors);
-}
-
-#ifdef _KERNEL
-/*
- * Copy every bookmark stored in on-disk error log 'obj' out to the user
- * array at 'addr'.
- *
- * '*count' is the number of free zbookmark_t slots remaining.  Each entry is
- * written to slot (*count - 1) and '*count' is then decremented, so the
- * array is filled from its end towards its start.
- *
- * Returns 0 on success (including obj == 0, which means there is no log),
- * ENOMEM when the array runs out of slots, or EFAULT when copyout() fails.
- * The ZAP cursor is released on every return path.
- */
-static int
-process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
-{
-	zap_cursor_t zc;
-	zap_attribute_t za;
-	zbookmark_t zb;
-
-	if (obj == 0)
-		return (0);
-
-	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
-	    zap_cursor_retrieve(&zc, &za) == 0;
-	    zap_cursor_advance(&zc)) {
-
-		if (*count == 0) {
-			zap_cursor_fini(&zc);
-			return (ENOMEM);
-		}
-
-		name_to_bookmark(za.za_name, &zb);
-
-		if (copyout(&zb, (char *)addr +
-		    (*count - 1) * sizeof (zbookmark_t),
-		    sizeof (zbookmark_t)) != 0) {
-			/* don't leak the cursor on the error path */
-			zap_cursor_fini(&zc);
-			return (EFAULT);
-		}
-
-		*count -= 1;
-	}
-
-	zap_cursor_fini(&zc);
-
-	return (0);
-}
-
-/*
- * Copy the bookmark of every entry in the in-core error list 'list' out to
- * the user array at 'addr'.  Each entry goes to slot (*count - 1), after
- * which '*count' is decremented.  Returns 0 on success, ENOMEM when no slots
- * remain, or EFAULT when copyout() fails.
- */
-static int
-process_error_list(avl_tree_t *list, void *addr, size_t *count)
-{
-	spa_error_entry_t *se = avl_first(list);
-
-	while (se != NULL) {
-		char *slot;
-
-		if (*count == 0)
-			return (ENOMEM);
-
-		slot = (char *)addr + (*count - 1) * sizeof (zbookmark_t);
-		if (copyout(&se->se_bookmark, slot,
-		    sizeof (zbookmark_t)) != 0)
-			return (EFAULT);
-
-		*count -= 1;
-		se = AVL_NEXT(list, se);
-	}
-
-	return (0);
-}
-#endif
-
-/*
- * Copy all known errors to userland as an array of bookmarks.  This is
- * actually a union of the on-disk last log and current log, as well as any
- * pending error requests.
- *
- * Because the act of reading the on-disk log could cause errors to be
- * generated, we have two separate locks: one for the error log and one for the
- * in-core error lists.  We only need the error list lock to log an error, so
- * we grab the error log lock while we read the on-disk logs, and only pick up
- * the error list lock when we are finished.
- *
- * Returns 0 or the first error reported by a helper.  Outside the kernel
- * this is a no-op that returns 0.
- */
-int
-spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
-{
-	int err = 0;
-
-#ifdef _KERNEL
-	mutex_enter(&spa->spa_errlog_lock);
-
-	/* on-disk logs first; the last log is skipped pending a rotation */
-	err = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
-	if (err == 0 && !spa->spa_scrub_finished) {
-		err = process_error_log(spa, spa->spa_errlog_last, uaddr,
-		    count);
-	}
-
-	/* then the in-core pending lists */
-	mutex_enter(&spa->spa_errlist_lock);
-	if (err == 0) {
-		err = process_error_list(&spa->spa_errlist_scrub, uaddr,
-		    count);
-		if (err == 0) {
-			err = process_error_list(&spa->spa_errlist_last,
-			    uaddr, count);
-		}
-	}
-	mutex_exit(&spa->spa_errlist_lock);
-
-	mutex_exit(&spa->spa_errlog_lock);
-#endif
-
-	return (err);
-}
-
-/*
- * Called when a scrub completes. This simply sets spa_scrub_finished under
- * spa_errlist_lock; spa_log_error() consults that flag to decide which AVL
- * tree new errors are added to. The flag is asserted to be clear on entry.
- * spa_errlog_sync() is responsible for actually
- * syncing the changes to the underlying objects.
- */
-void
-spa_errlog_rotate(spa_t *spa)
-{
- mutex_enter(&spa->spa_errlist_lock);
-
- ASSERT(!spa->spa_scrub_finished);
- spa->spa_scrub_finished = B_TRUE;
-
- mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Throw away every pending in-core error, first from the last list and then
- * from the scrub list.  Called when unloading a faulted pool, as the errors
- * encountered during the open cannot be synced to disk.
- */
-void
-spa_errlog_drain(spa_t *spa)
-{
-	avl_tree_t *lists[2];
-	spa_error_entry_t *se;
-	void *cookie;
-	int i;
-
-	lists[0] = &spa->spa_errlist_last;
-	lists[1] = &spa->spa_errlist_scrub;
-
-	mutex_enter(&spa->spa_errlist_lock);
-
-	for (i = 0; i < 2; i++) {
-		cookie = NULL;	/* each tree walk needs a fresh cookie */
-		while ((se = avl_destroy_nodes(lists[i], &cookie)) != NULL)
-			kmem_free(se, sizeof (spa_error_entry_t));
-	}
-
-	mutex_exit(&spa->spa_errlist_lock);
-}
-
-/*
- * Write the errors in list 't' to the on-disk log '*obj' and then empty the
- * list.  An empty list is a no-op.  If '*obj' is 0 a new error-log ZAP object
- * is created in 'tx' and its number stored back through 'obj'.  Each entry is
- * keyed by its bookmark string; the value is the entry's name, or "" when it
- * has none.
- */
-static void
-sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
-{
-	spa_error_entry_t *se;
-	char key[64];
-	void *cookie = NULL;
-
-	if (avl_numnodes(t) == 0)
-		return;
-
-	/* create log if necessary */
-	if (*obj == 0) {
-		*obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
-		    DMU_OT_NONE, 0, tx);
-	}
-
-	/* add errors to the current log */
-	se = avl_first(t);
-	while (se != NULL) {
-		char *name;
-
-		if (se->se_name)
-			name = se->se_name;
-		else
-			name = "";
-
-		bookmark_to_name(&se->se_bookmark, key, sizeof (key));
-
-		(void) zap_update(spa->spa_meta_objset, *obj, key, 1,
-		    strlen(name) + 1, name, tx);
-
-		se = AVL_NEXT(t, se);
-	}
-
-	/* purge the error list */
-	while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
-		kmem_free(se, sizeof (spa_error_entry_t));
-}
-
-/*
- * Sync the error log out to disk. This is a little tricky because the act of
- * writing the error log requires the spa_errlist_lock. So, we need to lock the
- * error lists, take a copy of the lists, and then reinitialize them. Then, we
- * drop the error list lock and take the error log lock, at which point we
- * do the errlog processing. Then, if we encounter an I/O error during this
- * process, we can successfully add the error to the list. Note that this will
- * result in the perpetual recycling of errors, but it is an unlikely situation
- * and not a performance critical operation.
- *
- * All on-disk updates are made in a transaction created with
- * dmu_tx_create_assigned() for 'txg' and committed before returning. The
- * function returns early, without creating a transaction, when both pending
- * lists are empty and no rotation has been requested.
- */
-void
-spa_errlog_sync(spa_t *spa, uint64_t txg)
-{
- dmu_tx_t *tx;
- avl_tree_t scrub, last;
- int scrub_finished;
-
- mutex_enter(&spa->spa_errlist_lock);
-
- /*
- * Bail out early under normal circumstances.
- */
- if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
- avl_numnodes(&spa->spa_errlist_last) == 0 &&
- !spa->spa_scrub_finished) {
- mutex_exit(&spa->spa_errlist_lock);
- return;
- }
-
- spa_get_errlists(spa, &last, &scrub); /* presumably the copy-and-reinitialize step described above */
- scrub_finished = spa->spa_scrub_finished;
- spa->spa_scrub_finished = B_FALSE; /* consume the request made by spa_errlog_rotate() */
-
- mutex_exit(&spa->spa_errlist_lock);
- mutex_enter(&spa->spa_errlog_lock);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- /*
- * Sync out the current list of errors.
- */
- sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
-
- /*
- * Rotate the log if necessary. The old last-log object (if any) is
- * freed, the scrub log object takes its place, and the pending scrub
- * errors are synced into what is now the last log.
- */
- if (scrub_finished) {
- if (spa->spa_errlog_last != 0)
- VERIFY(dmu_object_free(spa->spa_meta_objset,
- spa->spa_errlog_last, tx) == 0);
- spa->spa_errlog_last = spa->spa_errlog_scrub;
- spa->spa_errlog_scrub = 0;
-
- sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
- }
-
- /*
- * Sync out any pending scrub errors. After a rotation the local scrub
- * list is already empty, so this call then does nothing.
- */
- sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
-
- /*
- * Update the MOS to reflect the new values.
- */
- (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
- &spa->spa_errlog_last, tx);
- (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
- &spa->spa_errlog_scrub, tx);
-
- dmu_tx_commit(tx);
-
- mutex_exit(&spa->spa_errlog_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
deleted file mode 100644
index 6642801..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa_impl.h>
-#include <sys/zap.h>
-#include <sys/dsl_synctask.h>
-
-/*
- * Routines to manage the on-disk history log.
- *
- * The history log is stored as a dmu object containing
- * <packed record length, record nvlist> tuples.
- *
- * Where "record nvlist" is a nvlist containing uint64_ts and strings, and
- * "packed record length" is the packed length of the "record nvlist" stored
- * as a little endian uint64_t.
- *
- * The log is implemented as a ring buffer, though the original creation
- * of the pool ('zpool create') is never overwritten.
- *
- * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
- * of 'spa_history' stores the offsets for logging/retrieving history as
- * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
- * where the 'zpool create' record is stored. This allows us to never
- * overwrite the original creation of the pool. 'sh_phys_max_off' is the
- * physical ending offset in bytes of the log. This tells you the length of
- * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
- * is added, 'sh_eof' is incremented by the size of the record.
- * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
- * This is where the consumer should start reading from after reading in
- * the 'zpool create' portion of the log.
- *
- * 'sh_records_lost' keeps track of how many records have been overwritten
- * and permanently lost.
- */
-
-typedef enum history_log_type {
- LOG_CMD_CREATE, /* chosen by spa_history_log() when pool_create is nonzero */
- LOG_CMD_NO_CREATE /* chosen by spa_history_log() for every other record */
-} history_log_type_t;
-
-typedef struct history_arg {
- const char *ha_history_str; /* command string to log; NULL makes the sync function return at once */
- history_log_type_t ha_log_type; /* whether this record is the pool-creation record */
-} history_arg_t;
-
-/*
- * Convert logical log offset 'log_off' to a physical offset in the history
- * object.  The region after the first sh_pool_create_len bytes is treated
- * as a ring that ends at sh_phys_max_off.
- */
-static uint64_t
-spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
-{
-	uint64_t create_len = shpp->sh_pool_create_len;
-	uint64_t ring_len = shpp->sh_phys_max_off - create_len;
-	uint64_t ring_off = (log_off - create_len) % ring_len;
-
-	return (ring_off + create_len);
-}
-
-/*
- * Allocate the pool's history object in 'tx', register it in the pool
- * directory under DMU_POOL_HISTORY, and set the maximum physical size of the
- * log in its bonus buffer.  spa->spa_history must be 0 on entry (asserted).
- */
-void
-spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
-{
-	objset_t *mos = spa->spa_meta_objset;
-	spa_history_phys_t *shpp;
-	dmu_buf_t *dbp;
-	uint64_t maxoff;
-
-	ASSERT(spa->spa_history == 0);
-	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
-	    SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
-	    sizeof (spa_history_phys_t), tx);
-
-	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
-	    sizeof (uint64_t), 1, &spa->spa_history, tx) == 0);
-
-	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
-	ASSERT(dbp->db_size >= sizeof (spa_history_phys_t));
-
-	shpp = dbp->db_data;
-	dmu_buf_will_dirty(dbp, tx);
-
-	/*
-	 * Figure out maximum size of history log.  We set it at
-	 * 1% of pool size, with a max of 32MB and min of 128KB.
-	 */
-	maxoff = spa_get_dspace(spa) / 100;
-	maxoff = MIN(maxoff, 32<<20);
-	maxoff = MAX(maxoff, 128<<10);
-	shpp->sh_phys_max_off = maxoff;
-
-	dmu_buf_rele(dbp, FTAG);
-}
-
-/*
- * Change 'sh_bof' to the beginning of the next record.
- *
- * The length prefix of the record at the current BOF is read from the
- * history object.  If the prefix straddles the physical end of the log, the
- * remainder is read from offset sh_pool_create_len, where the ring wraps.
- * 'sh_bof' is then advanced past the prefix and the record, and
- * 'sh_records_lost' is incremented.
- *
- * Returns 0 on success or the error from dmu_read(), in which case 'shpp'
- * is left unmodified.
- */
-static int
-spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
-{
-	objset_t *mos = spa->spa_meta_objset;
-	uint64_t firstread, reclen, phys_bof;
-	/*
-	 * Read the on-disk length into a real uint64_t through a char
-	 * pointer.  Casting a char array to uint64_t * would give no
-	 * alignment guarantee for the subsequent load.
-	 */
-	uint64_t raw_len;
-	char *buf = (char *)&raw_len;
-	int err;
-
-	phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
-	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
-
-	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
-	    buf)) != 0)
-		return (err);
-	if (firstread != sizeof (reclen)) {
-		/* the prefix wrapped: fetch the rest from the ring start */
-		if ((err = dmu_read(mos, spa->spa_history,
-		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
-		    buf + firstread)) != 0)
-			return (err);
-	}
-
-	reclen = LE_64(raw_len);
-	shpp->sh_bof += reclen + sizeof (reclen);
-	shpp->sh_records_lost++;
-	return (0);
-}
-
-/*
- * Append 'len' bytes from 'buf' to the history log at the logical EOF,
- * wrapping to offset sh_pool_create_len when the physical end of the log is
- * reached.  Old records are dropped (by advancing sh_bof) until the free
- * space in the ring exceeds 'len'.  The caller must hold spa_history_lock
- * (asserted).  Returns 0, or the error from spa_history_advance_bof().
- */
-static int
-spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
-    dmu_tx_t *tx)
-{
-	objset_t *mos = spa->spa_meta_objset;
-	uint64_t phys_eof, head, tail;
-	int err;
-
-	ASSERT(MUTEX_HELD(&spa->spa_history_lock));
-
-	/* see if we need to reset logical BOF */
-	for (;;) {
-		uint64_t ring_len =
-		    shpp->sh_phys_max_off - shpp->sh_pool_create_len;
-		uint64_t in_use = shpp->sh_eof - shpp->sh_bof;
-
-		if (ring_len - in_use > len)
-			break;
-		if ((err = spa_history_advance_bof(spa, shpp)) != 0)
-			return (err);
-	}
-
-	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
-	head = MIN(len, shpp->sh_phys_max_off - phys_eof);
-	shpp->sh_eof += len;
-	dmu_write(mos, spa->spa_history, phys_eof, head, buf, tx);
-
-	tail = len - head;
-	if (tail > 0) {
-		/* write out the rest at the beginning of physical file */
-		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
-		    tail, (char *)buf + head, tx);
-	}
-
-	return (0);
-}
-
-/*
- * Write out a history event.
- *
- * 'arg1' is the spa_t and 'arg2' a history_arg_t; spa_history_log() passes
- * this function to dsl_sync_task_do(). A NULL history string makes this a
- * no-op. Otherwise a record is appended to the history object: the packed
- * length as a little endian uint64_t, followed by an XDR-packed nvlist
- * holding ZPOOL_HIST_TIME and ZPOOL_HIST_CMD.
- *
- * For LOG_CMD_CREATE the log must hold nothing beyond the create area
- * (VERIFY'd); after a successful write 'sh_pool_create_len' grows by the
- * record size and 'sh_bof' is reset to it. An error from
- * spa_history_write() is not reported to the caller (the function is void).
- */
-void
-spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
-{
- spa_t *spa = arg1;
- history_arg_t *hap = arg2;
- const char *history_str = hap->ha_history_str;
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *dbp;
- spa_history_phys_t *shpp;
- size_t reclen;
- uint64_t le_len;
- nvlist_t *nvrecord;
- char *record_packed = NULL;
- int ret;
-
- if (history_str == NULL)
- return;
-
- /*
- * If we have an older pool that doesn't have a command
- * history object, create it now.
- */
- mutex_enter(&spa->spa_history_lock);
- if (!spa->spa_history)
- spa_history_create_obj(spa, tx);
- mutex_exit(&spa->spa_history_lock);
-
- /*
- * Get the offset of where we need to write via the bonus buffer.
- * Update the offset when the write completes.
- */
- VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
- shpp = dbp->db_data;
-
- dmu_buf_will_dirty(dbp, tx);
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbp, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
- }
-#endif
-
- /* construct a nvlist of the current time and cmd string */
- VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
- gethrestime_sec()) == 0);
- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, history_str) == 0);
- VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
- NV_ENCODE_XDR, KM_SLEEP) == 0);
-
- mutex_enter(&spa->spa_history_lock);
- if (hap->ha_log_type == LOG_CMD_CREATE)
- VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
-
- /* write out the packed length as little endian */
- le_len = LE_64((uint64_t)reclen);
- ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
- if (!ret)
- ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
-
- if (!ret && hap->ha_log_type == LOG_CMD_CREATE) {
- shpp->sh_pool_create_len += sizeof (le_len) + reclen;
- shpp->sh_bof = shpp->sh_pool_create_len;
- }
-
- mutex_exit(&spa->spa_history_lock);
- nvlist_free(nvrecord);
- kmem_free(record_packed, reclen);
- dmu_buf_rele(dbp, FTAG);
-}
-
-/*
- * Log command string 'history_str' to the pool's history by running
- * spa_history_log_sync() as a sync task.  A nonzero 'pool_create' marks the
- * record as the pool-creation record.  Returns the result of
- * dsl_sync_task_do().
- */
-int
-spa_history_log(spa_t *spa, const char *history_str, uint64_t pool_create)
-{
-	history_arg_t ha;
-
-	if (pool_create)
-		ha.ha_log_type = LOG_CMD_CREATE;
-	else
-		ha.ha_log_type = LOG_CMD_NO_CREATE;
-	ha.ha_history_str = history_str;
-
-	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
-	    spa, &ha, 0));
-}
-
-/*
- * Read out the command history.
- *
- * On entry '*offp' is the logical offset to start reading from and '*len'
- * the maximum number of bytes to read into 'buf'. On return '*offp' has been
- * advanced to the offset the consumer should use next and '*len' holds the
- * number of bytes read; both are updated before the data is read, so they
- * are set even if dmu_read() then fails.
- *
- * Returns ENOENT when the pool has no history object, the error from
- * dmu_bonus_hold() or dmu_read() on failure, and 0 otherwise (including the
- * case where there is nothing to read).
- */
-int
-spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
-{
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *dbp;
- uint64_t read_len, phys_read_off, phys_eof;
- uint64_t leftover = 0;
- spa_history_phys_t *shpp;
- int err;
-
- /*
- * If the command history doesn't exist (older pool),
- * that's ok, just return ENOENT.
- */
- if (!spa->spa_history)
- return (ENOENT);
-
- if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
- return (err);
- shpp = dbp->db_data;
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(dbp, &doi);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
- }
-#endif
-
- mutex_enter(&spa->spa_history_lock);
- phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
-
- if (*offp < shpp->sh_pool_create_len) {
- /* read in just the zpool create history */
- phys_read_off = *offp;
- read_len = MIN(*len, shpp->sh_pool_create_len -
- phys_read_off);
- } else {
- /*
- * Need to reset passed in offset to BOF if the passed in
- * offset has since been overwritten.
- */
- *offp = MAX(*offp, shpp->sh_bof);
- phys_read_off = spa_history_log_to_phys(*offp, shpp);
-
- /*
- * Read up to the minimum of what the user passed down or
- * the EOF (physical or logical). If we hit physical EOF,
- * use 'leftover' to read from the physical BOF.
- */
- if (phys_read_off <= phys_eof) {
- read_len = MIN(*len, phys_eof - phys_read_off);
- } else {
- read_len = MIN(*len,
- shpp->sh_phys_max_off - phys_read_off);
- if (phys_read_off + *len > shpp->sh_phys_max_off) {
- leftover = MIN(*len - read_len,
- phys_eof - shpp->sh_pool_create_len);
- }
- }
- }
-
- /* offset for consumer to use next */
- *offp += read_len + leftover;
-
- /* tell the consumer how much you actually read */
- *len = read_len + leftover;
-
- if (read_len == 0) {
- mutex_exit(&spa->spa_history_lock);
- dmu_buf_rele(dbp, FTAG);
- return (0);
- }
-
- err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf);
- if (leftover && err == 0) { /* wrapped: second read starts where the ring begins */
- err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
- leftover, buf + read_len);
- }
- mutex_exit(&spa->spa_history_lock);
-
- dmu_buf_rele(dbp, FTAG);
- return (err);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
deleted file mode 100644
index 5da1f96..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ /dev/null
@@ -1,1130 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/zap.h>
-#include <sys/zil.h>
-#include <sys/vdev_impl.h>
-#include <sys/metaslab.h>
-#include <sys/uberblock_impl.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/unique.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_prop.h>
-#include <sys/fs/zfs.h>
-
-/*
- * SPA locking
- *
- * There are four basic locks for managing spa_t structures:
- *
- * spa_namespace_lock (global mutex)
- *
- * This lock must be acquired to do any of the following:
- *
- * - Lookup a spa_t by name
- * - Add or remove a spa_t from the namespace
- * - Increase spa_refcount from non-zero
- * - Check if spa_refcount is zero
- * - Rename a spa_t
- * - add/remove/attach/detach devices
- * - Held for the duration of create/destroy/import/export
- *
- * It does not need to handle recursion. A create or destroy may
- * reference objects (files or zvols) in other pools, but by
- * definition they must have an existing reference, and will never need
- * to lookup a spa_t by name.
- *
- * spa_refcount (per-spa refcount_t protected by mutex)
- *
- * This reference count keep track of any active users of the spa_t. The
- * spa_t cannot be destroyed or freed while this is non-zero. Internally,
- * the refcount is never really 'zero' - opening a pool implicitly keeps
- * some references in the DMU. Internally we check against SPA_MINREF, but
- * present the image of a zero/non-zero value to consumers.
- *
- * spa_config_lock (per-spa crazy rwlock)
- *
- * This SPA special is a recursive rwlock, capable of being acquired from
- * asynchronous threads. It has protects the spa_t from config changes,
- * and must be held in the following circumstances:
- *
- * - RW_READER to perform I/O to the spa
- * - RW_WRITER to change the vdev config
- *
- * spa_config_cache_lock (per-spa mutex)
- *
- * This mutex prevents the spa_config nvlist from being updated. No
- * other locks are required to obtain this lock, although implicitly you
- * must have the namespace lock or non-zero refcount to have any kind
- * of spa_t pointer at all.
- *
- * The locking order is fairly straightforward:
- *
- * spa_namespace_lock -> spa_refcount
- *
- * The namespace lock must be acquired to increase the refcount from 0
- * or to check if it is zero.
- *
- * spa_refcount -> spa_config_lock
- *
- * There must be at least one valid reference on the spa_t to acquire
- * the config lock.
- *
- * spa_namespace_lock -> spa_config_lock
- *
- * The namespace lock must always be taken before the config lock.
- *
- *
- * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
- * are globally visible.
- *
- * The namespace is manipulated using the following functions, all which require
- * the spa_namespace_lock to be held.
- *
- * spa_lookup() Lookup a spa_t by name.
- *
- * spa_add() Create a new spa_t in the namespace.
- *
- * spa_remove() Remove a spa_t from the namespace. This also
- * frees up any memory associated with the spa_t.
- *
- * spa_next() Returns the next spa_t in the system, or the
- * first if NULL is passed.
- *
- * spa_evict_all() Shutdown and remove all spa_t structures in
- * the system.
- *
- * spa_guid_exists() Determine whether a pool/device guid exists.
- *
- * The spa_refcount is manipulated using the following functions:
- *
- * spa_open_ref() Adds a reference to the given spa_t. Must be
- * called with spa_namespace_lock held if the
- * refcount is currently zero.
- *
- * spa_close() Remove a reference from the spa_t. This will
- * not free the spa_t or remove it from the
- * namespace. No locking is required.
- *
- * spa_refcount_zero() Returns true if the refcount is currently
- * zero. Must be called with spa_namespace_lock
- * held.
- *
- * The spa_config_lock is manipulated using the following functions:
- *
- * spa_config_enter() Acquire the config lock as RW_READER or
- * RW_WRITER. At least one reference on the spa_t
- * must exist.
- *
- * spa_config_exit() Release the config lock.
- *
- * spa_config_held() Returns true if the config lock is currently
- * held in the given state.
- *
- * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
- *
- * spa_vdev_enter() Acquire the namespace lock and the config lock
- * for writing.
- *
- * spa_vdev_exit() Release the config lock, wait for all I/O
- * to complete, sync the updated configs to the
- * cache, and release the namespace lock.
- *
- * The spa_name() function also requires either the spa_namespace_lock
- * or the spa_config_lock, as both are needed to do a rename. spa_rename() is
- * also implemented within this file since is requires manipulation of the
- * namespace.
- */
-
-static avl_tree_t spa_namespace_avl;
-kmutex_t spa_namespace_lock;
-static kcondvar_t spa_namespace_cv;
-static int spa_active_count;
-int spa_max_replication_override = SPA_DVAS_PER_BP;
-
-static kmutex_t spa_spare_lock;
-static avl_tree_t spa_spare_avl;
-
-kmem_cache_t *spa_buffer_pool;
-int spa_mode;
-
-#ifdef ZFS_DEBUG
-int zfs_flags = ~0;
-#else
-int zfs_flags = 0;
-#endif
-
-/*
- * zfs_recover can be set to nonzero to attempt to recover from
- * otherwise-fatal errors, typically caused by on-disk corruption. When
- * set, calls to zfs_panic_recover() will turn into warning messages.
- */
-int zfs_recover = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
- "Try to recover from otherwise-fatal errors.");
-
-#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
-
-/*
- * ==========================================================================
- * SPA namespace functions
- * ==========================================================================
- */
-
-/*
- * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
- * Returns NULL if no matching spa_t is found.
- */
-spa_t *
-spa_lookup(const char *name)
-{
- spa_t search, *spa;
- avl_index_t where;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- search.spa_name = (char *)name;
- spa = avl_find(&spa_namespace_avl, &search, &where);
-
- return (spa);
-}
-
-/*
- * Create an uninitialized spa_t with the given name. Requires
- * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
- * exist by calling spa_lookup() first.
- */
-spa_t *
-spa_add(const char *name, const char *altroot)
-{
- spa_t *spa;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
-
- spa->spa_name = spa_strdup(name);
- spa->spa_state = POOL_STATE_UNINITIALIZED;
- spa->spa_freeze_txg = UINT64_MAX;
- spa->spa_final_txg = UINT64_MAX;
-
- mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
-
- cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
-
- refcount_create(&spa->spa_refcount);
- refcount_create(&spa->spa_config_lock.scl_count);
-
- avl_add(&spa_namespace_avl, spa);
-
- /*
- * Set the alternate root, if there is one.
- */
- if (altroot) {
- spa->spa_root = spa_strdup(altroot);
- spa_active_count++;
- }
-
- return (spa);
-}
-
-/*
- * Removes a spa_t from the namespace, freeing up any memory used. Requires
- * spa_namespace_lock. This is called only after the spa_t has been closed and
- * deactivated.
- */
-void
-spa_remove(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
- ASSERT(spa->spa_scrub_thread == NULL);
-
- avl_remove(&spa_namespace_avl, spa);
- cv_broadcast(&spa_namespace_cv);
-
- if (spa->spa_root) {
- spa_strfree(spa->spa_root);
- spa_active_count--;
- }
-
- if (spa->spa_name)
- spa_strfree(spa->spa_name);
-
- spa_config_set(spa, NULL);
-
- refcount_destroy(&spa->spa_refcount);
- refcount_destroy(&spa->spa_config_lock.scl_count);
-
- cv_destroy(&spa->spa_async_cv);
- cv_destroy(&spa->spa_scrub_io_cv);
- cv_destroy(&spa->spa_scrub_cv);
-
- mutex_destroy(&spa->spa_scrub_lock);
- mutex_destroy(&spa->spa_async_lock);
- mutex_destroy(&spa->spa_config_cache_lock);
-
- kmem_free(spa, sizeof (spa_t));
-}
-
-/*
- * Given a pool, return the next pool in the namespace, or NULL if there is
- * none. If 'prev' is NULL, return the first pool.
- */
-spa_t *
-spa_next(spa_t *prev)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- if (prev)
- return (AVL_NEXT(&spa_namespace_avl, prev));
- else
- return (avl_first(&spa_namespace_avl));
-}
-
-/*
- * ==========================================================================
- * SPA refcount functions
- * ==========================================================================
- */
-
-/*
- * Add a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
- */
-void
-spa_open_ref(spa_t *spa, void *tag)
-{
- ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
- MUTEX_HELD(&spa_namespace_lock));
-
- (void) refcount_add(&spa->spa_refcount, tag);
-}
-
-/*
- * Remove a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
- */
-void
-spa_close(spa_t *spa, void *tag)
-{
- ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
- MUTEX_HELD(&spa_namespace_lock));
-
- (void) refcount_remove(&spa->spa_refcount, tag);
-}
-
-/*
- * Check to see if the spa refcount is zero. Must be called with
- * spa_namespace_lock held. We really compare against SPA_MINREF, which is the
- * number of references acquired when opening a pool
- */
-boolean_t
-spa_refcount_zero(spa_t *spa)
-{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
-}
-
-/*
- * ==========================================================================
- * SPA spare tracking
- * ==========================================================================
- */
-
-/*
- * Spares are tracked globally due to the following constraints:
- *
- * - A spare may be part of multiple pools.
- * - A spare may be added to a pool even if it's actively in use within
- * another pool.
- * - A spare in use in any pool can only be the source of a replacement if
- * the target is a spare in the same pool.
- *
- * We keep track of all spares on the system through the use of a reference
- * counted AVL tree. When a vdev is added as a spare, or used as a replacement
- * spare, then we bump the reference count in the AVL tree. In addition, we set
- * the 'vdev_isspare' member to indicate that the device is a spare (active or
- * inactive). When a spare is made active (used to replace a device in the
- * pool), we also keep track of which pool its been made a part of.
- *
- * The 'spa_spare_lock' protects the AVL tree. These functions are normally
- * called under the spa_namespace lock as part of vdev reconfiguration. The
- * separate spare lock exists for the status query path, which does not need to
- * be completely consistent with respect to other vdev configuration changes.
- */
-
-typedef struct spa_spare {
- uint64_t spare_guid;
- uint64_t spare_pool;
- avl_node_t spare_avl;
- int spare_count;
-} spa_spare_t;
-
-static int
-spa_spare_compare(const void *a, const void *b)
-{
- const spa_spare_t *sa = a;
- const spa_spare_t *sb = b;
-
- if (sa->spare_guid < sb->spare_guid)
- return (-1);
- else if (sa->spare_guid > sb->spare_guid)
- return (1);
- else
- return (0);
-}
-
-void
-spa_spare_add(vdev_t *vd)
-{
- avl_index_t where;
- spa_spare_t search;
- spa_spare_t *spare;
-
- mutex_enter(&spa_spare_lock);
- ASSERT(!vd->vdev_isspare);
-
- search.spare_guid = vd->vdev_guid;
- if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) {
- spare->spare_count++;
- } else {
- spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP);
- spare->spare_guid = vd->vdev_guid;
- spare->spare_count = 1;
- avl_insert(&spa_spare_avl, spare, where);
- }
- vd->vdev_isspare = B_TRUE;
-
- mutex_exit(&spa_spare_lock);
-}
-
-void
-spa_spare_remove(vdev_t *vd)
-{
- spa_spare_t search;
- spa_spare_t *spare;
- avl_index_t where;
-
- mutex_enter(&spa_spare_lock);
-
- search.spare_guid = vd->vdev_guid;
- spare = avl_find(&spa_spare_avl, &search, &where);
-
- ASSERT(vd->vdev_isspare);
- ASSERT(spare != NULL);
-
- if (--spare->spare_count == 0) {
- avl_remove(&spa_spare_avl, spare);
- kmem_free(spare, sizeof (spa_spare_t));
- } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) {
- spare->spare_pool = 0ULL;
- }
-
- vd->vdev_isspare = B_FALSE;
- mutex_exit(&spa_spare_lock);
-}
-
-boolean_t
-spa_spare_exists(uint64_t guid, uint64_t *pool)
-{
- spa_spare_t search, *found;
- avl_index_t where;
-
- mutex_enter(&spa_spare_lock);
-
- search.spare_guid = guid;
- found = avl_find(&spa_spare_avl, &search, &where);
-
- if (pool) {
- if (found)
- *pool = found->spare_pool;
- else
- *pool = 0ULL;
- }
-
- mutex_exit(&spa_spare_lock);
-
- return (found != NULL);
-}
-
-void
-spa_spare_activate(vdev_t *vd)
-{
- spa_spare_t search, *found;
- avl_index_t where;
-
- mutex_enter(&spa_spare_lock);
- ASSERT(vd->vdev_isspare);
-
- search.spare_guid = vd->vdev_guid;
- found = avl_find(&spa_spare_avl, &search, &where);
- ASSERT(found != NULL);
- ASSERT(found->spare_pool == 0ULL);
-
- found->spare_pool = spa_guid(vd->vdev_spa);
- mutex_exit(&spa_spare_lock);
-}
-
-/*
- * ==========================================================================
- * SPA config locking
- * ==========================================================================
- */
-
-/*
- * Acquire the config lock. The config lock is a special rwlock that allows for
- * recursive enters. Because these enters come from the same thread as well as
- * asynchronous threads working on behalf of the owner, we must unilaterally
- * allow all reads access as long at least one reader is held (even if a write
- * is requested). This has the side effect of write starvation, but write locks
- * are extremely rare, and a solution to this problem would be significantly
- * more complex (if even possible).
- *
- * We would like to assert that the namespace lock isn't held, but this is a
- * valid use during create.
- */
-void
-spa_config_enter(spa_t *spa, krw_t rw, void *tag)
-{
- spa_config_lock_t *scl = &spa->spa_config_lock;
-
- mutex_enter(&scl->scl_lock);
-
- if (scl->scl_writer != curthread) {
- if (rw == RW_READER) {
- while (scl->scl_writer != NULL)
- cv_wait(&scl->scl_cv, &scl->scl_lock);
- } else {
- while (scl->scl_writer != NULL ||
- !refcount_is_zero(&scl->scl_count))
- cv_wait(&scl->scl_cv, &scl->scl_lock);
- scl->scl_writer = curthread;
- }
- }
-
- (void) refcount_add(&scl->scl_count, tag);
-
- mutex_exit(&scl->scl_lock);
-}
-
-/*
- * Release the spa config lock, notifying any waiters in the process.
- */
-void
-spa_config_exit(spa_t *spa, void *tag)
-{
- spa_config_lock_t *scl = &spa->spa_config_lock;
-
- mutex_enter(&scl->scl_lock);
-
- ASSERT(!refcount_is_zero(&scl->scl_count));
- if (refcount_remove(&scl->scl_count, tag) == 0) {
- cv_broadcast(&scl->scl_cv);
- scl->scl_writer = NULL; /* OK in either case */
- }
-
- mutex_exit(&scl->scl_lock);
-}
-
-/*
- * Returns true if the config lock is held in the given manner.
- */
-boolean_t
-spa_config_held(spa_t *spa, krw_t rw)
-{
- spa_config_lock_t *scl = &spa->spa_config_lock;
- boolean_t held;
-
- mutex_enter(&scl->scl_lock);
- if (rw == RW_WRITER)
- held = (scl->scl_writer == curthread);
- else
- held = !refcount_is_zero(&scl->scl_count);
- mutex_exit(&scl->scl_lock);
-
- return (held);
-}
-
-/*
- * ==========================================================================
- * SPA vdev locking
- * ==========================================================================
- */
-
-/*
- * Lock the given spa_t for the purpose of adding or removing a vdev.
- * Grabs the global spa_namespace_lock plus the spa config lock for writing.
- * It returns the next transaction group for the spa_t.
- */
-uint64_t
-spa_vdev_enter(spa_t *spa)
-{
- /*
- * Suspend scrub activity while we mess with the config.
- */
- spa_scrub_suspend(spa);
-
- mutex_enter(&spa_namespace_lock);
-
- spa_config_enter(spa, RW_WRITER, spa);
-
- return (spa_last_synced_txg(spa) + 1);
-}
-
-/*
- * Unlock the spa_t after adding or removing a vdev. Besides undoing the
- * locking of spa_vdev_enter(), we also want make sure the transactions have
- * synced to disk, and then update the global configuration cache with the new
- * information.
- */
-int
-spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
-{
- int config_changed = B_FALSE;
-
- ASSERT(txg > spa_last_synced_txg(spa));
-
- /*
- * Reassess the DTLs.
- */
- vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
-
- /*
- * If the config changed, notify the scrub thread that it must restart.
- */
- if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
- config_changed = B_TRUE;
- spa_scrub_restart(spa, txg);
- }
-
- spa_config_exit(spa, spa);
-
- /*
- * Allow scrubbing to resume.
- */
- spa_scrub_resume(spa);
-
- /*
- * Note: this txg_wait_synced() is important because it ensures
- * that there won't be more than one config change per txg.
- * This allows us to use the txg as the generation number.
- */
- if (error == 0)
- txg_wait_synced(spa->spa_dsl_pool, txg);
-
- if (vd != NULL) {
- ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
- vdev_free(vd);
- }
-
- /*
- * If the config changed, update the config cache.
- */
- if (config_changed)
- spa_config_sync();
-
- mutex_exit(&spa_namespace_lock);
-
- return (error);
-}
-
-/*
- * ==========================================================================
- * Miscellaneous functions
- * ==========================================================================
- */
-
-/*
- * Rename a spa_t.
- */
-int
-spa_rename(const char *name, const char *newname)
-{
- spa_t *spa;
- int err;
-
- /*
- * Lookup the spa_t and grab the config lock for writing. We need to
- * actually open the pool so that we can sync out the necessary labels.
- * It's OK to call spa_open() with the namespace lock held because we
- * allow recursive calls for other reasons.
- */
- mutex_enter(&spa_namespace_lock);
- if ((err = spa_open(name, &spa, FTAG)) != 0) {
- mutex_exit(&spa_namespace_lock);
- return (err);
- }
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- avl_remove(&spa_namespace_avl, spa);
- spa_strfree(spa->spa_name);
- spa->spa_name = spa_strdup(newname);
- avl_add(&spa_namespace_avl, spa);
-
- /*
- * Sync all labels to disk with the new names by marking the root vdev
- * dirty and waiting for it to sync. It will pick up the new pool name
- * during the sync.
- */
- vdev_config_dirty(spa->spa_root_vdev);
-
- spa_config_exit(spa, FTAG);
-
- txg_wait_synced(spa->spa_dsl_pool, 0);
-
- /*
- * Sync the updated config cache.
- */
- spa_config_sync();
-
- spa_close(spa, FTAG);
-
- mutex_exit(&spa_namespace_lock);
-
- return (0);
-}
-
-
-/*
- * Determine whether a pool with given pool_guid exists. If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
- */
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
-{
- spa_t *spa;
- avl_tree_t *t = &spa_namespace_avl;
-
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
- for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
- if (spa->spa_state == POOL_STATE_UNINITIALIZED)
- continue;
- if (spa->spa_root_vdev == NULL)
- continue;
- if (spa_guid(spa) == pool_guid) {
- if (device_guid == 0)
- break;
-
- if (vdev_lookup_by_guid(spa->spa_root_vdev,
- device_guid) != NULL)
- break;
-
- /*
- * Check any devices we may in the process of adding.
- */
- if (spa->spa_pending_vdev) {
- if (vdev_lookup_by_guid(spa->spa_pending_vdev,
- device_guid) != NULL)
- break;
- }
- }
- }
-
- return (spa != NULL);
-}
-
-char *
-spa_strdup(const char *s)
-{
- size_t len;
- char *new;
-
- len = strlen(s);
- new = kmem_alloc(len + 1, KM_SLEEP);
- bcopy(s, new, len);
- new[len] = '\0';
-
- return (new);
-}
-
-void
-spa_strfree(char *s)
-{
- kmem_free(s, strlen(s) + 1);
-}
-
-uint64_t
-spa_get_random(uint64_t range)
-{
- uint64_t r;
-
- ASSERT(range != 0);
-
- (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
-
- return (r % range);
-}
-
-void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
-{
- int d;
-
- if (bp == NULL) {
- (void) snprintf(buf, len, "<NULL>");
- return;
- }
-
- if (BP_IS_HOLE(bp)) {
- (void) snprintf(buf, len, "<hole>");
- return;
- }
-
- (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
- (u_longlong_t)BP_GET_LEVEL(bp),
- dmu_ot[BP_GET_TYPE(bp)].ot_name,
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp));
-
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- const dva_t *dva = &bp->blk_dva[d];
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "DVA[%d]=<%llu:%llx:%llx> ", d,
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
- (u_longlong_t)DVA_GET_ASIZE(dva));
- }
-
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
- BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
- BP_IS_GANG(bp) ? "gang" : "contiguous",
- (u_longlong_t)bp->blk_birth,
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
-}
-
-void
-spa_freeze(spa_t *spa)
-{
- uint64_t freeze_txg = 0;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
- if (spa->spa_freeze_txg == UINT64_MAX) {
- freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
- spa->spa_freeze_txg = freeze_txg;
- }
- spa_config_exit(spa, FTAG);
- if (freeze_txg != 0)
- txg_wait_synced(spa_get_dsl(spa), freeze_txg);
-}
-
-void
-zfs_panic_recover(const char *fmt, ...)
-{
- va_list adx;
-
- va_start(adx, fmt);
- vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
- va_end(adx);
-}
-
-/*
- * ==========================================================================
- * Accessor functions
- * ==========================================================================
- */
-
-krwlock_t *
-spa_traverse_rwlock(spa_t *spa)
-{
- return (&spa->spa_traverse_lock);
-}
-
-int
-spa_traverse_wanted(spa_t *spa)
-{
- return (spa->spa_traverse_wanted);
-}
-
-dsl_pool_t *
-spa_get_dsl(spa_t *spa)
-{
- return (spa->spa_dsl_pool);
-}
-
-blkptr_t *
-spa_get_rootblkptr(spa_t *spa)
-{
- return (&spa->spa_ubsync.ub_rootbp);
-}
-
-void
-spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
-{
- spa->spa_uberblock.ub_rootbp = *bp;
-}
-
-void
-spa_altroot(spa_t *spa, char *buf, size_t buflen)
-{
- if (spa->spa_root == NULL)
- buf[0] = '\0';
- else
- (void) strncpy(buf, spa->spa_root, buflen);
-}
-
-int
-spa_sync_pass(spa_t *spa)
-{
- return (spa->spa_sync_pass);
-}
-
-char *
-spa_name(spa_t *spa)
-{
- /*
- * Accessing the name requires holding either the namespace lock or the
- * config lock, both of which are required to do a rename.
- */
- ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
- spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
-
- return (spa->spa_name);
-}
-
-uint64_t
-spa_guid(spa_t *spa)
-{
- /*
- * If we fail to parse the config during spa_load(), we can go through
- * the error path (which posts an ereport) and end up here with no root
- * vdev. We stash the original pool guid in 'spa_load_guid' to handle
- * this case.
- */
- if (spa->spa_root_vdev != NULL)
- return (spa->spa_root_vdev->vdev_guid);
- else
- return (spa->spa_load_guid);
-}
-
-uint64_t
-spa_last_synced_txg(spa_t *spa)
-{
- return (spa->spa_ubsync.ub_txg);
-}
-
-uint64_t
-spa_first_txg(spa_t *spa)
-{
- return (spa->spa_first_txg);
-}
-
-int
-spa_state(spa_t *spa)
-{
- return (spa->spa_state);
-}
-
-uint64_t
-spa_freeze_txg(spa_t *spa)
-{
- return (spa->spa_freeze_txg);
-}
-
-/*
- * In the future, this may select among different metaslab classes
- * depending on the zdp. For now, there's no such distinction.
- */
-metaslab_class_t *
-spa_metaslab_class_select(spa_t *spa)
-{
- return (spa->spa_normal_class);
-}
-
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
-uint64_t
-spa_get_alloc(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_alloc);
-}
-
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
-uint64_t
-spa_get_space(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
-uint64_t
-spa_get_dspace(spa_t *spa)
-{
- if (spa->spa_deflate)
- return (spa->spa_root_vdev->vdev_stat.vs_dspace);
- else
- return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/* ARGSUSED */
-uint64_t
-spa_get_asize(spa_t *spa, uint64_t lsize)
-{
- /*
- * For now, the worst case is 512-byte RAID-Z blocks, in which
- * case the space requirement is exactly 2x; so just assume that.
- * Add to this the fact that we can have up to 3 DVAs per bp, and
- * we have to multiply by a total of 6x.
- */
- return (lsize * 6);
-}
-
-uint64_t
-spa_version(spa_t *spa)
-{
- return (spa->spa_ubsync.ub_version);
-}
-
-int
-spa_max_replication(spa_t *spa)
-{
- /*
- * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
- * handle BPs with more than one DVA allocated. Set our max
- * replication level accordingly.
- */
- if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
- return (1);
- return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
-}
-
-uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
-{
- int sz = 0, i;
-
- if (!spa->spa_deflate)
- return (BP_GET_ASIZE(bp));
-
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- vdev_t *vd =
- vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
- }
- return (sz);
-}
-
-/*
- * ==========================================================================
- * Initialization and Termination
- * ==========================================================================
- */
-
-static int
-spa_name_compare(const void *a1, const void *a2)
-{
- const spa_t *s1 = a1;
- const spa_t *s2 = a2;
- int s;
-
- s = strcmp(s1->spa_name, s2->spa_name);
- if (s > 0)
- return (1);
- if (s < 0)
- return (-1);
- return (0);
-}
-
-int
-spa_busy(void)
-{
- return (spa_active_count);
-}
-
-void
-spa_init(int mode)
-{
- mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
-
- avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
- offsetof(spa_t, spa_avl));
-
- mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
- offsetof(spa_spare_t, spare_avl));
-
- spa_mode = mode;
-
- refcount_init();
- unique_init();
- zio_init();
- dmu_init();
- zil_init();
- spa_config_load();
-}
-
-void
-spa_fini(void)
-{
- spa_evict_all();
-
- zil_fini();
- dmu_fini();
- zio_fini();
- refcount_fini();
-
- avl_destroy(&spa_namespace_avl);
- avl_destroy(&spa_spare_avl);
-
- cv_destroy(&spa_namespace_cv);
- mutex_destroy(&spa_namespace_lock);
- mutex_destroy(&spa_spare_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c
deleted file mode 100644
index 23313a9..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ /dev/null
@@ -1,501 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-#include <sys/space_map.h>
-
-/*
- * Space map routines.
- * NOTE: caller is responsible for all locking.
- */
-/*
- * AVL comparator for space segments.  Two segments compare equal when
- * they begin at the same offset or when their [ss_start, ss_end) ranges
- * overlap; otherwise they are ordered by ss_start.
- */
-static int
-space_map_seg_compare(const void *x1, const void *x2)
-{
-	const space_seg_t *lhs = x1;
-	const space_seg_t *rhs = x2;
-
-	/* lhs begins first: equal only if it runs into rhs. */
-	if (lhs->ss_start < rhs->ss_start)
-		return (lhs->ss_end > rhs->ss_start ? 0 : -1);
-
-	/* lhs begins later: equal only if it begins inside rhs. */
-	if (lhs->ss_start > rhs->ss_start)
-		return (lhs->ss_start < rhs->ss_end ? 0 : 1);
-
-	return (0);
-}
-
-/*
- * Initialize a caller-supplied space map.
- *
- * The structure is zeroed, then given its range (start, size), its
- * granularity shift and the lock pointer 'lp'.  The segment AVL tree is
- * created with space_map_seg_compare as its comparator and the load
- * condition variable is initialized.
- */
-void
-space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
-    kmutex_t *lp)
-{
-	/* Start from an all-zero structure before filling anything in. */
-	bzero(sm, sizeof (*sm));
-
-	sm->sm_lock = lp;
-	sm->sm_shift = shift;
-	sm->sm_size = size;
-	sm->sm_start = start;
-
-	avl_create(&sm->sm_root, space_map_seg_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
-	cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
-}
-
-/*
- * Release the AVL tree and condition variable owned by a space map.
- *
- * The map must be neither loaded nor loading (ASSERT) and must already be
- * empty: sm_space is VERIFY'd to be zero before anything is destroyed.
- */
-void
-space_map_destroy(space_map_t *sm)
-{
- ASSERT(!sm->sm_loaded && !sm->sm_loading);
- VERIFY3U(sm->sm_space, ==, 0);
- avl_destroy(&sm->sm_root);
- cv_destroy(&sm->sm_load_cv);
-}
-
-/*
- * Add the range [start, start + size) to the space map.
- *
- * The caller must hold sm_lock.  The range must be non-empty, aligned to
- * 1 << sm_shift, lie within [sm_start, sm_start + sm_size] and must not
- * push sm_space past sm_size; each of these is VERIFY'd.
- *
- * If an existing segment already covers the whole range,
- * zfs_panic_recover() is called and the map is left unchanged.  Any other
- * overlap with an existing segment trips a VERIFY.  Otherwise the range is
- * merged with an abutting predecessor and/or successor, or inserted as a
- * new segment, and sm_space grows by 'size'.
- */
-void
-space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_index_t where;
-	space_seg_t ssearch, *ss_before, *ss_after, *ss;
-	uint64_t end = start + size;
-	int merge_before, merge_after;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(size != 0);
-	VERIFY3U(start, >=, sm->sm_start);
-	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
-	VERIFY(sm->sm_space + size <= sm->sm_size);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	ssearch.ss_start = start;
-	ssearch.ss_end = end;
-	ss = avl_find(&sm->sm_root, &ssearch, &where);
-
-	if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
-		/*
-		 * The first literal ends with a space because adjacent
-		 * string literals are concatenated verbatim; without it the
-		 * message reads "segment(offset=...".
-		 */
-		zfs_panic_recover("zfs: allocating allocated segment "
-		    "(offset=%llu size=%llu)\n",
-		    (longlong_t)start, (longlong_t)size);
-		return;
-	}
-
-	/* Make sure we don't overlap with either of our neighbors */
-	VERIFY(ss == NULL);
-
-	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
-	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
-
-	merge_before = (ss_before != NULL && ss_before->ss_end == start);
-	merge_after = (ss_after != NULL && ss_after->ss_start == end);
-
-	if (merge_before && merge_after) {
-		/*
-		 * The new range bridges the gap between both neighbors:
-		 * drop the predecessor and stretch the successor back to
-		 * the predecessor's start.
-		 */
-		avl_remove(&sm->sm_root, ss_before);
-		ss_after->ss_start = ss_before->ss_start;
-		kmem_free(ss_before, sizeof (*ss_before));
-	} else if (merge_before) {
-		ss_before->ss_end = end;
-	} else if (merge_after) {
-		ss_after->ss_start = start;
-	} else {
-		/* No abutting neighbor: insert at the slot avl_find() gave. */
-		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
-		ss->ss_start = start;
-		ss->ss_end = end;
-		avl_insert(&sm->sm_root, ss, where);
-	}
-
-	sm->sm_space += size;
-}
-
-/*
- * Remove the range [start, start + size) from the space map.
- *
- * The caller must hold sm_lock.  The range must be non-empty and aligned
- * to 1 << sm_shift (VERIFY'd).  If no segment matches the range,
- * zfs_panic_recover() is called and the map is left unchanged.  The
- * matching segment must cover the whole range (VERIFY'd); it is then
- * split, trimmed on one side, or removed outright, and sm_space shrinks
- * by 'size'.
- */
-void
-space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_index_t where;
- space_seg_t ssearch, *ss, *newseg;
- uint64_t end = start + size;
- int left_over, right_over;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
- VERIFY(size != 0);
- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
- ssearch.ss_start = start;
- ssearch.ss_end = end;
- ss = avl_find(&sm->sm_root, &ssearch, &where);
-
- /* Make sure we completely overlap with someone */
- if (ss == NULL) {
- zfs_panic_recover("zfs: freeing free segment "
- "(offset=%llu size=%llu)",
- (longlong_t)start, (longlong_t)size);
- return;
- }
- VERIFY3U(ss->ss_start, <=, start);
- VERIFY3U(ss->ss_end, >=, end);
- VERIFY(sm->sm_space - size <= sm->sm_size);
-
- /* Does the segment extend past the removed range on either side? */
- left_over = (ss->ss_start != start);
- right_over = (ss->ss_end != end);
-
- if (left_over && right_over) {
- /* Hole in the middle: keep ss as the left part, add a right part. */
- newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
- newseg->ss_start = end;
- newseg->ss_end = ss->ss_end;
- ss->ss_end = start;
- avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
- } else if (left_over) {
- ss->ss_end = start;
- } else if (right_over) {
- ss->ss_start = end;
- } else {
- /* Exact match: the whole segment goes away. */
- avl_remove(&sm->sm_root, ss);
- kmem_free(ss, sizeof (*ss));
- }
-
- sm->sm_space -= size;
-}
-
-/*
- * Return nonzero if a single segment of the map covers all of
- * [start, start + size), zero otherwise.  The caller must hold sm_lock;
- * the range must be non-empty and aligned to 1 << sm_shift (VERIFY'd).
- */
-int
-space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	space_seg_t key, *hit;
-	avl_index_t where;
-	uint64_t end = start + size;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(size != 0);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	key.ss_start = start;
-	key.ss_end = end;
-	hit = avl_find(&sm->sm_root, &key, &where);
-
-	if (hit == NULL)
-		return (0);
-
-	/* A partial overlap does not count as containment. */
-	if (hit->ss_start > start || hit->ss_end < end)
-		return (0);
-
-	return (1);
-}
-
-/*
- * Empty the space map.  Every segment is taken out of the tree with
- * avl_destroy_nodes(); if 'func' is non-NULL it is called with 'mdest'
- * and the segment's start and length before the segment is freed.
- * sm_space is reset to zero.  The caller must hold sm_lock.
- */
-void
-space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
-	void *cookie = NULL;
-	space_seg_t *seg;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	for (seg = avl_destroy_nodes(&sm->sm_root, &cookie); seg != NULL;
-	    seg = avl_destroy_nodes(&sm->sm_root, &cookie)) {
-		if (func != NULL)
-			func(mdest, seg->ss_start, seg->ss_end - seg->ss_start);
-		kmem_free(seg, sizeof (*seg));
-	}
-
-	sm->sm_space = 0;
-}
-
-/*
- * Call 'func' with 'mdest' and the start and length of every segment in
- * the map, in tree order.  The map itself is not modified here.
- *
- * NOTE(review): unlike most routines in this file, this one does not
- * assert that sm_lock is held -- confirm whether callers rely on that.
- */
-void
-space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
-	avl_tree_t *tree = &sm->sm_root;
-	space_seg_t *seg = avl_first(tree);
-
-	while (seg != NULL) {
-		func(mdest, seg->ss_start, seg->ss_end - seg->ss_start);
-		seg = AVL_NEXT(tree, seg);
-	}
-}
-
-/*
- * Remove from the map whatever parts of [start, start + size) it
- * currently holds.  Segments that only partly fall inside the range are
- * trimmed to the range before removal.  The caller must hold sm_lock.
- */
-void
-space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_tree_t *tree = &sm->sm_root;
-	uint64_t end = start + size;
-	space_seg_t key, *seg;
-	avl_index_t where;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	/*
-	 * A zero-length key at 'start' matches a segment that begins at or
-	 * contains 'start'; if there is none, 'where' marks its slot.
-	 */
-	key.ss_start = start;
-	key.ss_end = start;
-
-	for (;;) {
-		uint64_t cut_start, cut_end;
-
-		seg = avl_find(tree, &key, &where);
-		if (seg == NULL)
-			seg = avl_nearest(tree, where, AVL_AFTER);
-
-		/* Stop once nothing is left that begins before 'end'. */
-		if (seg == NULL || seg->ss_start >= end)
-			break;
-
-		cut_start = MAX(seg->ss_start, start);
-		cut_end = MIN(seg->ss_end, end);
-		space_map_remove(sm, cut_start, cut_end - cut_start);
-	}
-}
-
-/*
- * Make smd the union of smd and sms.  sms is only read.  The caller must
- * hold smd's lock.
- */
-void
-space_map_union(space_map_t *smd, space_map_t *sms)
-{
-	avl_tree_t *src = &sms->sm_root;
-	space_seg_t *seg;
-
-	ASSERT(MUTEX_HELD(smd->sm_lock));
-
-	seg = avl_first(src);
-	while (seg != NULL) {
-		uint64_t seg_start = seg->ss_start;
-		uint64_t seg_size = seg->ss_end - seg_start;
-
-		/*
-		 * Clear any intersection out of the destination first, then
-		 * add the whole source segment.
-		 */
-		space_map_excise(smd, seg_start, seg_size);
-		space_map_add(smd, seg_start, seg_size);
-
-		seg = AVL_NEXT(src, seg);
-	}
-}
-
-/*
- * Block on sm_load_cv until sm_loading is clear.  The caller must hold
- * sm_lock, which is the lock passed to cv_wait().
- */
-void
-space_map_load_wait(space_map_t *sm)
-{
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	for (;;) {
-		if (!sm->sm_loading)
-			break;
-		cv_wait(&sm->sm_load_cv, sm->sm_lock);
-	}
-}
-
-/*
- * Populate an in-core space map from its on-disk object.
- *
- * Waits for any load already in progress and returns 0 at once if the map
- * is already loaded.  Otherwise the object described by 'smo' is read in
- * chunks of 1 << SPACE_MAP_BLOCKSHIFT bytes.  Debug entries are skipped;
- * every other entry is applied with space_map_add() when its type equals
- * 'maptype' and with space_map_remove() when it does not.  For SM_FREE the
- * map starts out holding its whole range.  When the object has been
- * replayed, sm_space must equal the expected total (VERIFY'd), the map is
- * marked loaded, waiters are woken and ops->smop_load() is called if 'ops'
- * is non-NULL.
- *
- * This function always returns 0; a dmu_read() failure trips VERIFY3U.
- *
- * Note: space_map_load() will drop sm_lock across dmu_read() calls.
- * The caller must be OK with this.
- */
-int
-space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os)
-{
- uint64_t *entry, *entry_map, *entry_map_end;
- uint64_t bufsize, size, offset, end, space;
- uint64_t mapstart = sm->sm_start;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- space_map_load_wait(sm);
-
- if (sm->sm_loaded)
- return (0);
-
- sm->sm_loading = B_TRUE;
- end = smo->smo_objsize;
- space = smo->smo_alloc;
-
- ASSERT(sm->sm_ops == NULL);
- VERIFY3U(sm->sm_space, ==, 0);
-
- /* A free map starts full; the expected total is the complement. */
- if (maptype == SM_FREE) {
- space_map_add(sm, sm->sm_start, sm->sm_size);
- space = sm->sm_size - space;
- }
-
- bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
- entry_map = zio_buf_alloc(bufsize);
-
- /* Prefetch everything past the first chunk, with sm_lock dropped. */
- mutex_exit(sm->sm_lock);
- if (end > bufsize)
- dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
- mutex_enter(sm->sm_lock);
-
- for (offset = 0; offset < end; offset += bufsize) {
- size = MIN(end - offset, bufsize);
- VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
- VERIFY(size != 0);
-
- dprintf("object=%llu offset=%llx size=%llx\n",
- smo->smo_object, offset, size);
-
- mutex_exit(sm->sm_lock);
- VERIFY3U(dmu_read(os, smo->smo_object, offset, size,
- entry_map), ==, 0);
- mutex_enter(sm->sm_lock);
-
- entry_map_end = entry_map + (size / sizeof (uint64_t));
- for (entry = entry_map; entry < entry_map_end; entry++) {
- uint64_t e = *entry;
-
- if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
- continue;
-
- /* Same type as the map: add the run; otherwise remove it. */
- (SM_TYPE_DECODE(e) == maptype ?
- space_map_add : space_map_remove)(sm,
- (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
- SM_RUN_DECODE(e) << sm->sm_shift);
- }
- }
- VERIFY3U(sm->sm_space, ==, space);
-
- zio_buf_free(entry_map, bufsize);
-
- sm->sm_loading = B_FALSE;
- sm->sm_loaded = B_TRUE;
- sm->sm_ops = ops;
-
- /* Wake anyone blocked in space_map_load_wait(). */
- cv_broadcast(&sm->sm_load_cv);
-
- if (ops != NULL)
- ops->smop_load(sm);
-
- return (0);
-}
-
-/*
- * Discard the in-core contents of a space map.
- *
- * If the map is loaded and has an ops vector, smop_unload() is called
- * first.  The loaded flag and ops pointer are then cleared and every
- * segment is freed through space_map_vacate().  The caller must hold
- * sm_lock.
- */
-void
-space_map_unload(space_map_t *sm)
-{
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- if (sm->sm_loaded && sm->sm_ops != NULL)
- sm->sm_ops->smop_unload(sm);
-
- sm->sm_loaded = B_FALSE;
- sm->sm_ops = NULL;
-
- space_map_vacate(sm, NULL, NULL);
-}
-
-/*
- * Ask the map's ops vector for 'size' bytes.  smop_alloc() returns the
- * chosen offset, or -1ULL when it has nothing to offer; on success the
- * range is removed from the map.  The offset (or -1ULL) is returned.
- */
-uint64_t
-space_map_alloc(space_map_t *sm, uint64_t size)
-{
-	uint64_t offset = sm->sm_ops->smop_alloc(sm, size);
-
-	if (offset == -1ULL)
-		return (offset);
-
-	space_map_remove(sm, offset, size);
-	return (offset);
-}
-
-/*
- * Claim a specific range: notify the ops vector through smop_claim(),
- * then remove [start, start + size) from the map.
- */
-void
-space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
- sm->sm_ops->smop_claim(sm, start, size);
- space_map_remove(sm, start, size);
-}
-
-/*
- * Return a range to the map: add [start, start + size) first, then
- * notify the ops vector through smop_free().
- */
-void
-space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- space_map_add(sm, start, size);
- sm->sm_ops->smop_free(sm, start, size);
-}
-
-/*
- * Append the contents of an in-core space map to its on-disk object and
- * empty the in-core map.
- *
- * Nothing is done if the map is empty.  Otherwise smo_alloc is adjusted by
- * sm_space (added for SM_ALLOC, subtracted for any other maptype), a debug
- * entry recording the maptype, sync pass and txg is emitted, and every
- * segment is taken out of the tree and encoded as one or more run entries
- * of at most SM_RUN_MAX units each.  The entry buffer is written at offset
- * smo_objsize whenever it fills, and once more at the end for any partial
- * contents; smo_objsize grows by each amount written.  On return sm_space
- * must be zero (VERIFY'd).
- *
- * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
- */
-void
-space_map_sync(space_map_t *sm, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
-{
- spa_t *spa = dmu_objset_spa(os);
- void *cookie = NULL;
- space_seg_t *ss;
- uint64_t bufsize, start, size, run_len;
- uint64_t *entry, *entry_map, *entry_map_end;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- if (sm->sm_space == 0)
- return;
-
- dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
- smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
- maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
- sm->sm_space);
-
- if (maptype == SM_ALLOC)
- smo->smo_alloc += sm->sm_space;
- else
- smo->smo_alloc -= sm->sm_space;
-
- /* Size the buffer from the node count, capped at one block. */
- bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
- bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
- entry_map = zio_buf_alloc(bufsize);
- entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
- entry = entry_map;
-
- /* The first entry written is a debug record, not a run. */
- *entry++ = SM_DEBUG_ENCODE(1) |
- SM_DEBUG_ACTION_ENCODE(maptype) |
- SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
- SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
-
- while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
- size = ss->ss_end - ss->ss_start;
- start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
-
- sm->sm_space -= size;
- size >>= sm->sm_shift;
-
- /* From here on start and size are in 1 << sm_shift units. */
- while (size) {
- run_len = MIN(size, SM_RUN_MAX);
-
- /* Buffer full: flush it and start refilling from the top. */
- if (entry == entry_map_end) {
- mutex_exit(sm->sm_lock);
- dmu_write(os, smo->smo_object, smo->smo_objsize,
- bufsize, entry_map, tx);
- mutex_enter(sm->sm_lock);
- smo->smo_objsize += bufsize;
- entry = entry_map;
- }
-
- *entry++ = SM_OFFSET_ENCODE(start) |
- SM_TYPE_ENCODE(maptype) |
- SM_RUN_ENCODE(run_len);
-
- start += run_len;
- size -= run_len;
- }
- kmem_free(ss, sizeof (*ss));
- }
-
- /* Write out whatever is left in a partially filled buffer. */
- if (entry != entry_map) {
- size = (entry - entry_map) * sizeof (uint64_t);
- mutex_exit(sm->sm_lock);
- dmu_write(os, smo->smo_object, smo->smo_objsize,
- size, entry_map, tx);
- mutex_enter(sm->sm_lock);
- smo->smo_objsize += size;
- }
-
- zio_buf_free(entry_map, bufsize);
-
- VERIFY3U(sm->sm_space, ==, 0);
-}
-
-/*
- * Discard the on-disk contents of a space map object: free the range
- * from offset 0 with length -1ULL via dmu_free_range() (failure trips
- * VERIFY), then reset the recorded object size and allocation total to
- * zero.
- */
-void
-space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
-{
- VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
-
- smo->smo_objsize = 0;
- smo->smo_alloc = 0;
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
deleted file mode 100644
index f58ffc0..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ARC_H
-#define _SYS_ARC_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/zio.h>
-
-typedef struct arc_buf_hdr arc_buf_hdr_t;
-typedef struct arc_buf arc_buf_t;
-typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
-typedef void arc_byteswap_func_t(void *buf, size_t size);
-typedef int arc_evict_func_t(void *private);
-
-/* generic arc_done_func_t's which you can use */
-arc_done_func_t arc_bcopy_func;
-arc_done_func_t arc_getbuf_func;
-
-struct arc_buf {
- arc_buf_hdr_t *b_hdr;
- arc_buf_t *b_next;
- void *b_data;
- arc_evict_func_t *b_efunc;
- void *b_private;
-};
-
-typedef enum arc_buf_contents {
- ARC_BUFC_UNDEF, /* buffer contents undefined */
- ARC_BUFC_DATA, /* buffer contains data */
- ARC_BUFC_METADATA /* buffer contains metadata */
-} arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
-#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
-#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
-#define ARC_CACHED (1 << 4) /* I/O was already in cache */
-
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
- arc_buf_contents_t type);
-void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
-int arc_buf_size(arc_buf_t *buf);
-void arc_release(arc_buf_t *buf, void *tag);
-int arc_released(arc_buf_t *buf);
-int arc_has_callback(arc_buf_t *buf);
-void arc_buf_freeze(arc_buf_t *buf);
-void arc_buf_thaw(arc_buf_t *buf);
-#ifdef ZFS_DEBUG
-int arc_referenced(arc_buf_t *buf);
-#endif
-
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t *arc_flags, zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
- int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
-
-void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
-int arc_buf_evict(arc_buf_t *buf);
-
-void arc_flush(void);
-void arc_tempreserve_clear(uint64_t tempreserve);
-int arc_tempreserve_space(uint64_t tempreserve);
-
-void arc_init(void);
-void arc_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ARC_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
deleted file mode 100644
index b4c8376..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_BPLIST_H
-#define _SYS_BPLIST_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct bplist_phys {
- /*
- * This is the bonus buffer for the dead lists. The object's
- * contents is an array of bpl_entries blkptr_t's, representing
- * a total of bpl_bytes physical space.
- */
- uint64_t bpl_entries;
- uint64_t bpl_bytes;
- uint64_t bpl_comp;
- uint64_t bpl_uncomp;
-} bplist_phys_t;
-
-#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
- blkptr_t bpq_blk;
- void *bpq_next;
-} bplist_q_t;
-
-typedef struct bplist {
- kmutex_t bpl_lock;
- objset_t *bpl_mos;
- uint64_t bpl_object;
- uint8_t bpl_blockshift;
- uint8_t bpl_bpshift;
- uint8_t bpl_havecomp;
- bplist_q_t *bpl_queue;
- bplist_phys_t *bpl_phys;
- dmu_buf_t *bpl_dbuf;
- dmu_buf_t *bpl_cached_dbuf;
-} bplist_t;
-
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BPLIST_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
deleted file mode 100644
index d33657b..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ /dev/null
@@ -1,334 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DBUF_H
-#define _SYS_DBUF_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/arc.h>
-#include <sys/zfs_context.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DB_BONUS_BLKID (-1ULL)
-#define IN_DMU_SYNC 2
-
-/*
- * define flags for dbuf_read
- */
-
-#define DB_RF_MUST_SUCCEED (1 << 0)
-#define DB_RF_CANFAIL (1 << 1)
-#define DB_RF_HAVESTRUCT (1 << 2)
-#define DB_RF_NOPREFETCH (1 << 3)
-#define DB_RF_NEVERWAIT (1 << 4)
-#define DB_RF_CACHED (1 << 5)
-
-/*
- * The state transition diagram for dbufs looks like:
- *
- * +----> READ ----+
- * | |
- * | V
- * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
- * | ^
- * | |
- * +----> FILL ----+
- */
-typedef enum dbuf_states {
- DB_UNCACHED,
- DB_FILL,
- DB_READ,
- DB_CACHED,
- DB_EVICTING
-} dbuf_states_t;
-
-struct objset_impl;
-struct dnode;
-struct dmu_tx;
-
-/*
- * level = 0 means the user data
- * level = 1 means the single indirect block
- * etc.
- */
-
-#define LIST_LINK_INACTIVE(link) \
- ((link)->list_next == NULL && (link)->list_prev == NULL)
-
-struct dmu_buf_impl;
-
-typedef enum override_states {
- DR_NOT_OVERRIDDEN,
- DR_IN_DMU_SYNC,
- DR_OVERRIDDEN
-} override_states_t;
-
-typedef struct dbuf_dirty_record {
- /* link on our parents dirty list */
- list_node_t dr_dirty_node;
-
- /* transaction group this data will sync in */
- uint64_t dr_txg;
-
- /* zio of outstanding write IO */
- zio_t *dr_zio;
-
- /* pointer back to our dbuf */
- struct dmu_buf_impl *dr_dbuf;
-
- /* pointer to next dirty record */
- struct dbuf_dirty_record *dr_next;
-
- /* pointer to parent dirty record */
- struct dbuf_dirty_record *dr_parent;
-
- union dirty_types {
- struct dirty_indirect {
-
- /* protect access to list */
- kmutex_t dr_mtx;
-
- /* Our list of dirty children */
- list_t dr_children;
- } di;
- struct dirty_leaf {
-
- /*
- * dr_data is set when we dirty the buffer
- * so that we can retain the pointer even if it
- * gets COW'd in a subsequent transaction group.
- */
- arc_buf_t *dr_data;
- blkptr_t dr_overridden_by;
- override_states_t dr_override_state;
- } dl;
- } dt;
-} dbuf_dirty_record_t;
-
-typedef struct dmu_buf_impl {
- /*
- * The following members are immutable, with the exception of
- * db.db_data, which is protected by db_mtx.
- */
-
- /* the publicly visible structure */
- dmu_buf_t db;
-
- /* the objset we belong to */
- struct objset_impl *db_objset;
-
- /*
- * the dnode we belong to (NULL when evicted)
- */
- struct dnode *db_dnode;
-
- /*
- * our parent buffer; if the dnode points to us directly,
- * db_parent == db_dnode->dn_dbuf
- * only accessed by sync thread ???
- * (NULL when evicted)
- */
- struct dmu_buf_impl *db_parent;
-
- /*
- * link for hash table of all dmu_buf_impl_t's
- */
- struct dmu_buf_impl *db_hash_next;
-
- /* our block number */
- uint64_t db_blkid;
-
- /*
- * Pointer to the blkptr_t which points to us. May be NULL if we
- * don't have one yet. (NULL when evicted)
- */
- blkptr_t *db_blkptr;
-
- /*
- * Our indirection level. Data buffers have db_level==0.
- * Indirect buffers which point to data buffers have
- * db_level==1. etc. Buffers which contain dnodes have
- * db_level==0, since the dnodes are stored in a file.
- */
- uint8_t db_level;
-
- /* db_mtx protects the members below */
- kmutex_t db_mtx;
-
- /*
- * Current state of the buffer
- */
- dbuf_states_t db_state;
-
- /*
- * Refcount accessed by dmu_buf_{hold,rele}.
- * If nonzero, the buffer can't be destroyed.
- * Protected by db_mtx.
- */
- refcount_t db_holds;
-
- /* buffer holding our data */
- arc_buf_t *db_buf;
-
- kcondvar_t db_changed;
- dbuf_dirty_record_t *db_data_pending;
-
- /* pointer to most recent dirty record for this buffer */
- dbuf_dirty_record_t *db_last_dirty;
-
- /*
- * Our link on the owner dnodes's dn_dbufs list.
- * Protected by its dn_dbufs_mtx.
- */
- list_node_t db_link;
-
- /* Data which is unique to data (leaf) blocks: */
-
- /* stuff we store for the user (see dmu_buf_set_user) */
- void *db_user_ptr;
- void **db_user_data_ptr_ptr;
- dmu_buf_evict_func_t *db_evict_func;
-
- uint8_t db_immediate_evict;
- uint8_t db_freed_in_flight;
-
- uint8_t db_dirtycnt;
-} dmu_buf_impl_t;
-
-/* Note: the dbuf hash table is exposed only for the mdb module */
-#define DBUF_MUTEXES 256
-#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
-typedef struct dbuf_hash_table {
- uint64_t hash_table_mask;
- dmu_buf_impl_t **hash_table;
- kmutex_t hash_mutexes[DBUF_MUTEXES];
-} dbuf_hash_table_t;
-
-
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
-
-dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
-dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
-
-dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
-dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
- void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
- void *tag, dmu_buf_impl_t **dbp);
-
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
-
-void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
-uint64_t dbuf_refcount(dmu_buf_impl_t *db);
-
-void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-
-dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
-
-/* Read, dirty and fill entry points; each function is declared once. */
-int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
-void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
-dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-
-void dbuf_clear(dmu_buf_impl_t *db);
-void dbuf_evict(dmu_buf_impl_t *db);
-
-void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
-
-void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
- struct dmu_tx *);
-
-void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
-
-void dbuf_init(void);
-void dbuf_fini(void);
-
-/*
- * Pick the ARC buffer contents type for a dbuf: ARC_BUFC_METADATA when
- * db_level is above zero or when the dmu_ot[] entry for the dnode's type
- * has ot_metadata set, ARC_BUFC_DATA otherwise.
- *
- * The expansion is a plain parenthesized expression with no trailing
- * semicolon, so it can be used inside a larger expression or as a
- * function argument; the caller supplies the statement terminator.
- */
-#define	DBUF_GET_BUFC_TYPE(db)					\
-	((((db)->db_level > 0) ||				\
-	    (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ?	\
-	    ARC_BUFC_METADATA : ARC_BUFC_DATA)
-
-#ifdef ZFS_DEBUG
-
-/*
- * There should be a ## between the string literal and fmt, to make it
- * clear that we're joining two strings together, but gcc does not
- * support that preprocessor token.
- */
-#define dprintf_dbuf(dbuf, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char __db_buf[32]; \
- uint64_t __db_obj = (dbuf)->db.db_object; \
- if (__db_obj == DMU_META_DNODE_OBJECT) \
- (void) strcpy(__db_buf, "mdn"); \
- else \
- (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
- (u_longlong_t)__db_obj); \
- dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
- "obj=%s lvl=%u blkid=%lld " fmt, \
- __db_buf, (dbuf)->db_level, \
- (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
- dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
- kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define DBUF_VERIFY(db) dbuf_verify(db)
-
-#else
-
-#define dprintf_dbuf(db, fmt, ...)
-#define dprintf_dbuf_bp(db, bp, fmt, ...)
-#define DBUF_VERIFY(db)
-
-#endif
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DBUF_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
deleted file mode 100644
index 8c2a1fd..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_H
-#define _SYS_DMU_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file describes the interface that the DMU provides for its
- * consumers.
- *
- * The DMU also interacts with the SPA. That interface is described in
- * dmu_spa.h.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct uio;
-struct page;
-struct vnode;
-struct spa;
-struct zilog;
-struct zio;
-struct blkptr;
-struct zap_cursor;
-struct dsl_dataset;
-struct dsl_pool;
-struct dnode;
-struct drr_begin;
-struct drr_end;
-struct zbookmark;
-struct spa;
-struct nvlist;
-struct objset_impl;
-struct file;
-
-typedef struct objset objset_t;
-typedef struct dmu_tx dmu_tx_t;
-typedef struct dsl_dir dsl_dir_t;
-
-typedef enum dmu_object_type {
- DMU_OT_NONE,
- /* general: */
- DMU_OT_OBJECT_DIRECTORY, /* ZAP */
- DMU_OT_OBJECT_ARRAY, /* UINT64 */
- DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
- DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
- DMU_OT_BPLIST, /* UINT64 */
- DMU_OT_BPLIST_HDR, /* UINT64 */
- /* spa: */
- DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
- DMU_OT_SPACE_MAP, /* UINT64 */
- /* zil: */
- DMU_OT_INTENT_LOG, /* UINT64 */
- /* dmu: */
- DMU_OT_DNODE, /* DNODE */
- DMU_OT_OBJSET, /* OBJSET */
- /* dsl: */
- DMU_OT_DSL_DIR, /* UINT64 */
- DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */
- DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */
- DMU_OT_DSL_PROPS, /* ZAP */
- DMU_OT_DSL_DATASET, /* UINT64 */
- /* zpl: */
- DMU_OT_ZNODE, /* ZNODE */
- DMU_OT_ACL, /* ACL */
- DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */
- DMU_OT_DIRECTORY_CONTENTS, /* ZAP */
- DMU_OT_MASTER_NODE, /* ZAP */
- DMU_OT_UNLINKED_SET, /* ZAP */
- /* zvol: */
- DMU_OT_ZVOL, /* UINT8 */
- DMU_OT_ZVOL_PROP, /* ZAP */
- /* other; for testing only! */
- DMU_OT_PLAIN_OTHER, /* UINT8 */
- DMU_OT_UINT64_OTHER, /* UINT64 */
- DMU_OT_ZAP_OTHER, /* ZAP */
- /* new object types: */
- DMU_OT_ERROR_LOG, /* ZAP */
- DMU_OT_SPA_HISTORY, /* UINT8 */
- DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */
- DMU_OT_POOL_PROPS, /* ZAP */
-
- DMU_OT_NUMTYPES
-} dmu_object_type_t;
-
-typedef enum dmu_objset_type {
- DMU_OST_NONE,
- DMU_OST_META,
- DMU_OST_ZFS,
- DMU_OST_ZVOL,
- DMU_OST_OTHER, /* For testing only! */
- DMU_OST_ANY, /* Be careful! */
- DMU_OST_NUMTYPES
-} dmu_objset_type_t;
-
-void byteswap_uint64_array(void *buf, size_t size);
-void byteswap_uint32_array(void *buf, size_t size);
-void byteswap_uint16_array(void *buf, size_t size);
-void byteswap_uint8_array(void *buf, size_t size);
-void zap_byteswap(void *buf, size_t size);
-void zfs_acl_byteswap(void *buf, size_t size);
-void zfs_znode_byteswap(void *buf, size_t size);
-
-#define DS_MODE_NONE 0 /* invalid, to aid debugging */
-#define DS_MODE_STANDARD 1 /* normal access, no special needs */
-#define DS_MODE_PRIMARY 2 /* the "main" access, e.g. a mount */
-#define DS_MODE_EXCLUSIVE 3 /* exclusive access, e.g. to destroy */
-#define DS_MODE_LEVELS 4
-#define DS_MODE_LEVEL(x) ((x) & (DS_MODE_LEVELS - 1))
-#define DS_MODE_READONLY 0x8
-#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY)
-#define DS_MODE_INCONSISTENT 0x10
-#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT)
-
-#define DS_FIND_SNAPSHOTS (1<<0)
-#define DS_FIND_CHILDREN (1<<1)
-
-/*
- * The maximum number of bytes that can be accessed as part of one
- * operation, including metadata.
- */
-#define DMU_MAX_ACCESS (10<<20) /* 10MB */
-
-/*
- * Public routines to create, destroy, open, and close objsets.
- */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_evict_dbufs(objset_t *os, int try);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_snapshots_destroy(char *fsname, char *snapname);
-int dmu_objset_rollback(const char *name);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
-int dmu_objset_rename(const char *name, const char *newname,
- boolean_t recursive);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
- int flags);
-void dmu_objset_byteswap(void *buf, size_t size);
-
-typedef struct dmu_buf {
- uint64_t db_object; /* object that this buffer is part of */
- uint64_t db_offset; /* byte offset in this object */
- uint64_t db_size; /* size of buffer in bytes */
- void *db_data; /* data in buffer */
-} dmu_buf_t;
-
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
-/*
- * Callback function to perform byte swapping on a block.
- */
-typedef void dmu_byteswap_func_t(void *buf, size_t size);
-
-/*
- * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
- */
-#define DMU_POOL_DIRECTORY_OBJECT 1
-#define DMU_POOL_CONFIG "config"
-#define DMU_POOL_ROOT_DATASET "root_dataset"
-#define DMU_POOL_SYNC_BPLIST "sync_bplist"
-#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
-#define DMU_POOL_ERRLOG_LAST "errlog_last"
-#define DMU_POOL_SPARES "spares"
-#define DMU_POOL_DEFLATE "deflate"
-#define DMU_POOL_HISTORY "history"
-#define DMU_POOL_PROPS "pool_props"
-
-/*
- * Allocate an object from this objset. The range of object numbers
- * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode.
- *
- * The transaction must be assigned to a txg. The newly allocated
- * object will be "held" in the transaction (ie. you can modify the
- * newly allocated object in this transaction).
- *
- * dmu_object_alloc() chooses an object and returns it in *objectp.
- *
- * dmu_object_claim() allocates a specific object number. If that
- * number is already allocated, it fails and returns EEXIST.
- *
- * Return 0 on success, or ENOSPC or EEXIST as specified above.
- */
-uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
-int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
-int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-
-/*
- * Free an object from this objset.
- *
- * The object's data will be freed as well (ie. you don't need to call
- * dmu_free(object, 0, -1, tx)).
- *
- * The object need not be held in the transaction.
- *
- * If there are any holds on this object's buffers (via dmu_buf_hold()),
- * or tx holds on the object (via dmu_tx_hold_object()), you can not
- * free it; it fails and returns EBUSY.
- *
- * If the object is not allocated, it fails and returns ENOENT.
- *
- * Return 0 on success, or EBUSY or ENOENT as specified above.
- */
-int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
-
-/*
- * Find the next allocated or free object.
- *
- * The objectp parameter is in-out. It will be updated to be the next
- * object which is allocated. Ignore objects which have not been
- * modified since txg.
- *
- * XXX Can only be called on an objset with no dirty data.
- *
- * Returns 0 on success, or ENOENT if there are no more objects.
- */
-int dmu_object_next(objset_t *os, uint64_t *objectp,
- boolean_t hole, uint64_t txg);
-
-/*
- * Set the data blocksize for an object.
- *
- * The object cannot have any blocks allocated beyond the first. If
- * the first block is allocated already, the new size must be greater
- * than the current block size. If these conditions are not met,
- * ENOTSUP will be returned.
- *
- * Returns 0 on success, or EBUSY if there are any holds on the object
- * contents, or ENOTSUP as described above.
- */
-int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
- int ibs, dmu_tx_t *tx);
-
-/*
- * Set the checksum property on a dnode. The new checksum algorithm will
- * apply to all newly written blocks; existing blocks will not be affected.
- */
-void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx);
-
-/*
- * Set the compress property on a dnode. The new compression algorithm will
- * apply to all newly written blocks; existing blocks will not be affected.
- */
-void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx);
-
-/*
- * Decide how many copies of a given block we should make. Can be from
- * 1 to SPA_DVAS_PER_BP.
- */
-int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
- dmu_object_type_t ot);
-/*
- * The bonus data is accessed more or less like a regular buffer.
- * You must dmu_bonus_hold() to get the buffer, which will give you a
- * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
- * data. As with any normal buffer, you must call dmu_buf_read() to
- * read db_data, dmu_buf_will_dirty() before modifying it, and the
- * object must be held in an assigned transaction before calling
- * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
- * buffer as well. You must release your hold with dmu_buf_rele().
- */
-int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
-int dmu_bonus_max(void);
-
-/*
- * Obtain the DMU buffer from the specified object which contains the
- * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so
- * that it will remain in memory. You must release the hold with
- * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your
- * hold. You must have a hold on any dmu_buf_t* you pass to the DMU.
- *
- * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
- * on the returned buffer before reading or writing the buffer's
- * db_data. The comments for those routines describe what particular
- * operations are valid after calling them.
- *
- * The object number must be a valid, allocated object number.
- */
-int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **);
-void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
-void dmu_buf_rele(dmu_buf_t *db, void *tag);
-uint64_t dmu_buf_refcount(dmu_buf_t *db);
-
-/*
- * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
- * range of an object. A pointer to an array of dmu_buf_t*'s is
- * returned (in *dbpp).
- *
- * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
- * frees the array. The hold on the array of buffers MUST be released
- * with dmu_buf_rele_array. You can NOT release the hold on each buffer
- * individually with dmu_buf_rele.
- */
-int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
-void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
-
-/*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
- *
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
- *
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it. Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
- *
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
- *
- * dmu_evict_user() will call the pageout func for all buffers in an
- * objset with a given pageout func.
- */
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *pageout_func);
-/*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
- */
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
- void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
- void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
-
-/*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
- */
-void *dmu_buf_get_user(dmu_buf_t *db);
-
-/*
- * Indicate that you are going to modify the buffer's data (db_data).
- *
- * The transaction (tx) must be assigned to a txg (ie. you've called
- * dmu_tx_assign()). The buffer's object must be held in the tx
- * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
- */
-void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
-
-/*
- * You must create a transaction, then hold the objects which you will
- * (or might) modify as part of this transaction. Then you must assign
- * the transaction to a transaction group. Once the transaction has
- * been assigned, you can modify buffers which belong to held objects as
- * part of this transaction. You can't modify buffers before the
- * transaction has been assigned; you can't modify buffers which don't
- * belong to objects which this transaction holds; you can't hold
- * objects once the transaction has been assigned. You may hold an
- * object which you are going to free (with dmu_object_free()), but you
- * don't have to.
- *
- * You can abort the transaction before it has been assigned.
- *
- * Note that you may hold buffers (with dmu_buf_hold) at any time,
- * regardless of transaction state.
- */
-
-#define DMU_NEW_OBJECT (-1ULL)
-#define DMU_OBJECT_END (-1ULL)
-
-dmu_tx_t *dmu_tx_create(objset_t *os);
-void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
-void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
- uint64_t len);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
-void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
-void dmu_tx_wait(dmu_tx_t *tx);
-void dmu_tx_commit(dmu_tx_t *tx);
-
-/*
- * Free up the data blocks for a defined range of a file. If size is
- * zero, the range from offset to end-of-file is freed.
- */
-int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, dmu_tx_t *tx);
-
-/*
- * Convenience functions.
- *
- * Canfail routines will return 0 on success, or an errno if there is a
- * nonrecoverable I/O error.
- */
-int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf);
-void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- const void *buf, dmu_tx_t *tx);
-int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
-int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
- dmu_tx_t *tx);
-int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t size, struct page *pp, dmu_tx_t *tx);
-
-extern int zfs_prefetch_disable;
-
-/*
- * Asynchronously try to read in the data.
- */
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t len);
-
-typedef struct dmu_object_info {
- /* All sizes are in bytes. */
- uint32_t doi_data_block_size;
- uint32_t doi_metadata_block_size;
- uint64_t doi_bonus_size;
- dmu_object_type_t doi_type;
- dmu_object_type_t doi_bonus_type;
- uint8_t doi_indirection; /* 2 = dnode->indirect->data */
- uint8_t doi_checksum;
- uint8_t doi_compress;
- uint8_t doi_pad[5];
- /* Values below are number of 512-byte blocks. */
- uint64_t doi_physical_blks; /* data + metadata */
- uint64_t doi_max_block_offset;
-} dmu_object_info_t;
-
-typedef struct dmu_object_type_info {
- dmu_byteswap_func_t *ot_byteswap;
- boolean_t ot_metadata;
- char *ot_name;
-} dmu_object_type_info_t;
-
-extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
-
-/*
- * Get information on a DMU object.
- *
- * Return 0 on success or ENOENT if object is not allocated.
- *
- * If doi is NULL, just indicates whether the object exists.
- */
-int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
-void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
-void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
-void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
- u_longlong_t *nblk512);
-
-typedef struct dmu_objset_stats {
- uint64_t dds_num_clones; /* number of clones of this */
- uint64_t dds_creation_txg;
- dmu_objset_type_t dds_type;
- uint8_t dds_is_snapshot;
- uint8_t dds_inconsistent;
- char dds_clone_of[MAXNAMELEN];
-} dmu_objset_stats_t;
-
-/*
- * Get stats on a dataset.
- */
-void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
-
-/*
- * Add entries to the nvlist for all the objset's properties. See
- * zfs_prop_table[] and zfs(1m) for details on the properties.
- */
-void dmu_objset_stats(objset_t *os, struct nvlist *nv);
-
-/*
- * Get the space usage statistics for statvfs().
- *
- * refdbytes is the amount of space "referenced" by this objset.
- * availbytes is the amount of space available to this objset, taking
- * into account quotas & reservations, assuming that no other objsets
- * use the space first. These values correspond to the 'referenced' and
- * 'available' properties, described in the zfs(1m) manpage.
- *
- * usedobjs and availobjs are the number of objects currently allocated,
- * and available.
- */
-void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-
-/*
- * The fsid_guid is a 56-bit ID that can change to avoid collisions.
- * (Contrast with the ds_guid which is a 64-bit ID that will never
- * change, so there is a small probability that it will collide.)
- */
-uint64_t dmu_objset_fsid_guid(objset_t *os);
-
-int dmu_objset_is_snapshot(objset_t *os);
-
-extern struct spa *dmu_objset_spa(objset_t *os);
-extern struct zilog *dmu_objset_zil(objset_t *os);
-extern struct dsl_pool *dmu_objset_pool(objset_t *os);
-extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
-extern void dmu_objset_name(objset_t *os, char *buf);
-extern dmu_objset_type_t dmu_objset_type(objset_t *os);
-extern uint64_t dmu_objset_id(objset_t *os);
-extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
- uint64_t *id, uint64_t *offp);
-extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
- uint64_t *idp, uint64_t *offp);
-
-/*
- * Return the txg number for the given assigned transaction.
- */
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
-
-/*
- * Synchronous write.
- * If a parent zio is provided this function initiates a write on the
- * provided buffer as a child of the parent zio.
- * In the absence of a parent zio, the write is completed synchronously.
- * At write completion, blk is filled with the bp of the written block.
- * Note that while the data covered by this function will be on stable
- * storage when the write completes this new data does not become a
- * permanent part of the file until the associated transaction commits.
- */
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
- struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
-
-/*
- * Find the next hole or data block in file starting at *off.
- * Return found offset in *off. Return ESRCH for end of file.
- */
-int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
- uint64_t *off);
-
-/*
- * Initial setup and final teardown.
- */
-extern void dmu_init(void);
-extern void dmu_fini(void);
-
-typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
- uint64_t object, uint64_t offset, int len);
-void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
- dmu_traverse_cb_t cb, void *arg);
-
-int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct file *fp);
-int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
- boolean_t force, struct file *fp, uint64_t voffset);
-
-/* CRC64 table */
-#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
-extern uint64_t zfs_crc64_table[256];
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
deleted file mode 100644
index 807011e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_IMPL_H
-#define _SYS_DMU_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/txg_impl.h>
-#include <sys/zio.h>
-#include <sys/dnode.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * This is the locking strategy for the DMU. Numbers in parentheses are
- * cases that use that lock order, referenced below:
- *
- * ARC is self-contained
- * bplist is self-contained
- * refcount is self-contained
- * txg is self-contained (hopefully!)
- * zst_lock
- * zf_rwlock
- *
- * XXX try to improve evicting path?
- *
- * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
- * dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
- *
- * dp_config_rwlock
- * must be held before: everything
- * protects dd namespace changes
- * protects property changes globally
- * held from:
- * dsl_dir_open/r:
- * dsl_dir_create_sync/w:
- * dsl_dir_sync_destroy/w:
- * dsl_dir_rename_sync/w:
- * dsl_prop_changed_notify/r:
- *
- * os_obj_lock
- * must be held before:
- * everything except dp_config_rwlock
- * protects os_obj_next
- * held from:
- * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
- *
- * dn_struct_rwlock
- * must be held before:
- * everything except dp_config_rwlock and os_obj_lock
- * protects structure of dnode (eg. nlevels)
- * db_blkptr can change when syncing out change to nlevels
- * dn_maxblkid
- * dn_nlevels
- * dn_*blksz*
- * phys nlevels, maxblkid, physical blkptr_t's (?)
- * held from:
- * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
- * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
- * dmu_tx_count_free:
- * dbuf_read_impl: db_mtx, dmu_zfetch()
- * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
- * dbuf_new_size: db_mtx
- * dbuf_dirty: db_mtx
- * dbuf_findbp: (callers, phys? - the real need)
- * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
- * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
- * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
- * dnode_sync/w (increase_indirection): db_mtx (phys)
- * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
- * dnode_new_blkid/w: (dn_maxblkid)
- * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
- * dnode_next_offset: (phys)
- *
- * dn_dbufs_mtx
- * must be held before:
- * db_mtx, hash_mutexes
- * protects:
- * dn_dbufs
- * dn_evicted
- * held from:
- * dmu_evict_user: db_mtx (dn_dbufs)
- * dbuf_free_range: db_mtx (dn_dbufs)
- * dbuf_remove_ref: db_mtx, callees:
- * dbuf_hash_remove: hash_mutexes, db_mtx
- * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
- * dnode_set_blksz: (dn_dbufs)
- *
- * hash_mutexes (global)
- * must be held before:
- * db_mtx
- * protects dbuf_hash_table (global) and db_hash_next
- * held from:
- * dbuf_find: db_mtx
- * dbuf_hash_insert: db_mtx
- * dbuf_hash_remove: db_mtx
- *
- * db_mtx (meta-leaf)
- * must be held before:
- * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
- * protects:
- * db_state
- * db_holds
- * db_buf
- * db_changed
- * db_data_pending
- * db_dirtied
- * db_link
- * db_dirty_node (??)
- * db_dirtycnt
- * db_d.*
- * db.*
- * held from:
- * dbuf_dirty: dn_mtx, dn_dirty_mtx
- * dbuf_dirty->dsl_dir_willuse_space: dd_lock
- * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
- * dbuf_undirty: dn_dirty_mtx (db_d)
- * dbuf_write_done: dn_dirty_mtx (db_state)
- * dbuf_*
- * dmu_buf_update_user: none (db_d)
- * dmu_evict_user: none (db_d) (maybe can eliminate)
- * dbuf_find: none (db_holds)
- * dbuf_hash_insert: none (db_holds)
- * dmu_buf_read_array_impl: none (db_state, db_changed)
- * dmu_sync: none (db_dirty_node, db_d)
- * dnode_reallocate: none (db)
- *
- * dn_mtx (leaf)
- * protects:
- * dn_dirty_dbufs
- * dn_ranges
- * phys accounting
- * dn_allocated_txg
- * dn_free_txg
- * dn_assigned_txg
- * dd_assigned_tx
- * dn_notxholds
- * dn_dirtyctx
- * dn_dirtyctx_firstset
- * (dn_phys copy fields?)
- * (dn_phys contents?)
- * held from:
- * dnode_*
- * dbuf_dirty: none
- * dbuf_sync: none (phys accounting)
- * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
- * dbuf_write_done: none (phys accounting)
- * dmu_object_info_from_dnode: none (accounting)
- * dmu_tx_commit: none
- * dmu_tx_hold_object_impl: none
- * dmu_tx_try_assign: dn_notxholds(cv)
- * dmu_tx_unassign: none
- *
- * dd_lock (leaf)
- * protects:
- * dd_prop_cbs
- * dd_sync_*
- * dd_used_bytes
- * dd_tempreserved
- * dd_space_towrite
- * dd_myname
- * dd_phys accounting?
- * held from:
- * dsl_dir_*
- * dsl_prop_changed_notify: none (dd_prop_cbs)
- * dsl_prop_register: none (dd_prop_cbs)
- * dsl_prop_unregister: none (dd_prop_cbs)
- * dsl_dataset_block_freeable: none (dd_sync_*)
- *
- * os_lock (leaf)
- * protects:
- * os_dirty_dnodes
- * os_free_dnodes
- * os_dnodes
- * os_downgraded_dbufs
- * dn_dirtyblksz
- * dn_dirty_link
- * held from:
- * dnode_create: none (os_dnodes)
- * dnode_destroy: none (os_dnodes)
- * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
- * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
- *
- * ds_lock (leaf)
- * protects:
- * ds_user_ptr
- * ds_user_evict_func
- * ds_open_refcount
- * ds_snapname
- * ds_phys accounting
- * held from:
- * dsl_dataset_*
- *
- * dr_mtx (leaf)
- * protects:
- * dr_children
- * held from:
- * dbuf_dirty
- * dbuf_undirty
- * dbuf_sync_indirect
- * dnode_new_blkid
- */
-
-struct objset;
-struct dmu_pool;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
deleted file mode 100644
index 8293a3b..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_OBJSET_H
-#define _SYS_DMU_OBJSET_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/arc.h>
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-#include <sys/dnode.h>
-#include <sys/zio.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-struct dmu_tx;
-struct objset_impl;
-
-typedef struct objset_phys {
- dnode_phys_t os_meta_dnode;
- zil_header_t os_zil_header;
- uint64_t os_type;
- char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
- sizeof (uint64_t)];
-} objset_phys_t;
-
-struct objset {
- struct objset_impl *os;
- int os_mode;
-};
-
-typedef struct objset_impl {
- /* Immutable: */
- struct dsl_dataset *os_dsl_dataset;
- spa_t *os_spa;
- arc_buf_t *os_phys_buf;
- objset_phys_t *os_phys;
- dnode_t *os_meta_dnode;
- zilog_t *os_zil;
- objset_t os;
- uint8_t os_checksum; /* can change, under dsl_dir's locks */
- uint8_t os_compress; /* can change, under dsl_dir's locks */
- uint8_t os_copies; /* can change, under dsl_dir's locks */
- uint8_t os_md_checksum;
- uint8_t os_md_compress;
-
- /* no lock needed: */
- struct dmu_tx *os_synctx; /* XXX sketchy */
- blkptr_t *os_rootbp;
-
- /* Protected by os_obj_lock */
- kmutex_t os_obj_lock;
- uint64_t os_obj_next;
-
- /* Protected by os_lock */
- kmutex_t os_lock;
- list_t os_dirty_dnodes[TXG_SIZE];
- list_t os_free_dnodes[TXG_SIZE];
- list_t os_dnodes;
- list_t os_downgraded_dbufs;
-} objset_impl_t;
-
-#define DMU_META_DNODE_OBJECT 0
-
-/* called from zpl */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
- objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
- objset_t *clone_parent,
- void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
-int dmu_objset_destroy(const char *name);
-int dmu_objset_rollback(const char *name);
-int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive);
-void dmu_objset_stats(objset_t *os, nvlist_t *nv);
-void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
-void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
- int flags);
-void dmu_objset_byteswap(void *buf, size_t size);
-int dmu_objset_evict_dbufs(objset_t *os, int try);
-
-/* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
- blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
-int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
- objset_impl_t **osip);
-void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_OBJSET_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
deleted file mode 100644
index ea9fa6c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_TRAVERSE_H
-#define _SYS_DMU_TRAVERSE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/dnode.h>
-#include <sys/arc.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ADVANCE_POST 0 /* post-order traversal */
-#define ADVANCE_PRE 0x01 /* pre-order traversal */
-#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
-#define ADVANCE_DATA 0x04 /* read user data blocks */
-#define ADVANCE_HOLES 0x08 /* visit holes */
-#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
-#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
-
-#define ZB_NO_LEVEL -2
-#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
-#define ZB_MAXBLKID (1ULL << 62)
-#define ZB_MAXOBJSET (1ULL << 62)
-#define ZB_MAXOBJECT (1ULL << 62)
-
-#define ZB_MOS_CACHE 0
-#define ZB_MDN_CACHE 1
-#define ZB_DN_CACHE 2
-#define ZB_DEPTH 3
-
-typedef struct zseg {
- uint64_t seg_mintxg;
- uint64_t seg_maxtxg;
- zbookmark_t seg_start;
- zbookmark_t seg_end;
- list_node_t seg_node;
-} zseg_t;
-
-typedef struct traverse_blk_cache {
- zbookmark_t bc_bookmark;
- blkptr_t bc_blkptr;
- void *bc_data;
- dnode_phys_t *bc_dnode;
- int bc_errno;
- int bc_pad1;
- uint64_t bc_pad2;
-} traverse_blk_cache_t;
-
-typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
-
-struct traverse_handle {
- spa_t *th_spa;
- blkptr_cb_t *th_func;
- void *th_arg;
- uint16_t th_advance;
- uint16_t th_locked;
- int th_zio_flags;
- list_t th_seglist;
- traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
- traverse_blk_cache_t th_zil_cache;
- uint64_t th_hits;
- uint64_t th_arc_hits;
- uint64_t th_reads;
- uint64_t th_callbacks;
- uint64_t th_syncs;
- uint64_t th_restarts;
- zbookmark_t th_noread;
- zbookmark_t th_lastcb;
-};
-
-int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
- int advance, blkptr_cb_t func, void *arg);
-
-traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
- int advance, int zio_flags);
-void traverse_fini(traverse_handle_t *th);
-
-void traverse_add_dnode(traverse_handle_t *th,
- uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
-void traverse_add_objset(traverse_handle_t *th,
- uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
-void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
-
-int traverse_more(traverse_handle_t *th);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
deleted file mode 100644
index 89f4799..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DMU_TX_H
-#define _SYS_DMU_TX_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/txg.h>
-#include <sys/refcount.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dmu_buf_impl;
-struct dmu_tx_hold;
-struct dnode_link;
-struct dsl_pool;
-struct dnode;
-struct dsl_dir;
-
-struct dmu_tx {
- /*
- * No synchronization is needed because a tx can only be handled
- * by one thread.
- */
- list_t tx_holds; /* list of dmu_tx_hold_t */
- objset_t *tx_objset;
- struct dsl_dir *tx_dir;
- struct dsl_pool *tx_pool;
- uint64_t tx_txg;
- uint64_t tx_lastsnap_txg;
- uint64_t tx_lasttried_txg;
- txg_handle_t tx_txgh;
- void *tx_tempreserve_cookie;
- struct dmu_tx_hold *tx_needassign_txh;
- uint8_t tx_anyobj;
- int tx_err;
-#ifdef ZFS_DEBUG
- uint64_t tx_space_towrite;
- uint64_t tx_space_tofree;
- uint64_t tx_space_tooverwrite;
- refcount_t tx_space_written;
- refcount_t tx_space_freed;
-#endif
-};
-
-enum dmu_tx_hold_type {
- THT_NEWOBJECT,
- THT_WRITE,
- THT_BONUS,
- THT_FREE,
- THT_ZAP,
- THT_SPACE,
- THT_NUMTYPES
-};
-
-typedef struct dmu_tx_hold {
- dmu_tx_t *txh_tx;
- list_node_t txh_node;
- struct dnode *txh_dnode;
- uint64_t txh_space_towrite;
- uint64_t txh_space_tofree;
- uint64_t txh_space_tooverwrite;
-#ifdef ZFS_DEBUG
- enum dmu_tx_hold_type txh_type;
- uint64_t txh_arg1;
- uint64_t txh_arg2;
-#endif
-} dmu_tx_hold_t;
-
-
-/*
- * These routines are defined in dmu.h, and are called by the user.
- */
-dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
-void dmu_tx_commit(dmu_tx_t *tx);
-void dmu_tx_abort(dmu_tx_t *tx);
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
-void dmu_tx_wait(dmu_tx_t *tx);
-
-/*
- * These routines are defined in dmu_spa.h, and are called by the SPA.
- */
-extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * These routines are only called by the DMU.
- */
-dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd);
-int dmu_tx_is_syncing(dmu_tx_t *tx);
-int dmu_tx_private_ok(dmu_tx_t *tx);
-void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
-void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
-void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
-int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
-void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
-
-#ifdef ZFS_DEBUG
-#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db)
-#else
-#define DMU_TX_DIRTY_BUF(tx, db)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DMU_TX_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
deleted file mode 100644
index c94bced..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _DFETCH_H
-#define _DFETCH_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern uint64_t zfetch_array_rd_sz;
-
-struct dnode; /* so we can reference dnode */
-
-typedef enum zfetch_dirn {
- ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
- ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
-} zfetch_dirn_t;
-
-typedef struct zstream {
- uint64_t zst_offset; /* offset of starting block in range */
- uint64_t zst_len; /* length of range, in blocks */
- zfetch_dirn_t zst_direction; /* direction of prefetch */
- uint64_t zst_stride; /* length of stride, in blocks */
- uint64_t zst_ph_offset; /* prefetch offset, in blocks */
- uint64_t zst_cap; /* prefetch limit (cap), in blocks */
- kmutex_t zst_lock; /* protects stream */
- clock_t zst_last; /* lbolt of last prefetch */
- avl_node_t zst_node; /* embed avl node here */
-} zstream_t;
-
-typedef struct zfetch {
- krwlock_t zf_rwlock; /* protects zfetch structure */
- list_t zf_stream; /* AVL tree of zstream_t's */
- struct dnode *zf_dnode; /* dnode that owns this zfetch */
- uint32_t zf_stream_cnt; /* # of active streams */
- uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
-} zfetch_t;
-
-void dmu_zfetch_init(zfetch_t *, struct dnode *);
-void dmu_zfetch_rele(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _DFETCH_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
deleted file mode 100644
index 327e538..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DNODE_H
-#define _SYS_DNODE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/refcount.h>
-#include <sys/dmu_zfetch.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Flags.
- */
-#define DNODE_MUST_BE_ALLOCATED 1
-#define DNODE_MUST_BE_FREE 2
-
-/*
- * Fixed constants.
- */
-#define DNODE_SHIFT 9 /* 512 bytes */
-#define DN_MIN_INDBLKSHIFT 10 /* 1k */
-#define DN_MAX_INDBLKSHIFT 14 /* 16k */
-#define DNODE_BLOCK_SHIFT 14 /* 16k */
-#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
-#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
-#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */
-
-/*
- * Derived constants.
- */
-#define DNODE_SIZE (1 << DNODE_SHIFT)
-#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
-#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
-#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
-
-#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
-#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
-#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
-
-/* The +2 here is a cheesy way to round up */
-#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
- (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
-
-#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
- (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
-
-#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
- (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
-
-#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift))
-
-struct dmu_buf_impl;
-struct objset_impl;
-struct zio;
-
-enum dnode_dirtycontext {
- DN_UNDIRTIED,
- DN_DIRTY_OPEN,
- DN_DIRTY_SYNC
-};
-
-/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
-#define DNODE_FLAG_USED_BYTES (1<<0)
-
-typedef struct dnode_phys {
- uint8_t dn_type; /* dmu_object_type_t */
- uint8_t dn_indblkshift; /* ln2(indirect block size) */
- uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */
- uint8_t dn_nblkptr; /* length of dn_blkptr */
- uint8_t dn_bonustype; /* type of data in bonus buffer */
- uint8_t dn_checksum; /* ZIO_CHECKSUM type */
- uint8_t dn_compress; /* ZIO_COMPRESS type */
- uint8_t dn_flags; /* DNODE_FLAG_* */
- uint16_t dn_datablkszsec; /* data block size in 512b sectors */
- uint16_t dn_bonuslen; /* length of dn_bonus */
- uint8_t dn_pad2[4];
-
- /* accounting is protected by dn_dirty_mtx */
- uint64_t dn_maxblkid; /* largest allocated block ID */
- uint64_t dn_used; /* bytes (or sectors) of disk space */
-
- uint64_t dn_pad3[4];
-
- blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN];
-} dnode_phys_t;
-
-typedef struct dnode {
- /*
- * dn_struct_rwlock protects the structure of the dnode,
- * including the number of levels of indirection (dn_nlevels),
- * dn_maxblkid, and dn_next_*
- */
- krwlock_t dn_struct_rwlock;
-
- /*
- * Our link on dataset's dd_dnodes list.
- * Protected by dd_accounting_mtx.
- */
- list_node_t dn_link;
-
- /* immutable: */
- struct objset_impl *dn_objset;
- uint64_t dn_object;
- struct dmu_buf_impl *dn_dbuf;
- dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
-
- /*
- * Copies of stuff in dn_phys. They're valid in the open
- * context (eg. even before the dnode is first synced).
- * Where necessary, these are protected by dn_struct_rwlock.
- */
- dmu_object_type_t dn_type; /* object type */
- uint16_t dn_bonuslen; /* bonus length */
- uint8_t dn_bonustype; /* bonus type */
- uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
- uint8_t dn_checksum; /* ZIO_CHECKSUM type */
- uint8_t dn_compress; /* ZIO_COMPRESS type */
- uint8_t dn_nlevels;
- uint8_t dn_indblkshift;
- uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
- uint16_t dn_datablkszsec; /* in 512b sectors */
- uint32_t dn_datablksz; /* in bytes */
- uint64_t dn_maxblkid;
- uint8_t dn_next_nlevels[TXG_SIZE];
- uint8_t dn_next_indblkshift[TXG_SIZE];
- uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
-
- /* protected by os_lock: */
- list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
-
- /* protected by dn_mtx: */
- kmutex_t dn_mtx;
- list_t dn_dirty_records[TXG_SIZE];
- avl_tree_t dn_ranges[TXG_SIZE];
- uint64_t dn_allocated_txg;
- uint64_t dn_free_txg;
- uint64_t dn_assigned_txg;
- kcondvar_t dn_notxholds;
- enum dnode_dirtycontext dn_dirtyctx;
- uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
-
- /* protected by own devices */
- refcount_t dn_tx_holds;
- refcount_t dn_holds;
-
- kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* linked list of descendent dbuf_t's */
- struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
-
- /* parent IO for current sync write */
- zio_t *dn_zio;
-
- /* holds prefetch structure */
- struct zfetch dn_zfetch;
-} dnode_t;
-
-typedef struct free_range {
- avl_node_t fr_node;
- uint64_t fr_blkid;
- uint64_t fr_nblks;
-} free_range_t;
-
-dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
- uint64_t object);
-void dnode_special_close(dnode_t *dn);
-
-int dnode_hold(struct objset_impl *dd, uint64_t object,
- void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
- void *ref, dnode_t **dnp);
-void dnode_add_ref(dnode_t *dn, void *ref);
-void dnode_rele(dnode_t *dn, void *ref);
-void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
-void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
-void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-void dnode_free(dnode_t *dn, dmu_tx_t *tx);
-void dnode_byteswap(dnode_phys_t *dnp);
-void dnode_buf_byteswap(void *buf, size_t size);
-void dnode_verify(dnode_t *dn);
-int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
-uint64_t dnode_current_max_length(dnode_t *dn);
-void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
-void dnode_clear_range(dnode_t *dn, uint64_t blkid,
- uint64_t nblks, dmu_tx_t *tx);
-void dnode_diduse_space(dnode_t *dn, int64_t space);
-void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
-void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
-uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
-void dnode_init(void);
-void dnode_fini(void);
-int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
- uint64_t blkfill, uint64_t txg);
-int dnode_evict_dbufs(dnode_t *dn, int try);
-
-#ifdef ZFS_DEBUG
-
-/*
- * There should be a ## between the string literal and fmt, to make it
- * clear that we're joining two strings together, but that piece of shit
- * gcc doesn't support that preprocessor token.
- */
-#define dprintf_dnode(dn, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char __db_buf[32]; \
- uint64_t __db_obj = (dn)->dn_object; \
- if (__db_obj == DMU_META_DNODE_OBJECT) \
- (void) strcpy(__db_buf, "mdn"); \
- else \
- (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
- (u_longlong_t)__db_obj);\
- dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
- __db_buf, __VA_ARGS__); \
- } \
-_NOTE(CONSTCOND) } while (0)
-
-#define DNODE_VERIFY(dn) dnode_verify(dn)
-#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx)
-
-#else
-
-#define dprintf_dnode(db, fmt, ...)
-#define DNODE_VERIFY(dn)
-#define FREE_VERIFY(db, start, end, tx)
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DNODE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
deleted file mode 100644
index 8cfc1dc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_DATASET_H
-#define _SYS_DSL_DATASET_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/bplist.h>
-#include <sys/dsl_synctask.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-struct dsl_dir;
-struct dsl_pool;
-
-typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
-
-#define DS_FLAG_INCONSISTENT (1ULL<<0)
-/*
- * NB: nopromote can not yet be set, but we want support for it in this
- * on-disk version, so that we don't need to upgrade for it later. It
- * will be needed when we implement 'zfs split' (where the split off
- * clone should not be promoted).
- */
-#define DS_FLAG_NOPROMOTE (1ULL<<1)
-
-typedef struct dsl_dataset_phys {
- uint64_t ds_dir_obj;
- uint64_t ds_prev_snap_obj;
- uint64_t ds_prev_snap_txg;
- uint64_t ds_next_snap_obj;
- uint64_t ds_snapnames_zapobj; /* zap obj of snaps; ==0 for snaps */
- uint64_t ds_num_children; /* clone/snap children; ==0 for head */
- uint64_t ds_creation_time; /* seconds since 1970 */
- uint64_t ds_creation_txg;
- uint64_t ds_deadlist_obj;
- uint64_t ds_used_bytes;
- uint64_t ds_compressed_bytes;
- uint64_t ds_uncompressed_bytes;
- uint64_t ds_unique_bytes; /* only relevant to snapshots */
- /*
- * The ds_fsid_guid is a 56-bit ID that can change to avoid
- * collisions. The ds_guid is a 64-bit ID that will never
- * change, so there is a small probability that it will collide.
- */
- uint64_t ds_fsid_guid;
- uint64_t ds_guid;
- uint64_t ds_flags;
- blkptr_t ds_bp;
- uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
-} dsl_dataset_phys_t;
-
-typedef struct dsl_dataset {
- /* Immutable: */
- struct dsl_dir *ds_dir;
- dsl_dataset_phys_t *ds_phys;
- dmu_buf_t *ds_dbuf;
- uint64_t ds_object;
-
- /* only used in syncing context: */
- struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
-
- /* has internal locking: */
- bplist_t ds_deadlist;
-
- /* protected by lock on pool's dp_dirty_datasets list */
- txg_node_t ds_dirty_link;
- list_node_t ds_synced_link;
-
- /*
- * ds_phys->ds_<accounting> is also protected by ds_lock.
- * Protected by ds_lock:
- */
- kmutex_t ds_lock;
- void *ds_user_ptr;
- dsl_dataset_evict_func_t *ds_user_evict_func;
- uint64_t ds_open_refcount;
-
- /* no locking; only for making guesses */
- uint64_t ds_trysnap_txg;
-
- /* Protected by ds_lock; keep at end of struct for better locality */
- char ds_snapname[MAXNAMELEN];
-} dsl_dataset_t;
-
-#define dsl_dataset_is_snapshot(ds) \
- ((ds)->ds_phys->ds_num_children != 0)
-
-int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
- void *tag, dsl_dataset_t **dsp);
-int dsl_dataset_open(const char *name, int mode, void *tag,
- dsl_dataset_t **dsp);
-int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
- const char *tail, int mode, void *tag, dsl_dataset_t **);
-void dsl_dataset_name(dsl_dataset_t *ds, char *name);
-void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
-uint64_t dsl_dataset_create_sync(dsl_dir_t *pds,
- const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
-int dsl_dataset_destroy(const char *name);
-int dsl_snapshots_destroy(char *fsname, char *snapname);
-dsl_checkfunc_t dsl_dataset_snapshot_check;
-dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds);
-int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
-
-void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
- void *p, dsl_dataset_evict_func_t func);
-void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
-
-blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
-void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-
-spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
-
-void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
-
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx);
-int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
-uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
-
-void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
-void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv);
-void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat);
-void dsl_dataset_space(dsl_dataset_t *ds,
- uint64_t *refdbytesp, uint64_t *availbytesp,
- uint64_t *usedobjsp, uint64_t *availobjsp);
-uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
-
-void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
- dmu_tx_t *tx);
-
-int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
-
-#ifdef ZFS_DEBUG
-#define dprintf_ds(ds, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
- dsl_dataset_name(ds, __ds_name); \
- dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, MAXNAMELEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_ds(dd, fmt, ...)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DATASET_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
deleted file mode 100644
index e0595d3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_DIR_H
-#define _SYS_DSL_DIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/dsl_synctask.h>
-#include <sys/refcount.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-
-typedef struct dsl_dir_phys {
- uint64_t dd_creation_time; /* not actually used */
- uint64_t dd_head_dataset_obj;
- uint64_t dd_parent_obj;
- uint64_t dd_clone_parent_obj;
- uint64_t dd_child_dir_zapobj;
- /*
- * how much space our children are accounting for; for leaf
- * datasets, == physical space used by fs + snaps
- */
- uint64_t dd_used_bytes;
- uint64_t dd_compressed_bytes;
- uint64_t dd_uncompressed_bytes;
- /* Administrative quota setting */
- uint64_t dd_quota;
- /* Administrative reservation setting */
- uint64_t dd_reserved;
- uint64_t dd_props_zapobj;
- uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
-} dsl_dir_phys_t;
-
-struct dsl_dir {
- /* These are immutable; no lock needed: */
- uint64_t dd_object;
- dsl_dir_phys_t *dd_phys;
- dmu_buf_t *dd_dbuf;
- dsl_pool_t *dd_pool;
-
- /* protected by lock on pool's dp_dirty_dirs list */
- txg_node_t dd_dirty_link;
-
- /* protected by dp_config_rwlock */
- dsl_dir_t *dd_parent;
-
- /* Protected by dd_lock */
- kmutex_t dd_lock;
- list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
-
- /* Accounting */
- /* reflects any changes to dd_phys->dd_used_bytes made this syncing */
- int64_t dd_used_bytes;
- /* gross estimate of space used by in-flight tx's */
- uint64_t dd_tempreserved[TXG_SIZE];
- /* amount of space we expect to write; == amount of dirty data */
- int64_t dd_space_towrite[TXG_SIZE];
-
- /* protected by dd_lock; keep at end of struct for better locality */
- char dd_myname[MAXNAMELEN];
-};
-
-void dsl_dir_close(dsl_dir_t *dd, void *tag);
-int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
-int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
- const char **tailp);
-int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **);
-void dsl_dir_name(dsl_dir_t *dd, char *buf);
-int dsl_dir_namelen(dsl_dir_t *dd);
-int dsl_dir_is_private(dsl_dir_t *dd);
-uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
-void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
-dsl_checkfunc_t dsl_dir_destroy_check;
-dsl_syncfunc_t dsl_dir_destroy_sync;
-void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
-uint64_t dsl_dir_space_available(dsl_dir_t *dd,
- dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
-void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
-void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
-int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
- uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
-void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
-void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
-void dsl_dir_diduse_space(dsl_dir_t *dd,
- int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
-int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
-
-/* internal reserved dir name */
-#define MOS_DIR_NAME "$MOS"
-
-#ifdef ZFS_DEBUG
-#define dprintf_dd(dd, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
- KM_SLEEP); \
- dsl_dir_name(dd, __ds_name); \
- dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_dd(dd, fmt, ...)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_DIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
deleted file mode 100644
index f7ec67a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_POOL_H
-#define _SYS_DSL_POOL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/txg_impl.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct objset;
-struct dsl_dir;
-
-typedef struct dsl_pool {
- /* Immutable */
- spa_t *dp_spa;
- struct objset *dp_meta_objset;
- struct dsl_dir *dp_root_dir;
- struct dsl_dir *dp_mos_dir;
- uint64_t dp_root_dir_obj;
-
- /* No lock needed - sync context only */
- blkptr_t dp_meta_rootbp;
- list_t dp_synced_objsets;
-
- /* Has its own locking */
- tx_state_t dp_tx;
- txg_list_t dp_dirty_datasets;
- txg_list_t dp_dirty_dirs;
- txg_list_t dp_sync_tasks;
-
- /*
- * Protects administrative changes (properties, namespace)
- * It is only held for write in syncing context. Therefore
- * syncing context does not need to ever have it for read, since
- * nobody else could possibly have it for write.
- */
- krwlock_t dp_config_rwlock;
-} dsl_pool_t;
-
-int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
-void dsl_pool_close(dsl_pool_t *dp);
-dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
-void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
-int dsl_pool_sync_context(dsl_pool_t *dp);
-uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_POOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
deleted file mode 100644
index d2debff..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_PROP_H
-#define _SYS_DSL_PROP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_dataset;
-
-/* The callback func may not call into the DMU or DSL! */
-typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
-
-typedef struct dsl_prop_cb_record {
- list_node_t cbr_node; /* link on dd_prop_cbs */
- struct dsl_dataset *cbr_ds;
- const char *cbr_propname;
- dsl_prop_changed_cb_t *cbr_func;
- void *cbr_arg;
-} dsl_prop_cb_record_t;
-
-int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_numcb(struct dsl_dataset *ds);
-
-int dsl_prop_get(const char *ddname, const char *propname,
- int intsz, int numints, void *buf, char *setpoint);
-int dsl_prop_get_integer(const char *ddname, const char *propname,
- uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
-
-int dsl_prop_set(const char *ddname, const char *propname,
- int intsz, int numints, const void *buf);
-int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
- int intsz, int numints, const void *buf);
-
-void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
-void dsl_prop_nvlist_add_string(nvlist_t *nv,
- zfs_prop_t prop, const char *value);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_PROP_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
deleted file mode 100644
index e695b18..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DSL_SYNCTASK_H
-#define _SYS_DSL_SYNCTASK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/txg.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct dsl_pool;
-
-typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);
-
-typedef struct dsl_sync_task {
- list_node_t dst_node;
- dsl_checkfunc_t *dst_checkfunc;
- dsl_syncfunc_t *dst_syncfunc;
- void *dst_arg1;
- void *dst_arg2;
- int dst_err;
-} dsl_sync_task_t;
-
-typedef struct dsl_sync_task_group {
- txg_node_t dstg_node;
- list_t dstg_tasks;
- struct dsl_pool *dstg_pool;
- uint64_t dstg_txg;
- int dstg_err;
- int dstg_space;
-} dsl_sync_task_group_t;
-
-dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
-void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
- dsl_checkfunc_t *, dsl_syncfunc_t *,
- void *arg1, void *arg2, int blocks_modified);
-int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
-
-int dsl_sync_task_do(struct dsl_pool *dp,
- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
- void *arg1, void *arg2, int blocks_modified);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DSL_SYNCTASK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
deleted file mode 100644
index 095dd3c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_METASLAB_H
-#define _SYS_METASLAB_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/space_map.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
-extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg);
-extern void metaslab_fini(metaslab_t *msp);
-extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-
-extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
- int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid);
-extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
- boolean_t now);
-extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-
-extern metaslab_class_t *metaslab_class_create(void);
-extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
-
-extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
- vdev_t *vd);
-extern void metaslab_group_destroy(metaslab_group_t *mg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_METASLAB_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
deleted file mode 100644
index 5980cbc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_METASLAB_IMPL_H
-#define _SYS_METASLAB_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/metaslab.h>
-#include <sys/space_map.h>
-#include <sys/vdev.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct metaslab_class {
- metaslab_group_t *mc_rotor;
- uint64_t mc_allocated;
-};
-
-struct metaslab_group {
- kmutex_t mg_lock;
- avl_tree_t mg_metaslab_tree;
- uint64_t mg_aliquot;
- int64_t mg_bias;
- metaslab_class_t *mg_class;
- vdev_t *mg_vd;
- metaslab_group_t *mg_prev;
- metaslab_group_t *mg_next;
-};
-
-/*
- * Each metaslab's free space is tracked in space map object in the MOS,
- * which is only updated in syncing context. Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map object.
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
- */
-struct metaslab {
- kmutex_t ms_lock; /* metaslab lock */
- space_map_obj_t ms_smo; /* synced space map object */
- space_map_obj_t ms_smo_syncing; /* syncing space map object */
- space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */
- space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */
- space_map_t ms_map; /* in-core free space map */
- uint64_t ms_weight; /* weight vs. others in group */
- metaslab_group_t *ms_group; /* metaslab group */
- avl_node_t ms_group_node; /* node in metaslab group tree */
- txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
deleted file mode 100644
index 4de1cae..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_REFCOUNT_H
-#define _SYS_REFCOUNT_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/list.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * If the reference is held only by the calling function and not any
- * particular object, use FTAG (which is a string) for the holder_tag.
- * Otherwise, use the object that holds the reference.
- */
-#define FTAG ((char *)__func__)
-
-#if defined(DEBUG) || !defined(_KERNEL)
-typedef struct reference {
- list_node_t ref_link;
- void *ref_holder;
- uint64_t ref_number;
- uint8_t *ref_removed;
-} reference_t;
-
-typedef struct refcount {
- kmutex_t rc_mtx;
- list_t rc_list;
- list_t rc_removed;
- int64_t rc_count;
- int64_t rc_removed_count;
-} refcount_t;
-
-/* Note: refcount_t should be initialized to zero before use. */
-
-void refcount_create(refcount_t *rc);
-void refcount_destroy(refcount_t *rc);
-void refcount_destroy_many(refcount_t *rc, uint64_t number);
-int refcount_is_zero(refcount_t *rc);
-int64_t refcount_count(refcount_t *rc);
-int64_t refcount_add(refcount_t *rc, void *holder_tag);
-int64_t refcount_remove(refcount_t *rc, void *holder_tag);
-int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
-int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
-
-void refcount_init(void);
-void refcount_fini(void);
-
-#else /* DEBUG */
-
-typedef struct refcount {
- uint64_t rc_count;
-} refcount_t;
-
-#define refcount_create(rc) ((rc)->rc_count = 0)
-#define refcount_destroy(rc) ((rc)->rc_count = 0)
-#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
-#define refcount_is_zero(rc) ((rc)->rc_count == 0)
-#define refcount_count(rc) ((rc)->rc_count)
-#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
-#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
-#define refcount_add_many(rc, number, holder) \
- atomic_add_64_nv(&(rc)->rc_count, number)
-#define refcount_remove_many(rc, number, holder) \
- atomic_add_64_nv(&(rc)->rc_count, -number)
-
-#define refcount_init()
-#define refcount_fini()
-
-#endif /* DEBUG */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_REFCOUNT_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
deleted file mode 100644
index f0eb2e1..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SPA_H
-#define _SYS_SPA_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/avl.h>
-#include <sys/zfs_context.h>
-#include <sys/nvpair.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/fs/zfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Forward references that lots of things need.
- */
-typedef struct spa spa_t;
-typedef struct vdev vdev_t;
-typedef struct metaslab metaslab_t;
-typedef struct zilog zilog_t;
-typedef struct traverse_handle traverse_handle_t;
-struct dsl_pool;
-
-/*
- * General-purpose 32-bit and 64-bit bitfield encodings.
- */
-#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len))
-#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len))
-#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low))
-#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low))
-
-#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
-#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
-
-#define BF32_SET(x, low, len, val) \
- ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
-#define BF64_SET(x, low, len, val) \
- ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
-
-#define BF32_GET_SB(x, low, len, shift, bias) \
- ((BF32_GET(x, low, len) + (bias)) << (shift))
-#define BF64_GET_SB(x, low, len, shift, bias) \
- ((BF64_GET(x, low, len) + (bias)) << (shift))
-
-#define BF32_SET_SB(x, low, len, shift, bias, val) \
- BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
-#define BF64_SET_SB(x, low, len, shift, bias, val) \
- BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
-
-/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
- */
-#define SPA_MINBLOCKSHIFT 9
-#define SPA_MAXBLOCKSHIFT 17
-#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
-#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
-/*
- * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
- * The ASIZE encoding should be at least 64 times larger (6 more bits)
- * to support up to 4-way RAID-Z mirror mode with worst-case gang block
- * overhead, three DVAs per bp, plus one more bit in case we do anything
- * else that expands the ASIZE.
- */
-#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */
-#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
-#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
-
-/*
- * All SPA data is represented by 128-bit data virtual addresses (DVAs).
- * The members of the dva_t should be considered opaque outside the SPA.
- */
-typedef struct dva {
- uint64_t dva_word[2];
-} dva_t;
-
-/*
- * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
- */
-typedef struct zio_cksum {
- uint64_t zc_word[4];
-} zio_cksum_t;
-
-/*
- * Each block is described by its DVAs, time of birth, checksum, etc.
- * The word-by-word, bit-by-bit layout of the blkptr is as follows:
- *
- * 64 56 48 40 32 24 16 8 0
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 0 | vdev1 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 1 |G| offset1 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 2 | vdev2 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 3 |G| offset2 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 4 | vdev3 | GRID | ASIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 5 |G| offset3 |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 7 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 8 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * b | fill count |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * c | checksum[0] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * d | checksum[1] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * e | checksum[2] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * f | checksum[3] |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- *
- * Legend:
- *
- * vdev virtual device ID
- * offset offset into virtual device
- * LSIZE logical size
- * PSIZE physical size (after compression)
- * ASIZE allocated size (including RAID-Z parity and gang block headers)
- * GRID RAID-Z layout information (reserved for future use)
- * cksum checksum function
- * comp compression function
- * G gang block indicator
- * E endianness
- * type DMU object type
- * lvl level of indirection
- * birth txg transaction group in which the block was born
- * fill count number of non-zero blocks under this bp
- * checksum[4] 256-bit checksum of the data this bp describes
- */
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
-#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
-#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
-
-/*
- * Macros to get and set fields in a bp or DVA.
- */
-#define DVA_GET_ASIZE(dva) \
- BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
-#define DVA_SET_ASIZE(dva, x) \
- BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
-
-#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
-#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
-
-#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
-#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
-
-#define DVA_GET_OFFSET(dva) \
- BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
-#define DVA_SET_OFFSET(dva, x) \
- BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
-
-#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1)
-#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
-
-#define BP_GET_LSIZE(bp) \
- (BP_IS_HOLE(bp) ? 0 : \
- BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
-#define BP_SET_LSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define BP_GET_PSIZE(bp) \
- BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
-#define BP_SET_PSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
-
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
-
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
-
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
-
-#define BP_GET_ASIZE(bp) \
- (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-
-#define BP_GET_UCSIZE(bp) \
- ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
-
-#define BP_GET_NDVAS(bp) \
- (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-
-#define BP_COUNT_GANG(bp) \
- (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
- DVA_GET_GANG(&(bp)->blk_dva[1]) + \
- DVA_GET_GANG(&(bp)->blk_dva[2]))
-
-#define DVA_EQUAL(dva1, dva2) \
- ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
- (dva1)->dva_word[0] == (dva2)->dva_word[0])
-
-#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
- (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
- ((zc1).zc_word[1] - (zc2).zc_word[1]) | \
- ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
- ((zc1).zc_word[3] - (zc2).zc_word[3])))
-
-
-#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
-
-#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
-{ \
- (zcp)->zc_word[0] = w0; \
- (zcp)->zc_word[1] = w1; \
- (zcp)->zc_word[2] = w2; \
- (zcp)->zc_word[3] = w3; \
-}
-
-#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
-#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
-#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
-
-#define BP_ZERO(bp) \
-{ \
- (bp)->blk_dva[0].dva_word[0] = 0; \
- (bp)->blk_dva[0].dva_word[1] = 0; \
- (bp)->blk_dva[1].dva_word[0] = 0; \
- (bp)->blk_dva[1].dva_word[1] = 0; \
- (bp)->blk_dva[2].dva_word[0] = 0; \
- (bp)->blk_dva[2].dva_word[1] = 0; \
- (bp)->blk_prop = 0; \
- (bp)->blk_pad[0] = 0; \
- (bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
- (bp)->blk_birth = 0; \
- (bp)->blk_fill = 0; \
- ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
-}
-
-/*
- * Note: the byteorder is either 0 or -1, both of which are palindromes.
- * This simplifies the endianness handling a bit.
- */
-#if BYTE_ORDER == _BIG_ENDIAN
-#define ZFS_HOST_BYTEORDER (0ULL)
-#else
-#define ZFS_HOST_BYTEORDER (-1ULL)
-#endif
-
-#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
-
-#define BP_SPRINTF_LEN 320
-
-#include <sys/dmu.h>
-
-#define BP_GET_BUFC_TYPE(bp) \
- (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
- ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
-
-/* state manipulation functions */
-extern int spa_open(const char *pool, spa_t **, void *tag);
-extern int spa_get_stats(const char *pool, nvlist_t **config,
- char *altroot, size_t buflen);
-extern int spa_create(const char *pool, nvlist_t *config, const char *altroot);
-extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
-extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
-extern int spa_destroy(char *pool);
-extern int spa_export(char *pool, nvlist_t **oldconfig);
-extern int spa_reset(char *pool);
-extern void spa_async_request(spa_t *spa, int flag);
-extern void spa_async_suspend(spa_t *spa);
-extern void spa_async_resume(spa_t *spa);
-extern spa_t *spa_inject_addref(char *pool);
-extern void spa_inject_delref(spa_t *spa);
-
-#define SPA_ASYNC_REOPEN 0x01
-#define SPA_ASYNC_REPLACE_DONE 0x02
-#define SPA_ASYNC_SCRUB 0x04
-#define SPA_ASYNC_RESILVER 0x08
-#define SPA_ASYNC_CONFIG_UPDATE 0x10
-
-/* device manipulation */
-extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
- int replacing);
-extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
-extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
-extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
-
-/* spare state (which is global across all pools) */
-extern void spa_spare_add(vdev_t *vd);
-extern void spa_spare_remove(vdev_t *vd);
-extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool);
-extern void spa_spare_activate(vdev_t *vd);
-
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
-extern void spa_scrub_suspend(spa_t *spa);
-extern void spa_scrub_resume(spa_t *spa);
-extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
-
-/* spa syncing */
-extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
-extern void spa_sync_allpools(void);
-
-/*
- * SPA configuration functions in spa_config.c
- */
-
-#define SPA_CONFIG_UPDATE_POOL 0
-#define SPA_CONFIG_UPDATE_VDEVS 1
-
-extern void spa_config_sync(void);
-extern void spa_config_load(void);
-extern nvlist_t *spa_all_configs(uint64_t *);
-extern void spa_config_set(spa_t *spa, nvlist_t *config);
-extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
- int getstats);
-extern void spa_config_update(spa_t *spa, int what);
-
-/*
- * Miscellaneous SPA routines in spa_misc.c
- */
-
-/* Namespace manipulation */
-extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
-extern void spa_remove(spa_t *spa);
-extern spa_t *spa_next(spa_t *prev);
-
-/* Refcount functions */
-extern void spa_open_ref(spa_t *spa, void *tag);
-extern void spa_close(spa_t *spa, void *tag);
-extern boolean_t spa_refcount_zero(spa_t *spa);
-
-/* Pool configuration lock */
-extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag);
-extern void spa_config_exit(spa_t *spa, void *tag);
-extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
-
-/* Pool vdev add/remove lock */
-extern uint64_t spa_vdev_enter(spa_t *spa);
-extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
-
-/* Accessor functions */
-extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
-extern int spa_traverse_wanted(spa_t *spa);
-extern struct dsl_pool *spa_get_dsl(spa_t *spa);
-extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
-extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
-extern void spa_altroot(spa_t *, char *, size_t);
-extern int spa_sync_pass(spa_t *spa);
-extern char *spa_name(spa_t *spa);
-extern uint64_t spa_guid(spa_t *spa);
-extern uint64_t spa_last_synced_txg(spa_t *spa);
-extern uint64_t spa_first_txg(spa_t *spa);
-extern uint64_t spa_version(spa_t *spa);
-extern int spa_state(spa_t *spa);
-extern uint64_t spa_freeze_txg(spa_t *spa);
-struct metaslab_class;
-extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
-extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
-extern uint64_t spa_version(spa_t *spa);
-extern int spa_max_replication(spa_t *spa);
-extern int spa_busy(void);
-
-/* Miscellaneous support routines */
-extern int spa_rename(const char *oldname, const char *newname);
-extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
-extern char *spa_strdup(const char *);
-extern void spa_strfree(char *);
-extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
-extern void spa_freeze(spa_t *spa);
-extern void spa_upgrade(spa_t *spa);
-extern void spa_evict_all(void);
-extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid);
-extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
-
-/* history logging */
-extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
-extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
- char *his_buf);
-extern int spa_history_log(spa_t *spa, const char *his_buf,
- uint64_t pool_create);
-
-/* error handling */
-struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
-extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t stateoroffset, uint64_t length);
-extern void zfs_post_ok(spa_t *spa, vdev_t *vd);
-extern uint64_t spa_get_errlog_size(spa_t *spa);
-extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
-extern void spa_errlog_rotate(spa_t *spa);
-extern void spa_errlog_drain(spa_t *spa);
-extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
-extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
-
-/* Initialization and termination */
-extern void spa_init(int flags);
-extern void spa_fini(void);
-
-/* properties */
-extern int spa_set_props(spa_t *spa, nvlist_t *nvp);
-extern int spa_get_props(spa_t *spa, nvlist_t **nvp);
-extern void spa_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
-extern boolean_t spa_has_bootfs(spa_t *spa);
-
-#ifdef ZFS_DEBUG
-#define dprintf_bp(bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
- dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
- kmem_free(__blkbuf, BP_SPRINTF_LEN); \
- } \
-_NOTE(CONSTCOND) } while (0)
-#else
-#define dprintf_bp(bp, fmt, ...)
-#endif
-
-extern int spa_mode; /* mode, e.g. FREAD | FWRITE */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
deleted file mode 100644
index 8c57123..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SPA_IMPL_H
-#define _SYS_SPA_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/vdev.h>
-#include <sys/metaslab.h>
-#include <sys/dmu.h>
-#include <sys/dsl_pool.h>
-#include <sys/uberblock_impl.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/refcount.h>
-#include <sys/bplist.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct spa_config_lock {
- kmutex_t scl_lock;
- refcount_t scl_count;
- kthread_t *scl_writer;
- kcondvar_t scl_cv;
-} spa_config_lock_t;
-
-typedef struct spa_error_entry {
- zbookmark_t se_bookmark;
- char *se_name;
- avl_node_t se_avl;
-} spa_error_entry_t;
-
-typedef struct spa_history_phys {
- uint64_t sh_pool_create_len; /* ending offset of zpool create */
- uint64_t sh_phys_max_off; /* physical EOF */
- uint64_t sh_bof; /* logical BOF */
- uint64_t sh_eof; /* logical EOF */
- uint64_t sh_records_lost; /* num of records overwritten */
-} spa_history_phys_t;
-
-typedef struct spa_props {
- nvlist_t *spa_props_nvp;
- list_node_t spa_list_node;
-} spa_props_t;
-
-struct spa {
- /*
- * Fields protected by spa_namespace_lock.
- */
- char *spa_name; /* pool name */
- avl_node_t spa_avl; /* node in spa_namespace_avl */
- nvlist_t *spa_config; /* last synced config */
- nvlist_t *spa_config_syncing; /* currently syncing config */
- uint64_t spa_config_txg; /* txg of last config change */
- kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
- int spa_sync_pass; /* iterate-to-convergence */
- int spa_state; /* pool state */
- int spa_inject_ref; /* injection references */
- uint8_t spa_traverse_wanted; /* traverse lock wanted */
- uint8_t spa_sync_on; /* sync threads are running */
- spa_load_state_t spa_load_state; /* current load operation */
- taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
- taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
- dsl_pool_t *spa_dsl_pool;
- metaslab_class_t *spa_normal_class; /* normal data class */
- uint64_t spa_first_txg; /* first txg after spa_open() */
- uint64_t spa_final_txg; /* txg of export/destroy */
- uint64_t spa_freeze_txg; /* freeze pool at this txg */
- objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
- txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
- vdev_t *spa_root_vdev; /* top-level vdev container */
- uint64_t spa_load_guid; /* initial guid for spa_load */
- list_t spa_dirty_list; /* vdevs with dirty labels */
- uint64_t spa_spares_object; /* MOS object for spare list */
- nvlist_t *spa_sparelist; /* cached spare config */
- vdev_t **spa_spares; /* available hot spares */
- int spa_nspares; /* number of hot spares */
- boolean_t spa_sync_spares; /* sync the spares list */
- uint64_t spa_config_object; /* MOS object for pool config */
- uint64_t spa_syncing_txg; /* txg currently syncing */
- uint64_t spa_sync_bplist_obj; /* object for deferred frees */
- bplist_t spa_sync_bplist; /* deferred-free bplist */
- krwlock_t spa_traverse_lock; /* traverse vs. spa_sync() */
- uberblock_t spa_ubsync; /* last synced uberblock */
- uberblock_t spa_uberblock; /* current uberblock */
- kmutex_t spa_scrub_lock; /* resilver/scrub lock */
- kthread_t *spa_scrub_thread; /* scrub/resilver thread */
- traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
- uint64_t spa_scrub_restart_txg; /* need to restart */
- uint64_t spa_scrub_mintxg; /* min txg we'll scrub */
- uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
- uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
- uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */
- uint64_t spa_scrub_errors; /* scrub I/O error count */
- int spa_scrub_suspended; /* tell scrubber to suspend */
- kcondvar_t spa_scrub_cv; /* scrub thread state change */
- kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
- uint8_t spa_scrub_stop; /* tell scrubber to stop */
- uint8_t spa_scrub_active; /* active or suspended? */
- uint8_t spa_scrub_type; /* type of scrub we're doing */
- uint8_t spa_scrub_finished; /* indicator to rotate logs */
- kmutex_t spa_async_lock; /* protect async state */
- kthread_t *spa_async_thread; /* thread doing async task */
- int spa_async_suspended; /* async tasks suspended */
- kcondvar_t spa_async_cv; /* wait for thread_exit() */
- uint16_t spa_async_tasks; /* async task mask */
- char *spa_root; /* alternate root directory */
- kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
- uint64_t spa_ena; /* spa-wide ereport ENA */
- boolean_t spa_last_open_failed; /* true if last open faled */
- kmutex_t spa_errlog_lock; /* error log lock */
- uint64_t spa_errlog_last; /* last error log object */
- uint64_t spa_errlog_scrub; /* scrub error log object */
- kmutex_t spa_errlist_lock; /* error list/ereport lock */
- avl_tree_t spa_errlist_last; /* last error list */
- avl_tree_t spa_errlist_scrub; /* scrub error list */
- uint64_t spa_deflate; /* should we deflate? */
- uint64_t spa_history; /* history object */
- kmutex_t spa_history_lock; /* history lock */
- vdev_t *spa_pending_vdev; /* pending vdev additions */
- nvlist_t **spa_pending_spares; /* pending spare additions */
- uint_t spa_pending_nspares; /* # pending spares */
- kmutex_t spa_props_lock; /* property lock */
- uint64_t spa_pool_props_object; /* object for properties */
- uint64_t spa_bootfs; /* default boot filesystem */
- /*
- * spa_refcnt must be the last element because it changes size based on
- * compilation options. In order for the MDB module to function
- * correctly, the other fields must remain in the same location.
- */
- spa_config_lock_t spa_config_lock; /* configuration changes */
- refcount_t spa_refcount; /* number of opens */
-};
-
-extern const char *spa_config_dir;
-extern kmutex_t spa_namespace_lock;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPA_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
deleted file mode 100644
index db9daef..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SPACE_MAP_H
-#define _SYS_SPACE_MAP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/avl.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct space_map_ops space_map_ops_t;
-
-typedef struct space_map {
- avl_tree_t sm_root; /* AVL tree of map segments */
- uint64_t sm_space; /* sum of all segments in the map */
- uint64_t sm_start; /* start of map */
- uint64_t sm_size; /* size of map */
- uint8_t sm_shift; /* unit shift */
- uint8_t sm_pad[3]; /* unused */
- uint8_t sm_loaded; /* map loaded? */
- uint8_t sm_loading; /* map loading? */
- kcondvar_t sm_load_cv; /* map load completion */
- space_map_ops_t *sm_ops; /* space map block picker ops vector */
- void *sm_ppd; /* picker-private data */
- kmutex_t *sm_lock; /* pointer to lock that protects map */
-} space_map_t;
-
-typedef struct space_seg {
- avl_node_t ss_node; /* AVL node */
- uint64_t ss_start; /* starting offset of this segment */
- uint64_t ss_end; /* ending offset (non-inclusive) */
-} space_seg_t;
-
-typedef struct space_map_obj {
- uint64_t smo_object; /* on-disk space map object */
- uint64_t smo_objsize; /* size of the object */
- uint64_t smo_alloc; /* space allocated from the map */
-} space_map_obj_t;
-
-struct space_map_ops {
- void (*smop_load)(space_map_t *sm);
- void (*smop_unload)(space_map_t *sm);
- uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
- void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
- void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
-};
-
-/*
- * debug entry
- *
- * 1 3 10 50
- * ,---+--------+------------+---------------------------------.
- * | 1 | action | syncpass | txg (lower bits) |
- * `---+--------+------------+---------------------------------'
- * 63 62 60 59 50 49 0
- *
- *
- *
- * non-debug entry
- *
- * 1 47 1 15
- * ,-----------------------------------------------------------.
- * | 0 | offset (sm_shift units) | type | run |
- * `-----------------------------------------------------------'
- * 63 62 17 16 15 0
- */
-
-/* All this stuff takes and returns bytes */
-#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
-#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
-#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
-#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
-#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
-#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
-#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
-#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
-
-#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
-#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
-
-#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
-#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
-
-#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
-#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
-
-#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
-
-#define SM_ALLOC 0x0
-#define SM_FREE 0x1
-
-/*
- * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
- * when only a few blocks have changed since the last transaction group.
- * This could use a lot more research, but for now, set the freelist
- * block size to 4k (2^12).
- */
-#define SPACE_MAP_BLOCKSHIFT 12
-
-typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
-
-extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
- uint8_t shift, kmutex_t *lp);
-extern void space_map_destroy(space_map_t *sm);
-extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_vacate(space_map_t *sm,
- space_map_func_t *func, space_map_t *mdest);
-extern void space_map_walk(space_map_t *sm,
- space_map_func_t *func, space_map_t *mdest);
-extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_union(space_map_t *smd, space_map_t *sms);
-
-extern void space_map_load_wait(space_map_t *sm);
-extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
- uint8_t maptype, space_map_obj_t *smo, objset_t *os);
-extern void space_map_unload(space_map_t *sm);
-
-extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
-extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
-
-extern void space_map_sync(space_map_t *sm, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
-extern void space_map_truncate(space_map_obj_t *smo,
- objset_t *os, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SPACE_MAP_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
deleted file mode 100644
index dae129c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_TXG_H
-#define _SYS_TXG_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */
-#define TXG_SIZE 4 /* next power of 2 */
-#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
-#define TXG_INITIAL TXG_SIZE /* initial txg */
-#define TXG_IDX (txg & TXG_MASK)
-
-#define TXG_WAIT 1ULL
-#define TXG_NOWAIT 2ULL
-
-typedef struct tx_cpu tx_cpu_t;
-
-typedef struct txg_handle {
- tx_cpu_t *th_cpu;
- uint64_t th_txg;
-} txg_handle_t;
-
-typedef struct txg_node {
- struct txg_node *tn_next[TXG_SIZE];
- uint8_t tn_member[TXG_SIZE];
-} txg_node_t;
-
-typedef struct txg_list {
- kmutex_t tl_lock;
- size_t tl_offset;
- txg_node_t *tl_head[TXG_SIZE];
-} txg_list_t;
-
-struct dsl_pool;
-
-extern void txg_init(struct dsl_pool *dp, uint64_t txg);
-extern void txg_fini(struct dsl_pool *dp);
-extern void txg_sync_start(struct dsl_pool *dp);
-extern void txg_sync_stop(struct dsl_pool *dp);
-extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
-extern void txg_rele_to_quiesce(txg_handle_t *txghp);
-extern void txg_rele_to_sync(txg_handle_t *txghp);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
-
-/*
- * Wait until the given transaction group has finished syncing.
- * Try to make this happen as soon as possible (eg. kick off any
- * necessary syncs immediately). If txg==0, wait for the currently open
- * txg to finish syncing.
- */
-extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Wait until the given transaction group, or one after it, is
- * the open transaction group. Try to make this happen as soon
- * as possible (eg. kick off any necessary syncs immediately).
- * If txg == 0, wait for the next open txg.
- */
-extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
-
-/*
- * Returns TRUE if we are "backed up" waiting for the syncing
- * transaction to complete; otherwise returns FALSE.
- */
-extern int txg_stalled(struct dsl_pool *dp);
-
-/*
- * Per-txg object lists.
- */
-
-#define TXG_CLEAN(txg) ((txg) - 1)
-
-extern void txg_list_create(txg_list_t *tl, size_t offset);
-extern void txg_list_destroy(txg_list_t *tl);
-extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
-extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
-extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
-extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
-extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
-extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
-extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TXG_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
deleted file mode 100644
index 45a138a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_TXG_IMPL_H
-#define _SYS_TXG_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/txg.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct tx_cpu {
- kmutex_t tc_lock;
- kcondvar_t tc_cv[TXG_SIZE];
- uint64_t tc_count[TXG_SIZE];
- char tc_pad[16];
-};
-
-typedef struct tx_state {
- tx_cpu_t *tx_cpu; /* protects right to enter txg */
- kmutex_t tx_sync_lock; /* protects tx_state_t */
- krwlock_t tx_suspend;
- uint64_t tx_open_txg; /* currently open txg id */
- uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
- uint64_t tx_syncing_txg; /* currently syncing txg id */
- uint64_t tx_synced_txg; /* last synced txg id */
-
- uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
- uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
-
- kcondvar_t tx_sync_more_cv;
- kcondvar_t tx_sync_done_cv;
- kcondvar_t tx_quiesce_more_cv;
- kcondvar_t tx_quiesce_done_cv;
- kcondvar_t tx_timeout_exit_cv;
- kcondvar_t tx_exit_cv; /* wait for all threads to exit */
-
- uint8_t tx_threads; /* number of threads */
- uint8_t tx_exiting; /* set when we're exiting */
-
- kthread_t *tx_sync_thread;
- kthread_t *tx_quiesce_thread;
- kthread_t *tx_timelimit_thread;
-} tx_state_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_TXG_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
deleted file mode 100644
index 93d936a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UBERBLOCK_H
-#define _SYS_UBERBLOCK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/vdev.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct uberblock uberblock_t;
-
-extern int uberblock_verify(uberblock_t *ub);
-extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UBERBLOCK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
deleted file mode 100644
index ab0f2dc..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UBERBLOCK_IMPL_H
-#define _SYS_UBERBLOCK_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/uberblock.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The uberblock version is incremented whenever an incompatible on-disk
- * format change is made to the SPA, DMU, or ZAP.
- *
- * Note: the first two fields should never be moved. When a storage pool
- * is opened, the uberblock must be read off the disk before the version
- * can be checked. If the ub_version field is moved, we may not detect
- * version mismatch. If the ub_magic field is moved, applications that
- * expect the magic number in the first word won't work.
- */
-#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
-#define UBERBLOCK_SHIFT 10 /* up to 1K */
-
-struct uberblock {
- uint64_t ub_magic; /* UBERBLOCK_MAGIC */
- uint64_t ub_version; /* ZFS_VERSION */
- uint64_t ub_txg; /* txg of last sync */
- uint64_t ub_guid_sum; /* sum of all vdev guids */
- uint64_t ub_timestamp; /* UTC time of last sync */
- blkptr_t ub_rootbp; /* MOS objset_phys_t */
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UBERBLOCK_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
deleted file mode 100644
index c8c177e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_UNIQUE_H
-#define _SYS_UNIQUE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* The number of significant bits in each unique value. */
-#define UNIQUE_BITS 56
-
-void unique_init(void);
-
-/* Return a new unique value. */
-uint64_t unique_create(void);
-
-/* Return a unique value, which equals the one passed in if possible. */
-uint64_t unique_insert(uint64_t value);
-
-/* Indicate that this value no longer needs to be uniquified against. */
-void unique_remove(uint64_t value);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_UNIQUE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
deleted file mode 100644
index 3120811..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_H
-#define _SYS_VDEV_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-#include <sys/space_map.h>
-#include <sys/fs/zfs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern boolean_t zfs_nocacheflush;
-
-/*
- * Fault injection modes.
- */
-#define VDEV_FAULT_NONE 0
-#define VDEV_FAULT_RANDOM 1
-#define VDEV_FAULT_COUNT 2
-
-extern int vdev_open(vdev_t *);
-extern int vdev_validate(vdev_t *);
-extern void vdev_close(vdev_t *);
-extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
-extern void vdev_init(vdev_t *, uint64_t txg);
-extern void vdev_reopen(vdev_t *);
-extern int vdev_validate_spare(vdev_t *);
-
-extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
-extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
-extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
-extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
-extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
- int scrub_done);
-
-extern const char *vdev_description(vdev_t *vd);
-
-extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
-extern void vdev_metaslab_fini(vdev_t *vd);
-
-extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
-extern void vdev_stat_update(zio_t *zio);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
- boolean_t complete);
-extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
-extern void vdev_propagate_state(vdev_t *vd);
-extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
- vdev_aux_t aux);
-
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
- int64_t alloc_delta);
-
-extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
-
-extern void vdev_io_start(zio_t *zio);
-extern void vdev_io_done(zio_t *zio);
-
-extern int vdev_online(spa_t *spa, uint64_t guid);
-extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp);
-extern void vdev_clear(spa_t *spa, vdev_t *vd);
-
-extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
-extern int vdev_is_dead(vdev_t *vd);
-
-extern void vdev_cache_init(vdev_t *vd);
-extern void vdev_cache_fini(vdev_t *vd);
-extern int vdev_cache_read(zio_t *zio);
-extern void vdev_cache_write(zio_t *zio);
-
-extern void vdev_queue_init(vdev_t *vd);
-extern void vdev_queue_fini(vdev_t *vd);
-extern zio_t *vdev_queue_io(zio_t *zio);
-extern void vdev_queue_io_done(zio_t *zio);
-
-extern void vdev_config_dirty(vdev_t *vd);
-extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t *vd, uint64_t txg);
-
-extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, boolean_t isspare);
-
-/*
- * Label routines
- */
-struct uberblock;
-extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd);
-extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
-
-typedef enum {
- VDEV_LABEL_CREATE, /* create/add a new device */
- VDEV_LABEL_REPLACE, /* replace an existing device */
- VDEV_LABEL_SPARE, /* add a new hot spare */
- VDEV_LABEL_REMOVE /* remove an existing device */
-} vdev_labeltype_t;
-
-extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
deleted file mode 100644
index 95536a7..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_DISK_H
-#define _SYS_VDEV_DISK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vdev.h>
-#ifdef _KERNEL
-#include <sys/sunldi.h>
-#include <sys/sunddi.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_disk {
- ddi_devid_t vd_devid;
- char *vd_minor;
- ldi_handle_t vd_lh;
-} vdev_disk_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_DISK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
deleted file mode 100644
index cd49673..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_FILE_H
-#define _SYS_VDEV_FILE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vdev.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct vdev_file {
- vnode_t *vf_vnode;
-} vdev_file_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_FILE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
deleted file mode 100644
index aba7567..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VDEV_IMPL_H
-#define _SYS_VDEV_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/avl.h>
-#include <sys/dmu.h>
-#include <sys/metaslab.h>
-#include <sys/nvpair.h>
-#include <sys/space_map.h>
-#include <sys/vdev.h>
-#include <sys/dkio.h>
-#include <sys/uberblock_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Virtual device descriptors.
- *
- * All storage pool operations go through the virtual device framework,
- * which provides data replication and I/O scheduling.
- */
-
-/*
- * Forward declarations that lots of things need.
- */
-typedef struct vdev_queue vdev_queue_t;
-typedef struct vdev_cache vdev_cache_t;
-typedef struct vdev_cache_entry vdev_cache_entry_t;
-
-/*
- * Virtual device operations
- */
-typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
-typedef void vdev_close_func_t(vdev_t *vd);
-typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef void vdev_io_start_func_t(zio_t *zio);
-typedef void vdev_io_done_func_t(zio_t *zio);
-typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
-
-typedef struct vdev_ops {
- vdev_open_func_t *vdev_op_open;
- vdev_close_func_t *vdev_op_close;
- vdev_asize_func_t *vdev_op_asize;
- vdev_io_start_func_t *vdev_op_io_start;
- vdev_io_done_func_t *vdev_op_io_done;
- vdev_state_change_func_t *vdev_op_state_change;
- char vdev_op_type[16];
- boolean_t vdev_op_leaf;
-} vdev_ops_t;
-
-/*
- * Virtual device properties
- */
-struct vdev_cache_entry {
- char *ve_data;
- uint64_t ve_offset;
- uint64_t ve_lastused;
- avl_node_t ve_offset_node;
- avl_node_t ve_lastused_node;
- uint32_t ve_hits;
- uint16_t ve_missed_update;
- zio_t *ve_fill_io;
-};
-
-struct vdev_cache {
- avl_tree_t vc_offset_tree;
- avl_tree_t vc_lastused_tree;
- kmutex_t vc_lock;
-};
-
-struct vdev_queue {
- avl_tree_t vq_deadline_tree;
- avl_tree_t vq_read_tree;
- avl_tree_t vq_write_tree;
- avl_tree_t vq_pending_tree;
- kmutex_t vq_lock;
-};
-
-/*
- * Virtual device descriptor
- */
-struct vdev {
- /*
- * Common to all vdev types.
- */
- uint64_t vdev_id; /* child number in vdev parent */
- uint64_t vdev_guid; /* unique ID for this vdev */
- uint64_t vdev_guid_sum; /* self guid + all child guids */
- uint64_t vdev_asize; /* allocatable device capacity */
- uint64_t vdev_ashift; /* block alignment shift */
- uint64_t vdev_state; /* see VDEV_STATE_* #defines */
- uint64_t vdev_prevstate; /* used when reopening a vdev */
- vdev_ops_t *vdev_ops; /* vdev operations */
- spa_t *vdev_spa; /* spa for this vdev */
- void *vdev_tsd; /* type-specific data */
- vdev_t *vdev_top; /* top-level vdev */
- vdev_t *vdev_parent; /* parent vdev */
- vdev_t **vdev_child; /* array of children */
- uint64_t vdev_children; /* number of children */
- space_map_t vdev_dtl_map; /* dirty time log in-core state */
- space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */
- vdev_stat_t vdev_stat; /* virtual device statistics */
-
- /*
- * Top-level vdev state.
- */
- uint64_t vdev_ms_array; /* metaslab array object */
- uint64_t vdev_ms_shift; /* metaslab size shift */
- uint64_t vdev_ms_count; /* number of metaslabs */
- metaslab_group_t *vdev_mg; /* metaslab group */
- metaslab_t **vdev_ms; /* metaslab array */
- txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
- txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
- txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
- uint8_t vdev_reopen_wanted; /* async reopen wanted? */
- list_node_t vdev_dirty_node; /* config dirty list */
- uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
-
- /*
- * Leaf vdev state.
- */
- uint64_t vdev_psize; /* physical device capacity */
- space_map_obj_t vdev_dtl; /* dirty time log on-disk state */
- txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
- uint64_t vdev_wholedisk; /* true if this is a whole disk */
- uint64_t vdev_offline; /* device taken offline? */
- uint64_t vdev_nparity; /* number of parity devices for raidz */
- char *vdev_path; /* vdev path (if any) */
- char *vdev_devid; /* vdev devid (if any) */
- uint64_t vdev_fault_arg; /* fault injection paramater */
- int vdev_fault_mask; /* zio types to fault */
- uint8_t vdev_fault_mode; /* fault injection mode */
- uint8_t vdev_cache_active; /* vdev_cache and vdev_queue */
- uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
- uint8_t vdev_detached; /* device detached? */
- uint64_t vdev_isspare; /* was a hot spare */
- vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
- vdev_cache_t vdev_cache; /* physical block cache */
- uint64_t vdev_not_present; /* not present during import */
- hrtime_t vdev_last_try; /* last reopen time */
- boolean_t vdev_nowritecache; /* true if flushwritecache failed */
-
- /*
- * For DTrace to work in userland (libzpool) context, these fields must
- * remain at the end of the structure. DTrace will use the kernel's
- * CTF definition for 'struct vdev', and since the size of a kmutex_t is
- * larger in userland, the offsets for the rest fields would be
- * incorrect.
- */
- kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
- kmutex_t vdev_stat_lock; /* vdev_stat */
-};
-
-#define VDEV_SKIP_SIZE (8 << 10)
-#define VDEV_BOOT_HEADER_SIZE (8 << 10)
-#define VDEV_PHYS_SIZE (112 << 10)
-#define VDEV_UBERBLOCK_RING (128 << 10)
-
-#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
-#define VDEV_UBERBLOCK_COUNT(vd) \
- (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
-#define VDEV_UBERBLOCK_OFFSET(vd, n) \
- offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
-#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
-
-/* ZFS boot block */
-#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
-#define VDEV_BOOT_VERSION 1 /* version number */
-
-typedef struct vdev_boot_header {
- uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
- uint64_t vb_version; /* VDEV_BOOT_VERSION */
- uint64_t vb_offset; /* start offset (bytes) */
- uint64_t vb_size; /* size (bytes) */
- char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
-} vdev_boot_header_t;
-
-typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
-} vdev_phys_t;
-
-typedef struct vdev_label {
- char vl_pad[VDEV_SKIP_SIZE]; /* 8K */
- vdev_boot_header_t vl_boot_header; /* 8K */
- vdev_phys_t vl_vdev_phys; /* 112K */
- char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
-} vdev_label_t; /* 256K total */
-
-/*
- * vdev_dirty() flags
- */
-#define VDD_METASLAB 0x01
-#define VDD_DTL 0x02
-
-/*
- * Size and offset of embedded boot loader region on each label.
- * The total size of the first two labels plus the boot area is 4MB.
- */
-#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
-#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
-
-/*
- * Size of label regions at the start and end of each leaf device.
- */
-#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
-#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
-#define VDEV_LABELS 4
-
-#define VDEV_ALLOC_LOAD 0
-#define VDEV_ALLOC_ADD 1
-#define VDEV_ALLOC_SPARE 2
-
-/*
- * Allocate or free a vdev
- */
-extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
- vdev_t *parent, uint_t id, int alloctype);
-extern void vdev_free(vdev_t *vd);
-
-/*
- * Add or remove children and parents
- */
-extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
-extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
-extern void vdev_compact_children(vdev_t *pvd);
-extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
-extern void vdev_remove_parent(vdev_t *cvd);
-
-/*
- * vdev sync load and sync
- */
-extern void vdev_load(vdev_t *vd);
-extern void vdev_sync(vdev_t *vd, uint64_t txg);
-extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
-extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
-
-/*
- * Available vdev types.
- */
-extern vdev_ops_t vdev_root_ops;
-extern vdev_ops_t vdev_mirror_ops;
-extern vdev_ops_t vdev_replacing_ops;
-extern vdev_ops_t vdev_raidz_ops;
-#ifdef _KERNEL
-extern vdev_ops_t vdev_geom_ops;
-#else
-extern vdev_ops_t vdev_disk_ops;
-extern vdev_ops_t vdev_file_ops;
-#endif
-extern vdev_ops_t vdev_missing_ops;
-extern vdev_ops_t vdev_spare_ops;
-
-/*
- * Common size functions
- */
-extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
-extern uint64_t vdev_get_rsize(vdev_t *vd);
-
-/*
- * zdb uses this tunable, so it must be declared here to make lint happy.
- */
-extern int zfs_vdev_cache_size;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VDEV_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
deleted file mode 100644
index f89d938..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZAP_H
-#define _SYS_ZAP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZAP - ZFS Attribute Processor
- *
- * The ZAP is a module which sits on top of the DMU (Data Managemnt
- * Unit) and implements a higher-level storage primitive using DMU
- * objects. Its primary consumer is the ZPL (ZFS Posix Layer).
- *
- * A "zapobj" is a DMU object which the ZAP uses to stores attributes.
- * Users should use only zap routines to access a zapobj - they should
- * not access the DMU object directly using DMU routines.
- *
- * The attributes stored in a zapobj are name-value pairs. The name is
- * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including
- * terminating NULL). The value is an array of integers, which may be
- * 1, 2, 4, or 8 bytes long. The total space used by the array (number
- * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes.
- * Note that an 8-byte integer value can be used to store the location
- * (object number) of another dmu object (which may be itself a zapobj).
- * Note that you can use a zero-length attribute to store a single bit
- * of information - the attribute is present or not.
- *
- * The ZAP routines are thread-safe. However, you must observe the
- * DMU's restriction that a transaction may not be operated on
- * concurrently.
- *
- * Any of the routines that return an int may return an I/O error (EIO
- * or ECHECKSUM).
- *
- *
- * Implementation / Performance Notes:
- *
- * The ZAP is intended to operate most efficiently on attributes with
- * short (49 bytes or less) names and single 8-byte values, for which
- * the microzap will be used. The ZAP should be efficient enough so
- * that the user does not need to cache these attributes.
- *
- * The ZAP's locking scheme makes its routines thread-safe. Operations
- * on different zapobjs will be processed concurrently. Operations on
- * the same zapobj which only read data will be processed concurrently.
- * Operations on the same zapobj which modify data will be processed
- * concurrently when there are many attributes in the zapobj (because
- * the ZAP uses per-block locking - more than 128 * (number of cpus)
- * small attributes will suffice).
- */
-
-/*
- * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
- * strings) for the names of attributes, rather than a byte string
- * bounded by an explicit length. If some day we want to support names
- * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
- * we'll have to add routines for using length-bounded strings.
- */
-
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZAP_MAXNAMELEN 256
-#define ZAP_MAXVALUELEN 1024
-
-/*
- * Create a new zapobj with no attributes and return its object number.
- */
-uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-
-/*
- * Create a new zapobj with no attributes from the given (unallocated)
- * object number.
- */
-int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
-
-/*
- * The zapobj passed in must be a valid ZAP object for all of the
- * following routines.
- */
-
-/*
- * Destroy this zapobj and all its attributes.
- *
- * Frees the object number using dmu_object_free.
- */
-int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
-
-/*
- * Manipulate attributes.
- *
- * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
- */
-
-/*
- * Retrieve the contents of the attribute with the given name.
- *
- * If the requested attribute does not exist, the call will fail and
- * return ENOENT.
- *
- * If 'integer_size' is smaller than the attribute's integer size, the
- * call will fail and return EINVAL.
- *
- * If 'integer_size' is equal to or larger than the attribute's integer
- * size, the call will succeed and return 0. * When converting to a
- * larger integer size, the integers will be treated as unsigned (ie. no
- * sign-extension will be performed).
- *
- * 'num_integers' is the length (in integers) of 'buf'.
- *
- * If the attribute is longer than the buffer, as many integers as will
- * fit will be transferred to 'buf'. If the entire attribute was not
- * transferred, the call will return EOVERFLOW.
- */
-int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
-
-/*
- * Create an attribute with the given name and value.
- *
- * If an attribute with the given name already exists, the call will
- * fail and return EEXIST.
- */
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-
-/*
- * Set the attribute with the given name to the given value. If an
- * attribute with the given name does not exist, it will be created. If
- * an attribute with the given name already exists, the previous value
- * will be overwritten. The integer_size may be different from the
- * existing attribute's integer size, in which case the attribute's
- * integer size will be updated to the new value.
- */
-int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
-
-/*
- * Get the length (in integers) and the integer size of the specified
- * attribute.
- *
- * If the requested attribute does not exist, the call will fail and
- * return ENOENT.
- */
-int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t *integer_size, uint64_t *num_integers);
-
-/*
- * Remove the specified attribute.
- *
- * If the specified attribute does not exist, the call will fail and
- * return ENOENT.
- */
-int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
-
-/*
- * Returns (in *count) the number of attributes in the specified zap
- * object.
- */
-int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
-
-
-/*
- * Returns (in name) the name of the entry whose value
- * (za_first_integer) is value, or ENOENT if not found. The string
- * pointed to by name must be at least 256 bytes long.
- */
-int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
-
-struct zap;
-struct zap_leaf;
-typedef struct zap_cursor {
- /* This structure is opaque! */
- objset_t *zc_objset;
- struct zap *zc_zap;
- struct zap_leaf *zc_leaf;
- uint64_t zc_zapobj;
- uint64_t zc_hash;
- uint32_t zc_cd;
-} zap_cursor_t;
-
-typedef struct {
- int za_integer_length;
- uint64_t za_num_integers;
- uint64_t za_first_integer; /* no sign extension for <8byte ints */
- char za_name[MAXNAMELEN];
-} zap_attribute_t;
-
-/*
- * The interface for listing all the attributes of a zapobj can be
- * thought of as cursor moving down a list of the attributes one by
- * one. The cookie returned by the zap_cursor_serialize routine is
- * persistent across system calls (and across reboot, even).
- */
-
-/*
- * Initialize a zap cursor, pointing to the "first" attribute of the
- * zapobj. You must _fini the cursor when you are done with it.
- */
-void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
-void zap_cursor_fini(zap_cursor_t *zc);
-
-/*
- * Get the attribute currently pointed to by the cursor. Returns
- * ENOENT if at the end of the attributes.
- */
-int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
-
-/*
- * Advance the cursor to the next attribute.
- */
-void zap_cursor_advance(zap_cursor_t *zc);
-
-/*
- * Get a persistent cookie pointing to the current position of the zap
- * cursor. The low 4 bits in the cookie are always zero, and thus can
- * be used as to differentiate a serialized cookie from a different type
- * of value. The cookie will be less than 2^32 as long as there are
- * fewer than 2^22 (4.2 million) entries in the zap object.
- */
-uint64_t zap_cursor_serialize(zap_cursor_t *zc);
-
-/*
- * Initialize a zap cursor pointing to the position recorded by
- * zap_cursor_serialize (in the "serialized" argument). You can also
- * use a "serialized" argument of 0 to start at the beginning of the
- * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to
- * zap_cursor_init(...).)
- */
-void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
- uint64_t zapobj, uint64_t serialized);
-
-
-#define ZAP_HISTOGRAM_SIZE 10
-
-typedef struct zap_stats {
- /*
- * Size of the pointer table (in number of entries).
- * This is always a power of 2, or zero if it's a microzap.
- * In general, it should be considerably greater than zs_num_leafs.
- */
- uint64_t zs_ptrtbl_len;
-
- uint64_t zs_blocksize; /* size of zap blocks */
-
- /*
- * The number of blocks used. Note that some blocks may be
- * wasted because old ptrtbl's and large name/value blocks are
- * not reused. (Although their space is reclaimed, we don't
- * reuse those offsets in the object.)
- */
- uint64_t zs_num_blocks;
-
- /*
- * Pointer table values from zap_ptrtbl in the zap_phys_t
- */
- uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */
- uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */
- uint64_t zs_ptrtbl_zt_blk; /* starting block number */
- uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */
- uint64_t zs_ptrtbl_zt_shift; /* bits to index it */
-
- /*
- * Values of the other members of the zap_phys_t
- */
- uint64_t zs_block_type; /* ZBT_HEADER */
- uint64_t zs_magic; /* ZAP_MAGIC */
- uint64_t zs_num_leafs; /* The number of leaf blocks */
- uint64_t zs_num_entries; /* The number of zap entries */
- uint64_t zs_salt; /* salt to stir into hash function */
-
- /*
- * Histograms. For all histograms, the last index
- * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
- * than what can be represented. For example
- * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
- * of leafs with more than 45 entries.
- */
-
- /*
- * zs_leafs_with_n_pointers[n] is the number of leafs with
- * 2^n pointers to it.
- */
- uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_leafs_with_n_entries[n] is the number of leafs with
- * [n*5, (n+1)*5) entries. In the current implementation, there
- * can be at most 55 entries in any block, but there may be
- * fewer if the name or value is large, or the block is not
- * completely full.
- */
- uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_leafs_n_tenths_full[n] is the number of leafs whose
- * fullness is in the range [n/10, (n+1)/10).
- */
- uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_entries_using_n_chunks[n] is the number of entries which
- * consume n 24-byte chunks. (Note, large names/values only use
- * one chunk, but contribute to zs_num_blocks_large.)
- */
- uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
-
- /*
- * zs_buckets_with_n_entries[n] is the number of buckets (each
- * leaf has 64 buckets) with n entries.
- * zs_buckets_with_n_entries[1] should be very close to
- * zs_num_entries.
- */
- uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
-} zap_stats_t;
-
-/*
- * Get statistics about a ZAP object. Note: you need to be aware of the
- * internal implementation of the ZAP to correctly interpret some of the
- * statistics. This interface shouldn't be relied on unless you really
- * know what you're doing.
- */
-int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
deleted file mode 100644
index 4e43f4a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZAP_IMPL_H
-#define _SYS_ZAP_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zap.h>
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern int fzap_default_block_shift;
-
-#define ZAP_MAGIC 0x2F52AB2ABULL
-
-#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
-
-#define ZAP_MAXCD (uint32_t)(-1)
-#define ZAP_HASHBITS 28
-#define MZAP_ENT_LEN 64
-#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
-#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
-
-typedef struct mzap_ent_phys {
- uint64_t mze_value;
- uint32_t mze_cd;
- uint16_t mze_pad; /* in case we want to chain them someday */
- char mze_name[MZAP_NAME_LEN];
-} mzap_ent_phys_t;
-
-typedef struct mzap_phys {
- uint64_t mz_block_type; /* ZBT_MICRO */
- uint64_t mz_salt;
- uint64_t mz_pad[6];
- mzap_ent_phys_t mz_chunk[1];
- /* actually variable size depending on block size */
-} mzap_phys_t;
-
-typedef struct mzap_ent {
- avl_node_t mze_node;
- int mze_chunkid;
- uint64_t mze_hash;
- mzap_ent_phys_t mze_phys;
-} mzap_ent_t;
-
-
-/*
- * The (fat) zap is stored in one object. It is an array of
- * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
- *
- * ptrtbl fits in first block:
- * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
- *
- * ptrtbl too big for first block:
- * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
- *
- */
-
-struct dmu_buf;
-struct zap_leaf;
-
-#define ZBT_LEAF ((1ULL << 63) + 0)
-#define ZBT_HEADER ((1ULL << 63) + 1)
-#define ZBT_MICRO ((1ULL << 63) + 3)
-/* any other values are ptrtbl blocks */
-
-/*
- * the embedded pointer table takes up half a block:
- * block size / entry size (2^3) / 2
- */
-#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
-
-/*
- * The embedded pointer table starts half-way through the block. Since
- * the pointer table itself is half the block, it starts at (64-bit)
- * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
- */
-#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
- ((uint64_t *)(zap)->zap_f.zap_phys) \
- [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
-
-/*
- * TAKE NOTE:
- * If zap_phys_t is modified, zap_byteswap() must be modified.
- */
-typedef struct zap_phys {
- uint64_t zap_block_type; /* ZBT_HEADER */
- uint64_t zap_magic; /* ZAP_MAGIC */
-
- struct zap_table_phys {
- uint64_t zt_blk; /* starting block number */
- uint64_t zt_numblks; /* number of blocks */
- uint64_t zt_shift; /* bits to index it */
- uint64_t zt_nextblk; /* next (larger) copy start block */
- uint64_t zt_blks_copied; /* number source blocks copied */
- } zap_ptrtbl;
-
- uint64_t zap_freeblk; /* the next free block */
- uint64_t zap_num_leafs; /* number of leafs */
- uint64_t zap_num_entries; /* number of entries */
- uint64_t zap_salt; /* salt to stir into hash function */
- /*
- * This structure is followed by padding, and then the embedded
- * pointer table. The embedded pointer table takes up second
- * half of the block. It is accessed using the
- * ZAP_EMBEDDED_PTRTBL_ENT() macro.
- */
-} zap_phys_t;
-
-typedef struct zap_table_phys zap_table_phys_t;
-
-typedef struct zap {
- objset_t *zap_objset;
- uint64_t zap_object;
- struct dmu_buf *zap_dbuf;
- krwlock_t zap_rwlock;
- int zap_ismicro;
- uint64_t zap_salt;
- union {
- struct {
- zap_phys_t *zap_phys;
-
- /*
- * zap_num_entries_mtx protects
- * zap_num_entries
- */
- kmutex_t zap_num_entries_mtx;
- int zap_block_shift;
- } zap_fat;
- struct {
- mzap_phys_t *zap_phys;
- int16_t zap_num_entries;
- int16_t zap_num_chunks;
- int16_t zap_alloc_next;
- avl_tree_t zap_avl;
- } zap_micro;
- } zap_u;
-} zap_t;
-
-#define zap_f zap_u.zap_fat
-#define zap_m zap_u.zap_micro
-
-uint64_t zap_hash(zap_t *zap, const char *name);
-int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, int fatreader, zap_t **zapp);
-void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
-
-#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
-
-void fzap_byteswap(void *buf, size_t size);
-int fzap_count(zap_t *zap, uint64_t *count);
-int fzap_lookup(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
-int fzap_add(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
-int fzap_update(zap_t *zap, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
-int fzap_length(zap_t *zap, const char *name,
- uint64_t *integer_size, uint64_t *num_integers);
-int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
-int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
-void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
-void zap_put_leaf(struct zap_leaf *l);
-
-int fzap_add_cd(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
deleted file mode 100644
index 147fb72..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZAP_LEAF_H
-#define _SYS_ZAP_LEAF_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct zap;
-
-#define ZAP_LEAF_MAGIC 0x2AB1EAF
-
-/* chunk size = 24 bytes */
-#define ZAP_LEAF_CHUNKSIZE 24
-
-/*
- * The amount of space available for chunks is:
- * block size (1<<l->l_bs) - hash entry size (2) * number of hash
- * entries - header space (2*chunksize)
- */
-#define ZAP_LEAF_NUMCHUNKS(l) \
- (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
- ZAP_LEAF_CHUNKSIZE - 2)
-
-/*
- * The amount of space within the chunk available for the array is:
- * chunk size - space for type (1) - space for next pointer (2)
- */
-#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
-
-#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \
- (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
-
-/*
- * Low water mark: when there are only this many chunks free, start
- * growing the ptrtbl. Ideally, this should be larger than a
- * "reasonably-sized" entry. 20 chunks is more than enough for the
- * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value),
- * while still being only around 3% for 16k blocks.
- */
-#define ZAP_LEAF_LOW_WATER (20)
-
-/*
- * The leaf hash table has block size / 2^5 (32) number of entries,
- * which should be more than enough for the maximum number of entries,
- * which is less than block size / CHUNKSIZE (24) / minimum number of
- * chunks per entry (3).
- */
-#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
-#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
-
-/*
- * The chunks start immediately after the hash table. The end of the
- * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
- * chunk_t.
- */
-#define ZAP_LEAF_CHUNK(l, idx) \
- ((zap_leaf_chunk_t *) \
- ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
-#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
-
-typedef enum zap_chunk_type {
- ZAP_CHUNK_FREE = 253,
- ZAP_CHUNK_ENTRY = 252,
- ZAP_CHUNK_ARRAY = 251,
- ZAP_CHUNK_TYPE_MAX = 250
-} zap_chunk_type_t;
-
-/*
- * TAKE NOTE:
- * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
- */
-typedef struct zap_leaf_phys {
- struct zap_leaf_header {
- uint64_t lh_block_type; /* ZBT_LEAF */
- uint64_t lh_pad1;
- uint64_t lh_prefix; /* hash prefix of this leaf */
- uint32_t lh_magic; /* ZAP_LEAF_MAGIC */
- uint16_t lh_nfree; /* number free chunks */
- uint16_t lh_nentries; /* number of entries */
- uint16_t lh_prefix_len; /* num bits used to id this */
-
-/* above is accessable to zap, below is zap_leaf private */
-
- uint16_t lh_freelist; /* chunk head of free list */
- uint8_t lh_pad2[12];
- } l_hdr; /* 2 24-byte chunks */
-
- /*
- * The header is followed by a hash table with
- * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is
- * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
- * zap_leaf_chunk structures. These structures are accessed
- * with the ZAP_LEAF_CHUNK() macro.
- */
-
- uint16_t l_hash[1];
-} zap_leaf_phys_t;
-
-typedef union zap_leaf_chunk {
- struct zap_leaf_entry {
- uint8_t le_type; /* always ZAP_CHUNK_ENTRY */
- uint8_t le_int_size; /* size of ints */
- uint16_t le_next; /* next entry in hash chain */
- uint16_t le_name_chunk; /* first chunk of the name */
- uint16_t le_name_length; /* bytes in name, incl null */
- uint16_t le_value_chunk; /* first chunk of the value */
- uint16_t le_value_length; /* value length in ints */
- uint32_t le_cd; /* collision differentiator */
- uint64_t le_hash; /* hash value of the name */
- } l_entry;
- struct zap_leaf_array {
- uint8_t la_type; /* always ZAP_CHUNK_ARRAY */
- uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
- uint16_t la_next; /* next blk or CHAIN_END */
- } l_array;
- struct zap_leaf_free {
- uint8_t lf_type; /* always ZAP_CHUNK_FREE */
- uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
- uint16_t lf_next; /* next in free list, or CHAIN_END */
- } l_free;
-} zap_leaf_chunk_t;
-
-typedef struct zap_leaf {
- krwlock_t l_rwlock; /* only used on head of chain */
- uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
- int l_bs; /* block size shift */
- dmu_buf_t *l_dbuf;
- zap_leaf_phys_t *l_phys;
-} zap_leaf_t;
-
-
-typedef struct zap_entry_handle {
- /* below is set by zap_leaf.c and is public to zap.c */
- uint64_t zeh_num_integers;
- uint64_t zeh_hash;
- uint32_t zeh_cd;
- uint8_t zeh_integer_size;
-
- /* below is private to zap_leaf.c */
- uint16_t zeh_fakechunk;
- uint16_t *zeh_chunkp;
- zap_leaf_t *zeh_leaf;
-} zap_entry_handle_t;
-
-/*
- * Return a handle to the named entry, or ENOENT if not found. The hash
- * value must equal zap_hash(name).
- */
-extern int zap_leaf_lookup(zap_leaf_t *l,
- const char *name, uint64_t h, zap_entry_handle_t *zeh);
-
-/*
- * Return a handle to the entry with this hash+cd, or the entry with the
- * next closest hash+cd.
- */
-extern int zap_leaf_lookup_closest(zap_leaf_t *l,
- uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
-
-/*
- * Read the first num_integers in the attribute. Integer size
- * conversion will be done without sign extension. Return EINVAL if
- * integer_size is too small. Return EOVERFLOW if there are more than
- * num_integers in the attribute.
- */
-extern int zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf);
-
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
- uint16_t buflen, char *buf);
-
-/*
- * Replace the value of an existing entry.
- *
- * zap_entry_update may fail if it runs out of space (ENOSPC).
- */
-extern int zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf);
-
-/*
- * Remove an entry.
- */
-extern void zap_entry_remove(zap_entry_handle_t *zeh);
-
-/*
- * Create an entry. An equal entry must not exist, and this entry must
- * belong in this leaf (according to its hash value). Fills in the
- * entry handle on success. Returns 0 on success or ENOSPC on failure.
- */
-extern int zap_entry_create(zap_leaf_t *l,
- const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh);
-
-/*
- * Other stuff.
- */
-
-extern void zap_leaf_init(zap_leaf_t *l);
-extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
-extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl);
-extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZAP_LEAF_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
deleted file mode 100644
index 3250b76..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_ACL_H
-#define _SYS_FS_ZFS_ACL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef _KERNEL
-#include <sys/cred.h>
-#endif
-#include <sys/acl.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct znode_phys;
-
-#define ACCESS_UNDETERMINED -1
-
-#define ACE_SLOT_CNT 6
-
-typedef struct zfs_znode_acl {
- uint64_t z_acl_extern_obj; /* ext acl pieces */
- uint32_t z_acl_count; /* Number of ACEs */
- uint16_t z_acl_version; /* acl version */
- uint16_t z_acl_pad; /* pad */
- ace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
-} zfs_znode_acl_t;
-
-#define ACL_DATA_ALLOCED 0x1
-
-/*
- * Max ACL size is prepended deny for all entries + the
- * canonical six tacked on * the end.
- */
-#define MAX_ACL_SIZE (MAX_ACL_ENTRIES * 2 + 6)
-
-typedef struct zfs_acl {
- int z_slots; /* number of allocated slots for ACEs */
- int z_acl_count;
- uint_t z_state;
- ace_t *z_acl;
-} zfs_acl_t;
-
-#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
-
-/*
- * Property values for acl_mode and acl_inherit.
- *
- * acl_mode can take discard, noallow, groupmask and passthrough.
- * whereas acl_inherit has secure instead of groupmask.
- */
-
-#define ZFS_ACL_DISCARD 0
-#define ZFS_ACL_NOALLOW 1
-#define ZFS_ACL_GROUPMASK 2
-#define ZFS_ACL_PASSTHROUGH 3
-#define ZFS_ACL_SECURE 4
-
-struct znode;
-
-#ifdef _KERNEL
-void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
- dmu_tx_t *, cred_t *);
-#ifdef TODO
-int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
-#endif
-int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t *);
-#ifdef TODO
-int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
-#endif
-void zfs_acl_rele(void *);
-void zfs_ace_byteswap(ace_t *, int);
-extern int zfs_zaccess(struct znode *, int, cred_t *);
-extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
-extern int zfs_acl_access(struct znode *, int, cred_t *);
-int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
-int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
-int zfs_zaccess_rename(struct znode *, struct znode *,
- struct znode *, struct znode *, cred_t *cr);
-int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
-void zfs_acl_free(zfs_acl_t *);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !ZFS_NO_ACL */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
deleted file mode 100644
index 4deeb3c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_CONTEXT_H
-#define _SYS_ZFS_CONTEXT_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/param.h>
-#include <sys/stdint.h>
-#include <sys/note.h>
-#include <sys/kernel.h>
-#include <sys/debug.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/sysmacros.h>
-#include <sys/bitmap.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/systm.h>
-#include <sys/conf.h>
-#include <sys/mutex.h>
-#include <sys/rwlock.h>
-#include <sys/random.h>
-#include <sys/byteorder.h>
-#include <sys/systm.h>
-#include <sys/list.h>
-#include <sys/uio.h>
-#include <sys/dirent.h>
-#include <sys/time.h>
-#include <sys/uio.h>
-#include <sys/fcntl.h>
-#include <sys/limits.h>
-#include <sys/string.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/cred.h>
-#include <sys/sdt.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/sysctl.h>
-#include <sys/sbuf.h>
-#include <sys/priv.h>
-#include <sys/kdb.h>
-#include <sys/ktr.h>
-#include <sys/stack.h>
-#include <sys/lockf.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/eventhandler.h>
-#include <sys/misc.h>
-#include <sys/zfs_debug.h>
-
-#include <machine/stdarg.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-#include <vm/vm_pager.h>
-#include <vm/vm_kern.h>
-#include <vm/vm_map.h>
-/* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */
-#ifdef min_offset
-#undef min_offset
-#endif
-#ifdef max_offset
-#undef max_offset
-#endif
-#include <vm/vm_extern.h>
-#include <vm/vnode_pager.h>
-
-#define CPU_SEQID (curcpu)
-
-#ifdef __cplusplus
-}
-#endif
-
-extern int zfs_debug_level;
-extern struct mtx zfs_debug_mtx;
-#define ZFS_LOG(lvl, ...) do { \
- if (((lvl) & 0xff) <= zfs_debug_level) { \
- mtx_lock(&zfs_debug_mtx); \
- printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \
- printf(__VA_ARGS__); \
- printf("\n"); \
- if ((lvl) & 0x100) \
- kdb_backtrace(); \
- mtx_unlock(&zfs_debug_mtx); \
- } \
-} while (0)
-
-#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
deleted file mode 100644
index a676533..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZFS_CTLDIR_H
-#define _ZFS_CTLDIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/vnode.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_znode.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_CTLDIR_NAME ".zfs"
-
-#define zfs_has_ctldir(zdp) \
- ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
- ((zdp)->z_zfsvfs->z_ctldir != NULL))
-#define zfs_show_ctldir(zdp) \
- (zfs_has_ctldir(zdp) && \
- ((zdp)->z_zfsvfs->z_show_ctldir))
-
-void zfsctl_create(zfsvfs_t *);
-void zfsctl_destroy(zfsvfs_t *);
-vnode_t *zfsctl_root(znode_t *);
-void zfsctl_init(void);
-void zfsctl_fini(void);
-
-int zfsctl_rename_snapshot(const char *from, const char *to);
-int zfsctl_destroy_snapshot(const char *snapname, int force);
-int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
-
-int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr);
-
-int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
-
-#define ZFSCTL_INO_ROOT 0x1
-#define ZFSCTL_INO_SNAPDIR 0x2
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZFS_CTLDIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
deleted file mode 100644
index 450ac1c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_DEBUG_H
-#define _SYS_ZFS_DEBUG_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-/*
- * ZFS debugging
- */
-
-#if defined(DEBUG) || !defined(_KERNEL)
-#define ZFS_DEBUG
-#endif
-
-extern int zfs_flags;
-
-#define ZFS_DEBUG_DPRINTF 0x0001
-#define ZFS_DEBUG_DBUF_VERIFY 0x0002
-#define ZFS_DEBUG_DNODE_VERIFY 0x0004
-#define ZFS_DEBUG_SNAPNAMES 0x0008
-#define ZFS_DEBUG_MODIFY 0x0010
-
-#ifdef ZFS_DEBUG
-extern void __dprintf(const char *file, const char *func,
- int line, const char *fmt, ...);
-#define dprintf(...) \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) \
- __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__)
-#else
-#define dprintf(...) ((void)0)
-#endif /* ZFS_DEBUG */
-
-extern void zfs_panic_recover(const char *fmt, ...);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_DEBUG_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
deleted file mode 100644
index f60d614..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_DIR_H
-#define _SYS_FS_ZFS_DIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/dmu.h>
-#include <sys/zfs_znode.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* zfs_dirent_lock() flags */
-#define ZNEW 0x0001 /* entry should not exist */
-#define ZEXISTS 0x0002 /* entry should exist */
-#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */
-#define ZXATTR 0x0008 /* we want the xattr dir */
-#define ZRENAMING 0x0010 /* znode is being renamed */
-
-/* mknode flags */
-#define IS_ROOT_NODE 0x01 /* create a root node */
-#define IS_XATTR 0x02 /* create an extended attribute node */
-#define IS_REPLAY 0x04 /* we are replaying intent log */
-
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
- int);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
- boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **);
-extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
- dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
-extern void zfs_rmnode(znode_t *);
-extern boolean_t zfs_dirempty(znode_t *);
-extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
-extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
-extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
-extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *, int);
-extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_DIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
deleted file mode 100644
index 61a0a9e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZFS_IOCTL_H
-#define _SYS_ZFS_IOCTL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/cred.h>
-#include <sys/dmu.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Property values for snapdir
- */
-#define ZFS_SNAPDIR_HIDDEN 0
-#define ZFS_SNAPDIR_VISIBLE 1
-
-#define DMU_BACKUP_VERSION (1ULL)
-#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
-
-/*
- * zfs ioctl command structure
- */
-typedef struct dmu_replay_record {
- enum {
- DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
- DRR_WRITE, DRR_FREE, DRR_END,
- } drr_type;
- uint32_t drr_pad;
- union {
- struct drr_begin {
- uint64_t drr_magic;
- uint64_t drr_version;
- uint64_t drr_creation_time;
- dmu_objset_type_t drr_type;
- uint32_t drr_pad;
- uint64_t drr_toguid;
- uint64_t drr_fromguid;
- char drr_toname[MAXNAMELEN];
- } drr_begin;
- struct drr_end {
- zio_cksum_t drr_checksum;
- } drr_end;
- struct drr_object {
- uint64_t drr_object;
- dmu_object_type_t drr_type;
- dmu_object_type_t drr_bonustype;
- uint32_t drr_blksz;
- uint32_t drr_bonuslen;
- uint8_t drr_checksum;
- uint8_t drr_compress;
- uint8_t drr_pad[6];
- /* bonus content follows */
- } drr_object;
- struct drr_freeobjects {
- uint64_t drr_firstobj;
- uint64_t drr_numobjs;
- } drr_freeobjects;
- struct drr_write {
- uint64_t drr_object;
- dmu_object_type_t drr_type;
- uint32_t drr_pad;
- uint64_t drr_offset;
- uint64_t drr_length;
- /* content follows */
- } drr_write;
- struct drr_free {
- uint64_t drr_object;
- uint64_t drr_offset;
- uint64_t drr_length;
- } drr_free;
- } drr_u;
-} dmu_replay_record_t;
-
-typedef struct zinject_record {
- uint64_t zi_objset;
- uint64_t zi_object;
- uint64_t zi_start;
- uint64_t zi_end;
- uint64_t zi_guid;
- uint32_t zi_level;
- uint32_t zi_error;
- uint64_t zi_type;
- uint32_t zi_freq;
-} zinject_record_t;
-
-#define ZINJECT_NULL 0x1
-#define ZINJECT_FLUSH_ARC 0x2
-#define ZINJECT_UNLOAD_SPA 0x4
-
-typedef struct zfs_cmd {
- char zc_name[MAXPATHLEN];
- char zc_value[MAXPATHLEN * 2];
- uint64_t zc_guid;
- uint64_t zc_nvlist_src; /* really (char *) */
- uint64_t zc_nvlist_src_size;
- uint64_t zc_nvlist_dst; /* really (char *) */
- uint64_t zc_nvlist_dst_size;
- uint64_t zc_cookie;
- uint64_t zc_cred;
- uint64_t zc_dev;
- uint64_t zc_objset_type;
- uint64_t zc_history; /* really (char *) */
- uint64_t zc_history_len;
- uint64_t zc_history_offset;
- uint64_t zc_obj;
- uint64_t zc_jailid;
- dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
- zinject_record_t zc_inject_record;
-} zfs_cmd_t;
-
-#ifdef _KERNEL
-typedef struct zfs_create_data {
- cred_t *zc_cred;
- dev_t zc_dev;
- nvlist_t *zc_props;
-} zfs_create_data_t;
-#endif
-
-#define ZVOL_MAX_MINOR (1 << 16)
-#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)
-
-#ifdef _KERNEL
-
-extern int zfs_secpolicy_write(const char *dataset, cred_t *cr);
-extern int zfs_busy(void);
-extern int zfs_unmount_snap(char *, void *);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZFS_IOCTL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
deleted file mode 100644
index f302b66..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_RLOCK_H
-#define _SYS_FS_ZFS_RLOCK_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-
-#include <sys/zfs_znode.h>
-
-typedef enum {
- RL_READER,
- RL_WRITER,
- RL_APPEND
-} rl_type_t;
-
-typedef struct rl {
- znode_t *r_zp; /* znode this lock applies to */
- avl_node_t r_node; /* avl node link */
- uint64_t r_off; /* file range offset */
- uint64_t r_len; /* file range length */
- uint_t r_cnt; /* range reference count in tree */
- rl_type_t r_type; /* range type */
- kcondvar_t r_wr_cv; /* cv for waiting writers */
- kcondvar_t r_rd_cv; /* cv for waiting readers */
- uint8_t r_proxy; /* acting for original range */
- uint8_t r_write_wanted; /* writer wants to lock this range */
- uint8_t r_read_wanted; /* reader wants to lock this range */
-} rl_t;
-
-/*
- * Lock a range (offset, length) as either shared (READER)
- * or exclusive (WRITER or APPEND). APPEND is a special type that
- * is converted to WRITER that specified to lock from the start of the
- * end of file. zfs_range_lock() returns the range lock structure.
- */
-rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
-
-/*
- * Unlock range and destroy range lock structure.
- */
-void zfs_range_unlock(rl_t *rl);
-
-/*
- * Reduce range locked as RW_WRITER from whole file to specified range.
- * Asserts the whole file was previously locked.
- */
-void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
-
-/*
- * AVL comparison function used to compare range locks
- */
-int zfs_range_compare(const void *arg1, const void *arg2);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_RLOCK_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
deleted file mode 100644
index aa82cc1..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_VFSOPS_H
-#define _SYS_FS_ZFS_VFSOPS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/list.h>
-#include <sys/vfs.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct zfsvfs zfsvfs_t;
-
-struct zfsvfs {
- vfs_t *z_vfs; /* generic fs struct */
- zfsvfs_t *z_parent; /* parent fs */
- objset_t *z_os; /* objset reference */
- uint64_t z_root; /* id of root znode */
- uint64_t z_unlinkedobj; /* id of unlinked zapobj */
- uint64_t z_max_blksz; /* maximum block size for files */
- uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */
- zilog_t *z_log; /* intent log pointer */
- uint_t z_acl_mode; /* acl chmod/mode behavior */
- uint_t z_acl_inherit; /* acl inheritance behavior */
- boolean_t z_atime; /* enable atimes mount option */
- boolean_t z_unmounted1; /* unmounted phase 1 */
- boolean_t z_unmounted2; /* unmounted phase 2 */
- uint32_t z_op_cnt; /* vnode/vfs operations ref count */
- krwlock_t z_um_lock; /* rw lock for umount phase 2 */
- list_t z_all_znodes; /* all vnodes in the fs */
- kmutex_t z_znodes_lock; /* lock for z_all_znodes */
- vnode_t *z_ctldir; /* .zfs directory pointer */
- boolean_t z_show_ctldir; /* expose .zfs in the root dir */
- boolean_t z_issnap; /* true if this is a snapshot */
-#define ZFS_OBJ_MTX_SZ 64
- kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
-};
-
-/*
- * The total file ID size is limited to 12 bytes (including the length
- * field) in the NFSv2 protocol. For historical reasons, this same limit
- * is currently being imposed by the Solaris NFSv3 implementation...
- * although the protocol actually permits a maximum of 64 bytes. It will
- * not be possible to expand beyond 12 bytes without abandoning support
- * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
- *
- * For the time being, we will partition up the available space as follows:
- * 2 bytes fid length (required)
- * 6 bytes object number (48 bits)
- * 4 bytes generation number (32 bits)
- * We reserve only 48 bits for the object number, as this is the limit
- * currently defined and imposed by the DMU.
- */
-typedef struct zfid_short {
- uint16_t zf_len;
- uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */
- uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */
-} zfid_short_t;
-
-typedef struct zfid_long {
- zfid_short_t z_fid;
- uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */
- uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */
-} zfid_long_t;
-
-#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
-#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
deleted file mode 100644
index c9c317e..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_ZNODE_H
-#define _SYS_FS_ZFS_ZNODE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef _KERNEL
-#include <sys/list.h>
-#include <sys/dmu.h>
-#include <sys/zfs_vfsops.h>
-#endif
-#include <sys/zfs_acl.h>
-#include <sys/zil.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Special zfs pflags; each one is a distinct single-bit value.
- */
-#define ZFS_XATTR 0x1 /* is an extended attribute */
-#define ZFS_INHERIT_ACE 0x2 /* has inheritable ACEs */
-#define ZFS_ACL_TRIVIAL 0x4 /* file's ACL is trivial */
-
-#define MASTER_NODE_OBJ 1
-
-/*
- * special attributes for master node.
- */
-
-#define ZFS_FSID "FSID"
-#define ZFS_UNLINKED_SET "DELETE_QUEUE"
-#define ZFS_ROOT_OBJ "ROOT"
-#define ZPL_VERSION_OBJ "VERSION"
-#define ZFS_PROP_BLOCKPERPAGE "BLOCKPERPAGE"
-#define ZFS_PROP_NOGROWBLOCKS "NOGROWBLOCKS"
-
-#define ZFS_FLAG_BLOCKPERPAGE 0x1
-#define ZFS_FLAG_NOGROWBLOCKS 0x2
-
-/*
- * ZPL version - rev'd whenever an incompatible on-disk format change
- * occurs. Independent of SPA/DMU/ZAP versioning.
- */
-
-#define ZPL_VERSION 1ULL
-
-#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-
-/* Path component length */
-/*
- * The generic fs code uses MAXNAMELEN to represent
- * what the largest component length is. Unfortunately,
- * this length includes the terminating NUL. ZFS needs
- * to tell the users via pathconf() and statvfs() what the
- * true maximum length of a component is, excluding the NUL.
- */
-#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
-
-/*
- * The directory entry has the type (currently unused on Solaris) in the
- * top 4 bits, and the object number in the low 48 bits. The "middle"
- * 12 bits are unused. ZFS_DIRENT_MAKE() does not mask its arguments.
- */
-#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4)
-#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
-#define ZFS_DIRENT_MAKE(type, obj) (((uint64_t)(type) << 60) | (obj))
-
-
-/*
- * This is the persistent portion of the znode. It is stored
- * in the "bonus buffer" of the file. Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
- uint64_t zp_atime[2]; /* 0 - last file access time */
- uint64_t zp_mtime[2]; /* 16 - last file modification time */
- uint64_t zp_ctime[2]; /* 32 - last file change time */
- uint64_t zp_crtime[2]; /* 48 - creation time */
- uint64_t zp_gen; /* 64 - generation (txg of creation) */
- uint64_t zp_mode; /* 72 - file mode bits */
- uint64_t zp_size; /* 80 - size of file */
- uint64_t zp_parent; /* 88 - directory parent (`..') */
- uint64_t zp_links; /* 96 - number of links to file */
- uint64_t zp_xattr; /* 104 - DMU object for xattrs */
- uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */
- uint64_t zp_flags; /* 120 - persistent flags */
- uint64_t zp_uid; /* 128 - file owner */
- uint64_t zp_gid; /* 136 - owning group */
- uint64_t zp_pad[4]; /* 144 - future */
- zfs_znode_acl_t zp_acl; /* 176 - 263 ACL */
- /*
- * Data may pad out any remaining bytes in the znode buffer, eg:
- *
- * |<---------------------- dnode_phys (512) ------------------------>|
- * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
- * |<---- znode (264) ---->|<---- data (56) ---->|
- *
- * At present, we only use this space to store symbolic links.
- */
-} znode_phys_t;
-
-/*
- * Directory entry locks control access to directory entries.
- * They are used to protect creates, deletes, and renames.
- * Each directory znode has a mutex and a list of locked names.
- */
-#ifdef _KERNEL
-typedef struct zfs_dirlock {
- char *dl_name; /* directory entry being locked */
- uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */
- uint16_t dl_namesize; /* set if dl_name was allocated */
- kcondvar_t dl_cv; /* wait for entry to be unlocked */
- struct znode *dl_dzp; /* directory znode */
- struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
-} zfs_dirlock_t;
-
-typedef struct znode {
- struct zfsvfs *z_zfsvfs;
- vnode_t *z_vnode;
- uint64_t z_id; /* object ID for this znode */
- kmutex_t z_lock; /* znode modification lock */
- krwlock_t z_map_lock; /* page map lock */
- krwlock_t z_parent_lock; /* parent lock for directories */
- krwlock_t z_name_lock; /* "master" lock for dirent locks */
- zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
- kmutex_t z_range_lock; /* protects changes to z_range_avl */
- avl_tree_t z_range_avl; /* avl tree of file range locks */
- uint8_t z_unlinked; /* file has been unlinked */
- uint8_t z_atime_dirty; /* atime needs to be synced */
- uint8_t z_dbuf_held; /* Is z_dbuf already held? */
- uint8_t z_zn_prefetch; /* Prefetch znodes? */
- uint_t z_blksz; /* block size in bytes */
- uint_t z_seq; /* modification sequence number */
- uint64_t z_mapcnt; /* number of pages mapped to file */
- uint64_t z_last_itx; /* last ZIL itx on this znode */
- uint32_t z_sync_cnt; /* synchronous open count */
- kmutex_t z_acl_lock; /* acl data lock */
- list_node_t z_link_node; /* all znodes in fs link */
- struct lockf *z_lockf; /* Head of byte-level lock list. */
- /*
- * These are dmu managed fields.
- */
- znode_phys_t *z_phys; /* pointer to persistent znode */
- dmu_buf_t *z_dbuf; /* buffer containing the z_phys */
-} znode_t;
-
-
-/*
- * Range locking rules
- * --------------------
- * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
- * file range needs to be locked as RL_WRITER. Only then can the pages be
- * freed etc and zp_size reset. zp_size must be set within range lock.
- * 2. For writes and punching holes (zfs_write & zfs_space) just the range
- * being written or freed needs to be locked as RL_WRITER.
- * Multiple writes at the end of the file must coordinate zp_size updates
- * to ensure data isn't lost. A compare and swap loop is currently used
- * to ensure the file size is at least the offset last written.
- * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
- * read needs to be locked as RL_READER. A check against zp_size can then
- * be made for reading beyond end of file.
- */
-
-/*
- * Convert between znode pointers and vnode pointers
- */
-#define ZTOV(ZP) ((ZP)->z_vnode)
-#define VTOZ(VP) ((znode_t *)(VP)->v_data)
-
-/*
- * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
- * ZFS_EXIT() must be called before exiting the vop.
- */
-#define ZFS_ENTER(zfsvfs) \
- { \
- atomic_add_32(&(zfsvfs)->z_op_cnt, 1); /* count this operation */ \
- if ((zfsvfs)->z_unmounted1) { /* z_unmounted1 set: undo the count */ \
- ZFS_EXIT(zfsvfs); \
- return (EIO); /* returns from the calling function */ \
- } \
- }
-#define ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
-
-/*
- * dmu_buf_hold helpers: hash an object number to one of the z_hold_mtx[] locks.
- */
-#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1))
-#define ZFS_OBJ_MUTEX(zp) \
- (&(zp)->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH((zp)->z_id)])
-#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
- mutex_enter(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
-
-#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
- mutex_exit(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
-
-/*
- * Encode/decode struct timespec to/from ZFS stored times: [0] = sec, [1] = nsec.
- */
-#define ZFS_TIME_ENCODE(tp, stmp) \
-{ \
- (stmp)[0] = (uint64_t)(tp)->tv_sec; \
- (stmp)[1] = (uint64_t)(tp)->tv_nsec; \
-}
-
-#define ZFS_TIME_DECODE(tp, stmp) \
-{ \
- (tp)->tv_sec = (time_t)(stmp)[0]; \
- (tp)->tv_nsec = (long)(stmp)[1]; \
-}
-
-/*
- * Timestamp defines. ZFS_ACCESSTIME_STAMP() stamps only if z_atime && !VFS_RDONLY.
- */
-#define ACCESSED (AT_ATIME)
-#define STATE_CHANGED (AT_CTIME)
-#define CONTENT_MODIFIED (AT_MTIME | AT_CTIME)
-
-#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) do { \
- if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
- zfs_time_stamper((zp), ACCESSED, NULL); } while (0)
-
-extern int zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
-extern void zfs_set_dataprop(objset_t *);
-extern void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
-extern void zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
-extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
-extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
-extern void zfs_znode_init(void);
-extern void zfs_znode_fini(void);
-extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
-extern void zfs_zinactive(znode_t *);
-extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
-extern void zfs_znode_free(znode_t *);
-extern void zfs_remove_op_tables(void); /* (void): a real prototype, not K&R () */
-extern int zfs_create_op_tables(void); /* (void): a real prototype, not K&R () */
-extern dev_t zfs_cmpldev(uint64_t);
-
-extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name);
-extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, char *name);
-extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name);
-extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name, char *link);
-extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
-extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t len, int ioflag);
-extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len);
-extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied);
-#ifndef ZFS_NO_ACL
-extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, int aclcnt, ace_t *z_ace);
-#endif
-
-extern zil_get_data_t zfs_get_data;
-extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
-extern int zfsfstype;
-
-#endif /* _KERNEL */
-
-extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_ZNODE_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
deleted file mode 100644
index 947ba9f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIL_H
-#define _SYS_ZIL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dmu.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Intent log format:
- *
- * Each objset has its own intent log. The log header (zil_header_t)
- * for objset N's intent log is kept in the Nth object of the SPA's
- * intent_log objset. The log header points to a chain of log blocks,
- * each of which contains log records (i.e., transactions) followed by
- * a log block trailer (zil_trailer_t). The format of a log record
- * depends on the record (or transaction) type, but all records begin
- * with a common structure that defines the type, length, and txg.
- */
-
-/*
- * Intent log header - this on disk structure holds fields to manage
- * the log. All fields are 64 bit to easily handle cross architectures.
- */
-typedef struct zil_header {
- uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
- uint64_t zh_replay_seq; /* highest replayed sequence number */
- blkptr_t zh_log; /* log chain */
- uint64_t zh_claim_seq; /* highest claimed sequence number */
- uint64_t zh_pad[5];
-} zil_header_t;
-
-/*
- * Log block trailer - structure at the end of the header and each log block
- *
- * The zit_bt contains a zbt_cksum which for the intent log is
- * the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
- * number passed in the blk_cksum field of the blkptr_t
- */
-typedef struct zil_trailer {
- uint64_t zit_pad;
- blkptr_t zit_next_blk; /* next block in chain */
- uint64_t zit_nused; /* bytes in log block used */
- zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
-
-#define ZIL_MIN_BLKSZ 4096ULL
-#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
-#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
-
-/*
- * The words of a log block checksum.
- */
-#define ZIL_ZC_GUID_0 0
-#define ZIL_ZC_GUID_1 1
-#define ZIL_ZC_OBJSET 2
-#define ZIL_ZC_SEQ 3
-
-/*
- * Intent log transaction types and record structures
- */
-#define TX_CREATE 1 /* Create file */
-#define TX_MKDIR 2 /* Make directory */
-#define TX_MKXATTR 3 /* Make XATTR directory */
-#define TX_SYMLINK 4 /* Create symbolic link to a file */
-#define TX_REMOVE 5 /* Remove file */
-#define TX_RMDIR 6 /* Remove directory */
-#define TX_LINK 7 /* Create hard link to a file */
-#define TX_RENAME 8 /* Rename a file */
-#define TX_WRITE 9 /* File write */
-#define TX_TRUNCATE 10 /* Truncate a file */
-#define TX_SETATTR 11 /* Set file attributes */
-#define TX_ACL 12 /* Set acl */
-#define TX_MAX_TYPE 13 /* Max transaction type */
-
-/*
- * Format of log records.
- * The fields are carefully defined to allow them to be aligned
- * and sized the same on sparc & intel architectures.
- * Each log record has a common structure at the beginning.
- *
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number. The log record on
- * disk holds the sequence number of all log records which is used to
- * ensure we don't replay the same record. The two sequence numbers are
- * different because the transactions can now be pushed out of order.
- */
-typedef struct { /* common log record header */
- uint64_t lrc_txtype; /* intent log transaction type */
- uint64_t lrc_reclen; /* transaction record length */
- uint64_t lrc_txg; /* dmu transaction group number */
- uint64_t lrc_seq; /* see comment above */
-} lr_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* object id of directory */
- uint64_t lr_foid; /* object id of created file object */
- uint64_t lr_mode; /* mode of object */
- uint64_t lr_uid; /* uid of object */
- uint64_t lr_gid; /* gid of object */
- uint64_t lr_gen; /* generation (txg of creation) */
- uint64_t lr_crtime[2]; /* creation time */
- uint64_t lr_rdev; /* rdev of object to create */
- /* name of object to create follows this */
- /* for symlinks, link content follows name */
-} lr_create_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* obj id of directory */
- /* name of object to remove follows this */
-} lr_remove_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_doid; /* obj id of directory */
- uint64_t lr_link_obj; /* obj id of link */
- /* name of object to link follows this */
-} lr_link_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_sdoid; /* obj id of source directory */
- uint64_t lr_tdoid; /* obj id of target directory */
- /* 2 strings: names of source and destination follow this */
-} lr_rename_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* file object to write */
- uint64_t lr_offset; /* offset to write to */
- uint64_t lr_length; /* user data length to write */
- uint64_t lr_blkoff; /* offset represented by lr_blkptr */
- blkptr_t lr_blkptr; /* spa block pointer for replay */
- /* write data will follow for small writes */
-} lr_write_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* object id of file to truncate */
- uint64_t lr_offset; /* offset to truncate from */
- uint64_t lr_length; /* length to truncate */
-} lr_truncate_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* file object to change attributes */
- uint64_t lr_mask; /* mask of attributes to set */
- uint64_t lr_mode; /* mode to set */
- uint64_t lr_uid; /* uid to set */
- uint64_t lr_gid; /* gid to set */
- uint64_t lr_size; /* size to set */
- uint64_t lr_atime[2]; /* access time */
- uint64_t lr_mtime[2]; /* modification time */
-} lr_setattr_t;
-
-typedef struct {
- lr_t lr_common; /* common portion of log record */
- uint64_t lr_foid; /* obj id of file */
- uint64_t lr_aclcnt; /* number of acl entries */
- /* lr_aclcnt number of ace_t entries follow this */
-} lr_acl_t;
-
-/*
- * ZIL structure definitions, interface function prototype and globals.
- */
-
-/*
- * ZFS intent log transaction structure
- */
-typedef enum {
- WR_INDIRECT, /* indirect - a large write: dmu_sync() the data */
- /* and put its blkptr in the log, not the data itself */
- WR_COPIED, /* immediate - data is copied into lr_write_t */
- WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
-} itx_wr_state_t;
-
-typedef struct itx {
- list_node_t itx_node; /* linkage on zl_itx_list */
- void *itx_private; /* type-specific opaque data */
- itx_wr_state_t itx_wr_state; /* write state */
- uint8_t itx_sync; /* synchronous transaction */
- lr_t itx_lr; /* common part of log record */
- /* followed by type-specific part of lr_xx_t and its immediate data */
-} itx_t;
-
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
- zilog_t *zgd_zilog; /* zilog */
- blkptr_t *zgd_bp; /* block pointer */
- struct rl *zgd_rl; /* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
- uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
- uint64_t txg);
-typedef int zil_replay_func_t();
-typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
-
-extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
-
-extern void zil_init(void);
-extern void zil_fini(void);
-
-extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys);
-extern void zil_free(zilog_t *zilog);
-
-extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data);
-extern void zil_close(zilog_t *zilog);
-
-extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp,
- zil_replay_func_t *replay_func[TX_MAX_TYPE]);
-extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
-
-extern itx_t *zil_itx_create(int txtype, size_t lrsize);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
-
-extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
-
-extern int zil_claim(char *osname, void *txarg);
-extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void zil_clean(zilog_t *zilog);
-extern int zil_is_committed(zilog_t *zilog);
-
-extern int zil_suspend(zilog_t *zilog);
-extern void zil_resume(zilog_t *zilog);
-
-extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev);
-
-extern int zil_disable;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
deleted file mode 100644
index 3ecf4e4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIL_IMPL_H
-#define _SYS_ZIL_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zil.h>
-#include <sys/dmu_objset.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Log write buffer.
- */
-typedef struct lwb {
- zilog_t *lwb_zilog; /* back pointer to log struct */
- blkptr_t lwb_blk; /* on disk address of this log blk */
- int lwb_nused; /* # used bytes in buffer */
- int lwb_sz; /* size of block and buffer */
- char *lwb_buf; /* log write buffer */
- zio_t *lwb_zio; /* zio for this buffer */
- uint64_t lwb_max_txg; /* highest txg in this lwb */
- txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
- list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
-} lwb_t;
-
-/*
- * Vdev flushing: We use a bit map of size ZIL_VDEV_BMSZ bytes.
- * Any vdev numbers beyond that use a linked list of zil_vdev_t structures.
- */
-
-#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */
-typedef struct zil_vdev {
- uint64_t vdev; /* device written */
- list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
-} zil_vdev_t;
-
-/*
- * Stable storage intent log management structure. One per dataset.
- */
-struct zilog {
- kmutex_t zl_lock; /* protects most zilog_t fields */
- struct dsl_pool *zl_dmu_pool; /* DSL pool */
- spa_t *zl_spa; /* handle for read/write log */
- const zil_header_t *zl_header; /* log header buffer */
- objset_t *zl_os; /* object set we're logging */
- zil_get_data_t *zl_get_data; /* callback to get object content */
- zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* next itx sequence number */
- uint64_t zl_commit_seq; /* committed up to this number */
- uint64_t zl_lr_seq; /* log record sequence number */
- uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
- uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
- uint32_t zl_suspend; /* log suspend count */
- kcondvar_t zl_cv_writer; /* log writer thread completion */
- kcondvar_t zl_cv_suspend; /* log suspend completion */
- uint8_t zl_suspending; /* log is currently suspending */
- uint8_t zl_keep_first; /* keep first log block in destroy */
- uint8_t zl_stop_replay; /* don't replay any further */
- uint8_t zl_stop_sync; /* for debugging */
- uint8_t zl_writer; /* boolean: write setup in progress */
- uint8_t zl_log_error; /* boolean: log write error */
- list_t zl_itx_list; /* in-memory itx list */
- uint64_t zl_itx_list_sz; /* total size of records on list */
- uint64_t zl_cur_used; /* current commit log size used */
- uint64_t zl_prev_used; /* previous commit log size used */
- list_t zl_lwb_list; /* in-flight log write list */
- list_t zl_vdev_list; /* list of zil_vdev_t (vdevs beyond the bitmap) */
- uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */
- taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
- avl_tree_t zl_dva_tree; /* track DVAs during log parse */
- clock_t zl_replay_time; /* lbolt of when replay started */
- uint64_t zl_replay_blks; /* number of log blocks replayed */
-};
-
-typedef struct zil_dva_node {
- dva_t zn_dva;
- avl_node_t zn_node;
-} zil_dva_node_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIL_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
deleted file mode 100644
index b026ae6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZIO_H
-#define _ZIO_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/avl.h>
-#include <sys/dkio.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
-
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
-
-/*
- * Gang block headers are self-checksumming and contain an array
- * of block pointers.
- */
-#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
-#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
-#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
- (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
- sizeof (uint64_t))
-
-#define ZIO_GET_IOSIZE(zio) \
- (BP_IS_GANG((zio)->io_bp) ? \
- SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
-
-typedef struct zio_gbh {
- blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
- uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
-} zio_gbh_phys_t;
-
-enum zio_checksum {
- ZIO_CHECKSUM_INHERIT = 0,
- ZIO_CHECKSUM_ON,
- ZIO_CHECKSUM_OFF,
- ZIO_CHECKSUM_LABEL,
- ZIO_CHECKSUM_GANG_HEADER,
- ZIO_CHECKSUM_ZILOG,
- ZIO_CHECKSUM_FLETCHER_2,
- ZIO_CHECKSUM_FLETCHER_4,
- ZIO_CHECKSUM_SHA256,
- ZIO_CHECKSUM_FUNCTIONS
-};
-
-#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2
-#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
-
-enum zio_compress {
- ZIO_COMPRESS_INHERIT = 0,
- ZIO_COMPRESS_ON,
- ZIO_COMPRESS_OFF,
- ZIO_COMPRESS_LZJB,
- ZIO_COMPRESS_EMPTY,
- ZIO_COMPRESS_GZIP_1,
- ZIO_COMPRESS_GZIP_2,
- ZIO_COMPRESS_GZIP_3,
- ZIO_COMPRESS_GZIP_4,
- ZIO_COMPRESS_GZIP_5,
- ZIO_COMPRESS_GZIP_6,
- ZIO_COMPRESS_GZIP_7,
- ZIO_COMPRESS_GZIP_8,
- ZIO_COMPRESS_GZIP_9,
- ZIO_COMPRESS_FUNCTIONS
-};
-
-#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
-#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
-
-#define ZIO_PRIORITY_NOW (zio_priority_table[0])
-#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
-#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
-#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
-#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
-#define ZIO_PRIORITY_FREE (zio_priority_table[5])
-#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
-#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
-#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
-#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
-#define ZIO_PRIORITY_TABLE_SIZE 10
-
-#define ZIO_FLAG_MUSTSUCCEED 0x00000
-#define ZIO_FLAG_CANFAIL 0x00001
-#define ZIO_FLAG_FAILFAST 0x00002
-#define ZIO_FLAG_CONFIG_HELD 0x00004
-#define ZIO_FLAG_CONFIG_GRABBED 0x00008
-
-#define ZIO_FLAG_DONT_CACHE 0x00010
-#define ZIO_FLAG_DONT_QUEUE 0x00020
-#define ZIO_FLAG_DONT_PROPAGATE 0x00040
-#define ZIO_FLAG_DONT_RETRY 0x00080
-
-#define ZIO_FLAG_PHYSICAL 0x00100
-#define ZIO_FLAG_IO_BYPASS 0x00200
-#define ZIO_FLAG_IO_REPAIR 0x00400
-#define ZIO_FLAG_SPECULATIVE 0x00800
-
-#define ZIO_FLAG_RESILVER 0x01000
-#define ZIO_FLAG_SCRUB 0x02000
-#define ZIO_FLAG_SCRUB_THREAD 0x04000
-#define ZIO_FLAG_SUBBLOCK 0x08000
-
-#define ZIO_FLAG_NOBOOKMARK 0x10000
-#define ZIO_FLAG_USER 0x20000
-
-#define ZIO_FLAG_METADATA 0x40000
-
-#define ZIO_FLAG_GANG_INHERIT \
- (ZIO_FLAG_CANFAIL | \
- ZIO_FLAG_FAILFAST | \
- ZIO_FLAG_CONFIG_HELD | \
- ZIO_FLAG_DONT_RETRY | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_SPECULATIVE | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
-
-#define ZIO_FLAG_VDEV_INHERIT \
- (ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_DONT_CACHE | \
- ZIO_FLAG_PHYSICAL)
-
-/*
- * We'll take the EILSEQ (Illegal byte sequence) errno
- * to indicate checksum errors.
- */
-#define ECKSUM EILSEQ
-
-typedef struct zio zio_t;
-typedef void zio_done_func_t(zio_t *zio);
-
-extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
-extern char *zio_type_name[ZIO_TYPES];
-
-/*
- * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
- * identifies any block in the pool. By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number. In summary:
- *
- * mos is objset 0
- * meta-dnode is object 0
- * root block is <objset, 0, -1, 0>
- * intent log is <objset, 0, -1, ZIL sequence number>
- *
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse. The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
- *
- * Note: this structure is passed between userland and the kernel.
- * Therefore it must not change size or alignment between 32/64 bit
- * compilation options.
- */
-typedef struct zbookmark {
- uint64_t zb_objset;
- uint64_t zb_object;
- int64_t zb_level;
- uint64_t zb_blkid;
-} zbookmark_t;
-
-struct zio {
- /* Core information about this I/O */
- zio_t *io_parent;
- zio_t *io_root;
- spa_t *io_spa;
- zbookmark_t io_bookmark;
- enum zio_checksum io_checksum;
- enum zio_compress io_compress;
- int io_ndvas;
- uint64_t io_txg;
- blkptr_t *io_bp;
- blkptr_t io_bp_copy;
- zio_t *io_child;
- zio_t *io_sibling_prev;
- zio_t *io_sibling_next;
- zio_transform_t *io_transform_stack;
- zio_t *io_logical;
-
- /* Callback info */
- zio_done_func_t *io_ready;
- zio_done_func_t *io_done;
- void *io_private;
- blkptr_t io_bp_orig;
-
- /* Data represented by this I/O */
- void *io_data;
- uint64_t io_size;
-
- /* Stuff for the vdev stack */
- vdev_t *io_vd;
- void *io_vsd;
- uint64_t io_offset;
- uint64_t io_deadline;
- uint64_t io_timestamp;
- avl_node_t io_offset_node;
- avl_node_t io_deadline_node;
- avl_tree_t *io_vdev_tree;
- zio_t *io_delegate_list;
- zio_t *io_delegate_next;
-
- /* Internal pipeline state */
- int io_flags;
- enum zio_type io_type;
- enum zio_stage io_stage;
- uint8_t io_stalled;
- uint8_t io_priority;
- struct dk_callback io_dk_callback;
- int io_cmd;
- int io_retries;
- int io_error;
- uint32_t io_numerrors;
- uint32_t io_pipeline;
- uint32_t io_async_stages;
- uint64_t io_children_notready;
- uint64_t io_children_notdone;
- void *io_waiter;
- kmutex_t io_lock;
- kcondvar_t io_cv;
-
- /* FMA state */
- uint64_t io_ena;
-};
-
-extern zio_t *zio_null(zio_t *pio, spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
-
-extern zio_t *zio_root(spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
-
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb);
-
-extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
- int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb);
-
-extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags,
- zbookmark_t *zb);
-
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private);
-
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private);
-
-extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags);
-
-extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags);
-
-extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags);
-
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t txg);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
-
-extern int zio_wait(zio_t *zio);
-extern void zio_nowait(zio_t *zio);
-
-extern void *zio_buf_alloc(size_t size);
-extern void zio_buf_free(void *buf, size_t size);
-extern void *zio_data_buf_alloc(size_t size);
-extern void zio_data_buf_free(void *buf, size_t size);
-
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-extern void zio_next_stage(zio_t *zio);
-extern void zio_next_stage_async(zio_t *zio);
-extern void zio_wait_children_done(zio_t *zio);
-
-/*
- * Delegate I/O to a child vdev.
- */
-extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
- uint64_t offset, void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
-
-extern void zio_vdev_io_bypass(zio_t *zio);
-extern void zio_vdev_io_reissue(zio_t *zio);
-extern void zio_vdev_io_redone(zio_t *zio);
-
-extern void zio_checksum_verified(zio_t *zio);
-extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
-
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
-
-boolean_t zio_should_retry(zio_t *zio);
-
-/*
- * Initial setup and teardown.
- */
-extern void zio_init(void);
-extern void zio_fini(void);
-
-/*
- * Fault injection
- */
-struct zinject_record;
-extern uint32_t zio_injection_enabled;
-extern int zio_inject_fault(char *name, int flags, int *id,
- struct zinject_record *record);
-extern int zio_inject_list_next(int *id, char *name, size_t buflen,
- struct zinject_record *record);
-extern int zio_clear_fault(int id);
-extern int zio_handle_fault_injection(zio_t *zio, int error);
-extern int zio_handle_device_injection(vdev_t *vd, int error);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
deleted file mode 100644
index bb7bd41..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIO_CHECKSUM_H
-#define _SYS_ZIO_CHECKSUM_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Signature for checksum functions.
- */
-typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
-
-/*
- * Information about each checksum function.
- */
-typedef struct zio_checksum_info {
- zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
- int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
- char *ci_name; /* descriptive name */
-} zio_checksum_info_t;
-
-extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
-
-/*
- * Checksum routines.
- */
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
-extern zio_checksum_t zio_checksum_SHA256;
-
-extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
- void *data, uint64_t size);
-extern int zio_checksum_error(zio_t *zio);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIO_CHECKSUM_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
deleted file mode 100644
index 66ee8d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZIO_COMPRESS_H
-#define _SYS_ZIO_COMPRESS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Common signature for all zio compress/decompress functions.
- */
-typedef size_t zio_compress_func_t(void *src, void *dst,
- size_t s_len, size_t d_len, int);
-typedef int zio_decompress_func_t(void *src, void *dst,
- size_t s_len, size_t d_len, int);
-
-/*
- * Information about each compression function.
- */
-typedef struct zio_compress_info {
- zio_compress_func_t *ci_compress; /* compression function */
- zio_decompress_func_t *ci_decompress; /* decompression function */
- int ci_level; /* level parameter */
- char *ci_name; /* algorithm name */
-} zio_compress_info_t;
-
-extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
-
-/*
- * Compression routines.
- */
-extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
- int level);
-
-/*
- * Compress and decompress data if necessary.
- */
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
- void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZIO_COMPRESS_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
deleted file mode 100644
index d2ddbc3..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZIO_IMPL_H
-#define _ZIO_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * I/O Groups: pipeline stage definitions.
- */
-
-typedef enum zio_stage {
- ZIO_STAGE_OPEN = 0, /* RWFCI */
- ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */
-
- ZIO_STAGE_WRITE_COMPRESS, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
-
- ZIO_STAGE_GANG_PIPELINE, /* -WFC- */
-
- ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */
- ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */
- ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */
- ZIO_STAGE_CLAIM_GANG_MEMBERS, /* ---C- */
-
- ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
- ZIO_STAGE_DVA_FREE, /* --F-- */
- ZIO_STAGE_DVA_CLAIM, /* ---C- */
-
- ZIO_STAGE_GANG_CHECKSUM_GENERATE, /* -W--- */
-
- ZIO_STAGE_READY, /* RWFCI */
-
- ZIO_STAGE_VDEV_IO_START, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
-
- ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */
-
- ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
- ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */
- ZIO_STAGE_READ_DECOMPRESS, /* R---- */
-
- ZIO_STAGE_DONE /* RWFCI */
-} zio_stage_t;
-
-/*
- * The stages for which there's some performance value in going async.
- * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
- */
-#define ZIO_ASYNC_PIPELINE_STAGES \
- ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_READ_DECOMPRESS))
-
-#define ZIO_VDEV_IO_PIPELINE \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
-
-#define ZIO_READ_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_READ_PIPELINE \
- ZIO_READ_PHYS_PIPELINE
-
-#define ZIO_WRITE_PHYS_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_WRITE_COMMON_PIPELINE \
- ZIO_WRITE_PHYS_PIPELINE
-
-#define ZIO_WRITE_PIPELINE \
- ((1U << ZIO_STAGE_WRITE_COMPRESS) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_GANG_STAGES \
- ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- (1U << ZIO_STAGE_READ_GANG_MEMBERS))
-
-#define ZIO_REWRITE_PIPELINE \
- ((1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_WRITE_ALLOCATE_PIPELINE \
- ((1U << ZIO_STAGE_DVA_ALLOCATE) | \
- ZIO_WRITE_COMMON_PIPELINE)
-
-#define ZIO_GANG_FREE_STAGES \
- ((1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS))
-
-#define ZIO_FREE_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_DVA_FREE) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_CLAIM_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_GANG_PIPELINE) | \
- (1U << ZIO_STAGE_GET_GANG_HEADER) | \
- (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \
- (1U << ZIO_STAGE_DVA_CLAIM) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_IOCTL_PIPELINE \
- ((1U << ZIO_STAGE_OPEN) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- ZIO_VDEV_IO_PIPELINE | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_WAIT_FOR_CHILDREN_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \
- (1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \
- ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \
- (1U << ZIO_STAGE_DONE))
-
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \
- ZIO_VDEV_IO_PIPELINE)
-
-#define ZIO_ERROR_PIPELINE_MASK \
- ZIO_WAIT_FOR_CHILDREN_PIPELINE
-
-typedef struct zio_transform zio_transform_t;
-struct zio_transform {
- void *zt_data;
- uint64_t zt_size;
- uint64_t zt_bufsize;
- zio_transform_t *zt_next;
-};
-
-extern void zio_inject_init(void);
-extern void zio_inject_fini(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZIO_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h b/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
deleted file mode 100644
index df85824..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ZVOL_H
-#define _SYS_ZVOL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
-extern int zvol_check_volblocksize(uint64_t volblocksize);
-extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
-extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
-extern int zvol_create_minor(const char *, dev_t);
-extern int zvol_remove_minor(const char *);
-extern int zvol_set_volsize(const char *, dev_t, uint64_t);
-extern int zvol_set_volblocksize(const char *, uint64_t);
-
-extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
-extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
-#ifndef __FreeBSD__
-extern int zvol_strategy(buf_t *bp);
-extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
-extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
-extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
-extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif
-extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
- int *rvalp);
-extern int zvol_busy(void);
-extern void zvol_init(void);
-extern void zvol_fini(void);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ZVOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c
deleted file mode 100644
index 844beb6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ /dev/null
@@ -1,611 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/txg_impl.h>
-#include <sys/dmu_impl.h>
-#include <sys/dsl_pool.h>
-#include <sys/callb.h>
-
-/*
- * Pool-wide transaction groups.
- */
-
-static void txg_sync_thread(void *arg);
-static void txg_quiesce_thread(void *arg);
-static void txg_timelimit_thread(void *arg);
-
-int txg_time = 5; /* max 5 seconds worth of delta per txg */
-
-/*
- * Prepare the txg subsystem.
- */
-void
-txg_init(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
- int c, i;
- bzero(tx, sizeof (tx_state_t));
-
- tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
- for (c = 0; c < max_ncpus; c++) {
- mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
- for (i = 0; i < TXG_SIZE; i++)
- cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL);
- }
-
- rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
- mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_timeout_exit_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
-
- tx->tx_open_txg = txg;
-}
-
-/*
- * Close down the txg subsystem.
- */
-void
-txg_fini(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- int c, i;
-
- ASSERT(tx->tx_threads == 0);
-
- cv_destroy(&tx->tx_exit_cv);
- cv_destroy(&tx->tx_timeout_exit_cv);
- cv_destroy(&tx->tx_quiesce_done_cv);
- cv_destroy(&tx->tx_quiesce_more_cv);
- cv_destroy(&tx->tx_sync_done_cv);
- cv_destroy(&tx->tx_sync_more_cv);
- rw_destroy(&tx->tx_suspend);
- mutex_destroy(&tx->tx_sync_lock);
-
- for (c = 0; c < max_ncpus; c++) {
- for (i = 0; i < TXG_SIZE; i++)
- cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
- mutex_destroy(&tx->tx_cpu[c].tc_lock);
- }
-
- kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
-
- bzero(tx, sizeof (tx_state_t));
-}
-
-/*
- * Start syncing transaction groups.
- */
-void
-txg_sync_start(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
-
- dprintf("pool %p\n", dp);
-
- ASSERT(tx->tx_threads == 0);
-
- tx->tx_threads = 3;
-
- tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
-
- tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
-
- tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
- dp, 0, &p0, TS_RUN, minclsyspri);
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-static void
-txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
-{
- CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
- mutex_enter(&tx->tx_sync_lock);
-}
-
-static void
-txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
-{
- ASSERT(*tpp != NULL);
- *tpp = NULL;
- tx->tx_threads--;
- cv_broadcast(&tx->tx_exit_cv);
- CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
- thread_exit();
-}
-
-static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
-{
- CALLB_CPR_SAFE_BEGIN(cpr);
-
- if (secmax)
- (void) cv_timedwait(cv, &tx->tx_sync_lock, secmax * hz);
- else
- cv_wait(cv, &tx->tx_sync_lock);
-
- CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
-}
-
-/*
- * Stop syncing transaction groups.
- */
-void
-txg_sync_stop(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- dprintf("pool %p\n", dp);
- /*
- * Finish off any work in progress.
- */
- ASSERT(tx->tx_threads == 3);
- txg_wait_synced(dp, 0);
-
- /*
- * Wake all 3 sync threads (one per state) and wait for them to die.
- */
- mutex_enter(&tx->tx_sync_lock);
-
- ASSERT(tx->tx_threads == 3);
-
- tx->tx_exiting = 1;
-
- cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_broadcast(&tx->tx_quiesce_done_cv);
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_broadcast(&tx->tx_timeout_exit_cv);
-
- while (tx->tx_threads != 0)
- cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
-
- tx->tx_exiting = 0;
-
- mutex_exit(&tx->tx_sync_lock);
-}
-
-uint64_t
-txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
-{
- tx_state_t *tx = &dp->dp_tx;
- tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
- uint64_t txg;
-
- mutex_enter(&tc->tc_lock);
-
- txg = tx->tx_open_txg;
- tc->tc_count[txg & TXG_MASK]++;
-
- th->th_cpu = tc;
- th->th_txg = txg;
-
- return (txg);
-}
-
-void
-txg_rele_to_quiesce(txg_handle_t *th)
-{
- tx_cpu_t *tc = th->th_cpu;
-
- mutex_exit(&tc->tc_lock);
-}
-
-void
-txg_rele_to_sync(txg_handle_t *th)
-{
- tx_cpu_t *tc = th->th_cpu;
- int g = th->th_txg & TXG_MASK;
-
- mutex_enter(&tc->tc_lock);
- ASSERT(tc->tc_count[g] != 0);
- if (--tc->tc_count[g] == 0)
- cv_broadcast(&tc->tc_cv[g]);
- mutex_exit(&tc->tc_lock);
-
- th->th_cpu = NULL; /* defensive */
-}
-
-static void
-txg_quiesce(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
- int g = txg & TXG_MASK;
- int c;
-
- /*
- * Grab all tx_cpu locks so nobody else can get into this txg.
- */
- for (c = 0; c < max_ncpus; c++)
- mutex_enter(&tx->tx_cpu[c].tc_lock);
-
- ASSERT(txg == tx->tx_open_txg);
- tx->tx_open_txg++;
-
- /*
- * Now that we've incremented tx_open_txg, we can let threads
- * enter the next transaction group.
- */
- for (c = 0; c < max_ncpus; c++)
- mutex_exit(&tx->tx_cpu[c].tc_lock);
-
- /*
- * Quiesce the transaction group by waiting for everyone to txg_exit().
- */
- for (c = 0; c < max_ncpus; c++) {
- tx_cpu_t *tc = &tx->tx_cpu[c];
- mutex_enter(&tc->tc_lock);
- while (tc->tc_count[g] != 0)
- cv_wait(&tc->tc_cv[g], &tc->tc_lock);
- mutex_exit(&tc->tc_lock);
- }
-}
-
-static void
-txg_sync_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- for (;;) {
- uint64_t txg;
-
- /*
- * We sync when there's someone waiting on us, or the
- * quiesce thread has handed off a txg to us.
- */
- while (!tx->tx_exiting &&
- tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- tx->tx_quiesced_txg == 0) {
- dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
- tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
- txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
- }
-
- /*
- * Wait until the quiesce thread hands off a txg to us,
- * prompting it to do so if necessary.
- */
- while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
- if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
- tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
- cv_broadcast(&tx->tx_quiesce_more_cv);
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
- }
-
- if (tx->tx_exiting)
- txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
-
- rw_enter(&tx->tx_suspend, RW_WRITER);
-
- /*
- * Consume the quiesced txg which has been handed off to
- * us. This may cause the quiescing thread to now be
- * able to quiesce another txg, so we must signal it.
- */
- txg = tx->tx_quiesced_txg;
- tx->tx_quiesced_txg = 0;
- tx->tx_syncing_txg = txg;
- cv_broadcast(&tx->tx_quiesce_more_cv);
- rw_exit(&tx->tx_suspend);
-
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting,
- tx->tx_sync_txg_waiting);
- mutex_exit(&tx->tx_sync_lock);
- spa_sync(dp->dp_spa, txg);
- mutex_enter(&tx->tx_sync_lock);
- rw_enter(&tx->tx_suspend, RW_WRITER);
- tx->tx_synced_txg = txg;
- tx->tx_syncing_txg = 0;
- rw_exit(&tx->tx_suspend);
- cv_broadcast(&tx->tx_sync_done_cv);
- }
-}
-
-static void
-txg_quiesce_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- for (;;) {
- uint64_t txg;
-
- /*
- * We quiesce when there's someone waiting on us.
- * However, we can only have one txg in "quiescing" or
- * "quiesced, waiting to sync" state. So we wait until
- * the "quiesced, waiting to sync" txg has been consumed
- * by the sync thread.
- */
- while (!tx->tx_exiting &&
- (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
- tx->tx_quiesced_txg != 0))
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
-
- if (tx->tx_exiting)
- txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
-
- txg = tx->tx_open_txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting,
- tx->tx_sync_txg_waiting);
- mutex_exit(&tx->tx_sync_lock);
- txg_quiesce(dp, txg);
- mutex_enter(&tx->tx_sync_lock);
-
- /*
- * Hand this txg off to the sync thread.
- */
- dprintf("quiesce done, handing off txg %llu\n", txg);
- tx->tx_quiesced_txg = txg;
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_broadcast(&tx->tx_quiesce_done_cv);
- }
-}
-
-void
-txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 3);
- if (txg == 0)
- txg = tx->tx_open_txg;
- if (tx->tx_sync_txg_waiting < txg)
- tx->tx_sync_txg_waiting = txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- while (tx->tx_synced_txg < txg) {
- dprintf("broadcasting sync more "
- "tx_synced=%llu waiting=%llu dp=%p\n",
- tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
- cv_broadcast(&tx->tx_sync_more_cv);
- cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
- }
- mutex_exit(&tx->tx_sync_lock);
-}
-
-void
-txg_wait_open(dsl_pool_t *dp, uint64_t txg)
-{
- tx_state_t *tx = &dp->dp_tx;
-
- mutex_enter(&tx->tx_sync_lock);
- ASSERT(tx->tx_threads == 3);
- if (txg == 0)
- txg = tx->tx_open_txg + 1;
- if (tx->tx_quiesce_txg_waiting < txg)
- tx->tx_quiesce_txg_waiting = txg;
- dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
- txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
- while (tx->tx_open_txg < txg) {
- cv_broadcast(&tx->tx_quiesce_more_cv);
- cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
- }
- mutex_exit(&tx->tx_sync_lock);
-}
-
-static void
-txg_timelimit_thread(void *arg)
-{
- dsl_pool_t *dp = arg;
- tx_state_t *tx = &dp->dp_tx;
- callb_cpr_t cpr;
-
- txg_thread_enter(tx, &cpr);
-
- while (!tx->tx_exiting) {
- uint64_t txg = tx->tx_open_txg + 1;
-
- txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
-
- if (tx->tx_quiesce_txg_waiting < txg)
- tx->tx_quiesce_txg_waiting = txg;
-
- while (!tx->tx_exiting && tx->tx_open_txg < txg) {
- dprintf("pushing out %llu\n", txg);
- cv_broadcast(&tx->tx_quiesce_more_cv);
- txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
- }
- }
- txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
-}
-
-int
-txg_stalled(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
-}
-
-void
-txg_suspend(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- /* XXX some code paths suspend when they are already suspended! */
- rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- rw_exit(&tx->tx_suspend);
-}
-
-/*
- * Per-txg object lists.
- */
-void
-txg_list_create(txg_list_t *tl, size_t offset)
-{
- int t;
-
- mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
-
- tl->tl_offset = offset;
-
- for (t = 0; t < TXG_SIZE; t++)
- tl->tl_head[t] = NULL;
-}
-
-void
-txg_list_destroy(txg_list_t *tl)
-{
- int t;
-
- for (t = 0; t < TXG_SIZE; t++)
- ASSERT(txg_list_empty(tl, t));
-
- mutex_destroy(&tl->tl_lock);
-}
-
-int
-txg_list_empty(txg_list_t *tl, uint64_t txg)
-{
- return (tl->tl_head[txg & TXG_MASK] == NULL);
-}
-
-/*
- * Add an entry to the list.
- * Returns 0 if it's a new entry, 1 if it's already there.
- */
-int
-txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
- int already_on_list;
-
- mutex_enter(&tl->tl_lock);
- already_on_list = tn->tn_member[t];
- if (!already_on_list) {
- tn->tn_member[t] = 1;
- tn->tn_next[t] = tl->tl_head[t];
- tl->tl_head[t] = tn;
- }
- mutex_exit(&tl->tl_lock);
-
- return (already_on_list);
-}
-
-/*
- * Remove the head of the list and return it.
- */
-void *
-txg_list_remove(txg_list_t *tl, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn;
- void *p = NULL;
-
- mutex_enter(&tl->tl_lock);
- if ((tn = tl->tl_head[t]) != NULL) {
- p = (char *)tn - tl->tl_offset;
- tl->tl_head[t] = tn->tn_next[t];
- tn->tn_next[t] = NULL;
- tn->tn_member[t] = 0;
- }
- mutex_exit(&tl->tl_lock);
-
- return (p);
-}
-
-/*
- * Remove a specific item from the list and return it.
- */
-void *
-txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn, **tp;
-
- mutex_enter(&tl->tl_lock);
-
- for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
- if ((char *)tn - tl->tl_offset == p) {
- *tp = tn->tn_next[t];
- tn->tn_next[t] = NULL;
- tn->tn_member[t] = 0;
- mutex_exit(&tl->tl_lock);
- return (p);
- }
- }
-
- mutex_exit(&tl->tl_lock);
-
- return (NULL);
-}
-
-int
-txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-
- return (tn->tn_member[t]);
-}
-
-/*
- * Walk a txg list -- only safe if you know it's not changing.
- */
-void *
-txg_list_head(txg_list_t *tl, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = tl->tl_head[t];
-
- return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
-}
-
-void *
-txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
-{
- int t = txg & TXG_MASK;
- txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-
- tn = tn->tn_next[t];
-
- return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
deleted file mode 100644
index 34d7e0c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/uberblock_impl.h>
-#include <sys/vdev_impl.h>
-
-int
-uberblock_verify(uberblock_t *ub)
-{
- if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
- byteswap_uint64_array(ub, sizeof (uberblock_t));
-
- if (ub->ub_magic != UBERBLOCK_MAGIC)
- return (EINVAL);
-
- return (0);
-}
-
-/*
- * Update the uberblock and return a boolean value indicating whether
- * anything changed in this transaction group.
- */
-int
-uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
-{
- ASSERT(ub->ub_txg < txg);
-
- /*
- * We explicitly do not set ub_version here, so that older versions
- * continue to be written with the previous uberblock version.
- */
- ub->ub_magic = UBERBLOCK_MAGIC;
- ub->ub_txg = txg;
- ub->ub_guid_sum = rvd->vdev_guid_sum;
- ub->ub_timestamp = gethrestime_sec();
-
- return (ub->ub_rootbp.blk_birth == txg);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c
deleted file mode 100644
index b52e729..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/unique.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/avl.h>
-#include <sys/unique.h>
-
-static avl_tree_t unique_avl;
-static kmutex_t unique_mtx; /* Lock never initialized. */
-SX_SYSINIT(unique, &unique_mtx, "unique lock");
-
-typedef struct unique {
- avl_node_t un_link;
- uint64_t un_value;
-} unique_t;
-
-#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
-
-static int
-unique_compare(const void *a, const void *b)
-{
- const unique_t *una = a;
- const unique_t *unb = b;
-
- if (una->un_value < unb->un_value)
- return (-1);
- if (una->un_value > unb->un_value)
- return (+1);
- return (0);
-}
-
-void
-unique_init(void)
-{
- avl_create(&unique_avl, unique_compare,
- sizeof (unique_t), offsetof(unique_t, un_link));
-}
-
-uint64_t
-unique_create(void)
-{
- return (unique_insert(0));
-}
-
-uint64_t
-unique_insert(uint64_t value)
-{
- avl_index_t idx;
- unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
-
- un->un_value = value;
-
- mutex_enter(&unique_mtx);
- while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
- avl_find(&unique_avl, un, &idx)) {
- mutex_exit(&unique_mtx);
- (void) random_get_pseudo_bytes((void*)&un->un_value,
- sizeof (un->un_value));
- un->un_value &= UNIQUE_MASK;
- mutex_enter(&unique_mtx);
- }
-
- avl_insert(&unique_avl, un, idx);
- mutex_exit(&unique_mtx);
-
- return (un->un_value);
-}
-
-void
-unique_remove(uint64_t value)
-{
- unique_t un_tofind;
- unique_t *un;
-
- un_tofind.un_value = value;
- mutex_enter(&unique_mtx);
- un = avl_find(&unique_avl, &un_tofind, NULL);
- if (un != NULL) {
- avl_remove(&unique_avl, un);
- kmem_free(un, sizeof (unique_t));
- }
- mutex_exit(&unique_mtx);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c
deleted file mode 100644
index b966099..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ /dev/null
@@ -1,1915 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/dmu_tx.h>
-#include <sys/vdev_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/metaslab.h>
-#include <sys/metaslab_impl.h>
-#include <sys/space_map.h>
-#include <sys/zio.h>
-#include <sys/zap.h>
-#include <sys/fs/zfs.h>
-
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
-
-/*
- * Virtual device management.
- */
-
-static vdev_ops_t *vdev_ops_table[] = {
- &vdev_root_ops,
- &vdev_raidz_ops,
- &vdev_mirror_ops,
- &vdev_replacing_ops,
- &vdev_spare_ops,
-#ifdef _KERNEL
- &vdev_geom_ops,
-#else
- &vdev_disk_ops,
- &vdev_file_ops,
-#endif
- &vdev_missing_ops,
- NULL
-};
-
-/* maximum scrub/resilver I/O queue */
-int zfs_scrub_limit = 70;
-
-/*
- * Given a vdev type, return the appropriate ops vector.
- */
-static vdev_ops_t *
-vdev_getops(const char *type)
-{
- vdev_ops_t *ops, **opspp;
-
- for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
- if (strcmp(ops->vdev_op_type, type) == 0)
- break;
-
- return (ops);
-}
-
-/*
- * Default asize function: return the MAX of psize with the asize of
- * all children. This is what's used by anything other than RAID-Z.
- */
-uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
-{
- uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
- uint64_t csize;
- uint64_t c;
-
- for (c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
- asize = MAX(asize, csize);
- }
-
- return (asize);
-}
-
-/*
- * Get the replaceable or attachable device size.
- * If the parent is a mirror or raidz, the replaceable size is the minimum
- * psize of all its children. For the rest, just return our own psize.
- *
- * e.g.
- * psize rsize
- * root - -
- * mirror/raidz - -
- * disk1 20g 20g
- * disk2 40g 20g
- * disk3 80g 80g
- */
-uint64_t
-vdev_get_rsize(vdev_t *vd)
-{
- vdev_t *pvd, *cvd;
- uint64_t c, rsize;
-
- pvd = vd->vdev_parent;
-
- /*
- * If our parent is NULL or the root, just return our own psize.
- */
- if (pvd == NULL || pvd->vdev_parent == NULL)
- return (vd->vdev_psize);
-
- rsize = 0;
-
- for (c = 0; c < pvd->vdev_children; c++) {
- cvd = pvd->vdev_child[c];
- rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
- }
-
- return (rsize);
-}
-
-vdev_t *
-vdev_lookup_top(spa_t *spa, uint64_t vdev)
-{
- vdev_t *rvd = spa->spa_root_vdev;
-
- if (vdev < rvd->vdev_children)
- return (rvd->vdev_child[vdev]);
-
- return (NULL);
-}
-
-vdev_t *
-vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
-{
- int c;
- vdev_t *mvd;
-
- if (vd->vdev_guid == guid)
- return (vd);
-
- for (c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
- NULL)
- return (mvd);
-
- return (NULL);
-}
-
-void
-vdev_add_child(vdev_t *pvd, vdev_t *cvd)
-{
- size_t oldsize, newsize;
- uint64_t id = cvd->vdev_id;
- vdev_t **newchild;
-
- ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
- ASSERT(cvd->vdev_parent == NULL);
-
- cvd->vdev_parent = pvd;
-
- if (pvd == NULL)
- return;
-
- ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
-
- oldsize = pvd->vdev_children * sizeof (vdev_t *);
- pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
- newsize = pvd->vdev_children * sizeof (vdev_t *);
-
- newchild = kmem_zalloc(newsize, KM_SLEEP);
- if (pvd->vdev_child != NULL) {
- bcopy(pvd->vdev_child, newchild, oldsize);
- kmem_free(pvd->vdev_child, oldsize);
- }
-
- pvd->vdev_child = newchild;
- pvd->vdev_child[id] = cvd;
-
- cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
- ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
-
- /*
- * Walk up all ancestors to update guid sum.
- */
- for (; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
-}
-
-void
-vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
-{
- int c;
- uint_t id = cvd->vdev_id;
-
- ASSERT(cvd->vdev_parent == pvd);
-
- if (pvd == NULL)
- return;
-
- ASSERT(id < pvd->vdev_children);
- ASSERT(pvd->vdev_child[id] == cvd);
-
- pvd->vdev_child[id] = NULL;
- cvd->vdev_parent = NULL;
-
- for (c = 0; c < pvd->vdev_children; c++)
- if (pvd->vdev_child[c])
- break;
-
- if (c == pvd->vdev_children) {
- kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
- pvd->vdev_child = NULL;
- pvd->vdev_children = 0;
- }
-
- /*
- * Walk up all ancestors to update guid sum.
- */
- for (; pvd != NULL; pvd = pvd->vdev_parent)
- pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
- if (cvd->vdev_ops->vdev_op_leaf)
- cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
-}
-
-/*
- * Remove any holes in the child array.
- */
-void
-vdev_compact_children(vdev_t *pvd)
-{
- vdev_t **newchild, *cvd;
- int oldc = pvd->vdev_children;
- int newc, c;
-
- ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
-
- for (c = newc = 0; c < oldc; c++)
- if (pvd->vdev_child[c])
- newc++;
-
- newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
-
- for (c = newc = 0; c < oldc; c++) {
- if ((cvd = pvd->vdev_child[c]) != NULL) {
- newchild[newc] = cvd;
- cvd->vdev_id = newc++;
- }
- }
-
- kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
- pvd->vdev_child = newchild;
- pvd->vdev_children = newc;
-}
-
-/*
- * Allocate and minimally initialize a vdev_t.
- */
-static vdev_t *
-vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
-{
- vdev_t *vd;
-
- vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
-
- if (spa->spa_root_vdev == NULL) {
- ASSERT(ops == &vdev_root_ops);
- spa->spa_root_vdev = vd;
- }
-
- if (guid == 0) {
- if (spa->spa_root_vdev == vd) {
- /*
- * The root vdev's guid will also be the pool guid,
- * which must be unique among all pools.
- */
- while (guid == 0 || spa_guid_exists(guid, 0))
- guid = spa_get_random(-1ULL);
- } else {
- /*
- * Any other vdev's guid must be unique within the pool.
- */
- while (guid == 0 ||
- spa_guid_exists(spa_guid(spa), guid))
- guid = spa_get_random(-1ULL);
- }
- ASSERT(!spa_guid_exists(spa_guid(spa), guid));
- }
-
- vd->vdev_spa = spa;
- vd->vdev_id = id;
- vd->vdev_guid = guid;
- vd->vdev_guid_sum = guid;
- vd->vdev_ops = ops;
- vd->vdev_state = VDEV_STATE_CLOSED;
-
- mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
- space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
- space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
- txg_list_create(&vd->vdev_ms_list,
- offsetof(struct metaslab, ms_txg_node));
- txg_list_create(&vd->vdev_dtl_list,
- offsetof(struct vdev, vdev_dtl_node));
- vd->vdev_stat.vs_timestamp = gethrtime();
-
- return (vd);
-}
-
-/*
- * Free a vdev_t that has been removed from service.
- */
-static void
-vdev_free_common(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- if (vd->vdev_path)
- spa_strfree(vd->vdev_path);
- if (vd->vdev_devid)
- spa_strfree(vd->vdev_devid);
-
- if (vd->vdev_isspare)
- spa_spare_remove(vd);
-
- txg_list_destroy(&vd->vdev_ms_list);
- txg_list_destroy(&vd->vdev_dtl_list);
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_unload(&vd->vdev_dtl_map);
- space_map_destroy(&vd->vdev_dtl_map);
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- space_map_destroy(&vd->vdev_dtl_scrub);
- mutex_exit(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_dtl_lock);
- mutex_destroy(&vd->vdev_stat_lock);
-
- if (vd == spa->spa_root_vdev)
- spa->spa_root_vdev = NULL;
-
- kmem_free(vd, sizeof (vdev_t));
-}
-
-/*
- * Allocate a new vdev. The 'alloctype' is used to control whether we are
- * creating a new vdev or loading an existing one - the behavior is slightly
- * different for each case.
- */
-int
-vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
- int alloctype)
-{
- vdev_ops_t *ops;
- char *type;
- uint64_t guid = 0;
- vdev_t *vd;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
- return (EINVAL);
-
- if ((ops = vdev_getops(type)) == NULL)
- return (EINVAL);
-
- /*
- * If this is a load, get the vdev guid from the nvlist.
- * Otherwise, vdev_alloc_common() will generate one for us.
- */
- if (alloctype == VDEV_ALLOC_LOAD) {
- uint64_t label_id;
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
- label_id != id)
- return (EINVAL);
-
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (EINVAL);
- } else if (alloctype == VDEV_ALLOC_SPARE) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
- return (EINVAL);
- }
-
- /*
- * The first allocated vdev must be of type 'root'.
- */
- if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
- return (EINVAL);
-
- vd = vdev_alloc_common(spa, id, guid, ops);
-
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
- vd->vdev_path = spa_strdup(vd->vdev_path);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
- vd->vdev_devid = spa_strdup(vd->vdev_devid);
-
- /*
- * Set the nparity propery for RAID-Z vdevs.
- */
- if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &vd->vdev_nparity) == 0) {
- /*
- * Currently, we can only support 2 parity devices.
- */
- if (vd->vdev_nparity > 2)
- return (EINVAL);
- /*
- * Older versions can only support 1 parity device.
- */
- if (vd->vdev_nparity == 2 &&
- spa_version(spa) < ZFS_VERSION_RAID6)
- return (ENOTSUP);
-
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= ZFS_VERSION_RAID6)
- return (EINVAL);
-
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- vd->vdev_nparity = 1;
- }
- } else {
- vd->vdev_nparity = 0;
- }
-
- /*
- * Set the whole_disk property. If it's not specified, leave the value
- * as -1.
- */
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- &vd->vdev_wholedisk) != 0)
- vd->vdev_wholedisk = -1ULL;
-
- /*
- * Look for the 'not present' flag. This will only be set if the device
- * was not present at the time of import.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
- &vd->vdev_not_present);
-
- /*
- * Get the alignment requirement.
- */
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
-
- /*
- * If we're a top-level vdev, try to load the allocation parameters.
- */
- if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- &vd->vdev_ms_array);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- &vd->vdev_ms_shift);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
- &vd->vdev_asize);
- }
-
- /*
- * If we're a leaf vdev, try to load the DTL object and offline state.
- */
- if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
- &vd->vdev_dtl.smo_object);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- &vd->vdev_offline);
- }
-
- /*
- * Add ourselves to the parent's list of children.
- */
- vdev_add_child(parent, vd);
-
- *vdp = vd;
-
- return (0);
-}
-
-void
-vdev_free(vdev_t *vd)
-{
- int c;
-
- /*
- * vdev_free() implies closing the vdev first. This is simpler than
- * trying to ensure complicated semantics for all callers.
- */
- vdev_close(vd);
-
- ASSERT(!list_link_active(&vd->vdev_dirty_node));
-
- /*
- * Free all children.
- */
- for (c = 0; c < vd->vdev_children; c++)
- vdev_free(vd->vdev_child[c]);
-
- ASSERT(vd->vdev_child == NULL);
- ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
-
- /*
- * Discard allocation state.
- */
- if (vd == vd->vdev_top)
- vdev_metaslab_fini(vd);
-
- ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
- ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
- ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
-
- /*
- * Remove this vdev from its parent's child list.
- */
- vdev_remove_child(vd->vdev_parent, vd);
-
- ASSERT(vd->vdev_parent == NULL);
-
- vdev_free_common(vd);
-}
-
-/*
- * Transfer top-level vdev state from svd to tvd.
- */
-static void
-vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
-{
- spa_t *spa = svd->vdev_spa;
- metaslab_t *msp;
- vdev_t *vd;
- int t;
-
- ASSERT(tvd == tvd->vdev_top);
-
- tvd->vdev_ms_array = svd->vdev_ms_array;
- tvd->vdev_ms_shift = svd->vdev_ms_shift;
- tvd->vdev_ms_count = svd->vdev_ms_count;
-
- svd->vdev_ms_array = 0;
- svd->vdev_ms_shift = 0;
- svd->vdev_ms_count = 0;
-
- tvd->vdev_mg = svd->vdev_mg;
- tvd->vdev_ms = svd->vdev_ms;
-
- svd->vdev_mg = NULL;
- svd->vdev_ms = NULL;
-
- if (tvd->vdev_mg != NULL)
- tvd->vdev_mg->mg_vd = tvd;
-
- tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
- tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
- tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
-
- svd->vdev_stat.vs_alloc = 0;
- svd->vdev_stat.vs_space = 0;
- svd->vdev_stat.vs_dspace = 0;
-
- for (t = 0; t < TXG_SIZE; t++) {
- while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
- (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
- while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
- (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
- if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
- (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
- }
-
- if (list_link_active(&svd->vdev_dirty_node)) {
- vdev_config_clean(svd);
- vdev_config_dirty(tvd);
- }
-
- tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
- svd->vdev_reopen_wanted = 0;
-
- tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
- svd->vdev_deflate_ratio = 0;
-}
-
-static void
-vdev_top_update(vdev_t *tvd, vdev_t *vd)
-{
- int c;
-
- if (vd == NULL)
- return;
-
- vd->vdev_top = tvd;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_top_update(tvd, vd->vdev_child[c]);
-}
-
-/*
- * Add a mirror/replacing vdev above an existing vdev.
- */
-vdev_t *
-vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
-{
- spa_t *spa = cvd->vdev_spa;
- vdev_t *pvd = cvd->vdev_parent;
- vdev_t *mvd;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
-
- mvd->vdev_asize = cvd->vdev_asize;
- mvd->vdev_ashift = cvd->vdev_ashift;
- mvd->vdev_state = cvd->vdev_state;
-
- vdev_remove_child(pvd, cvd);
- vdev_add_child(pvd, mvd);
- cvd->vdev_id = mvd->vdev_children;
- vdev_add_child(mvd, cvd);
- vdev_top_update(cvd->vdev_top, cvd->vdev_top);
-
- if (mvd == mvd->vdev_top)
- vdev_top_transfer(cvd, mvd);
-
- return (mvd);
-}
-
-/*
- * Remove a 1-way mirror/replacing vdev from the tree.
- */
-void
-vdev_remove_parent(vdev_t *cvd)
-{
- vdev_t *mvd = cvd->vdev_parent;
- vdev_t *pvd = mvd->vdev_parent;
-
- ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
-
- ASSERT(mvd->vdev_children == 1);
- ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
- mvd->vdev_ops == &vdev_replacing_ops ||
- mvd->vdev_ops == &vdev_spare_ops);
- cvd->vdev_ashift = mvd->vdev_ashift;
-
- vdev_remove_child(mvd, cvd);
- vdev_remove_child(pvd, mvd);
- cvd->vdev_id = mvd->vdev_id;
- vdev_add_child(pvd, cvd);
- /*
- * If we created a new toplevel vdev, then we need to change the child's
- * vdev GUID to match the old toplevel vdev. Otherwise, we could have
- * detached an offline device, and when we go to import the pool we'll
- * think we have two toplevel vdevs, instead of a different version of
- * the same toplevel vdev.
- */
- if (cvd->vdev_top == cvd) {
- pvd->vdev_guid_sum -= cvd->vdev_guid;
- cvd->vdev_guid_sum -= cvd->vdev_guid;
- cvd->vdev_guid = mvd->vdev_guid;
- cvd->vdev_guid_sum += mvd->vdev_guid;
- pvd->vdev_guid_sum += cvd->vdev_guid;
- }
- vdev_top_update(cvd->vdev_top, cvd->vdev_top);
-
- if (cvd == cvd->vdev_top)
- vdev_top_transfer(mvd, cvd);
-
- ASSERT(mvd->vdev_children == 0);
- vdev_free(mvd);
-}
-
-int
-vdev_metaslab_init(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
- metaslab_class_t *mc = spa_metaslab_class_select(spa);
- uint64_t m;
- uint64_t oldc = vd->vdev_ms_count;
- uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
- metaslab_t **mspp;
- int error;
-
- if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
- return (0);
-
- dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
-
- ASSERT(oldc <= newc);
-
- if (vd->vdev_mg == NULL)
- vd->vdev_mg = metaslab_group_create(mc, vd);
-
- mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
-
- if (oldc != 0) {
- bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
- kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
- }
-
- vd->vdev_ms = mspp;
- vd->vdev_ms_count = newc;
-
- for (m = oldc; m < newc; m++) {
- space_map_obj_t smo = { 0, 0, 0 };
- if (txg == 0) {
- uint64_t object = 0;
- error = dmu_read(mos, vd->vdev_ms_array,
- m * sizeof (uint64_t), sizeof (uint64_t), &object);
- if (error)
- return (error);
- if (object != 0) {
- dmu_buf_t *db;
- error = dmu_bonus_hold(mos, object, FTAG, &db);
- if (error)
- return (error);
- ASSERT3U(db->db_size, ==, sizeof (smo));
- bcopy(db->db_data, &smo, db->db_size);
- ASSERT3U(smo.smo_object, ==, object);
- dmu_buf_rele(db, FTAG);
- }
- }
- vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
- m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
- }
-
- return (0);
-}
-
-void
-vdev_metaslab_fini(vdev_t *vd)
-{
- uint64_t m;
- uint64_t count = vd->vdev_ms_count;
-
- if (vd->vdev_ms != NULL) {
- for (m = 0; m < count; m++)
- if (vd->vdev_ms[m] != NULL)
- metaslab_fini(vd->vdev_ms[m]);
- kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
- vd->vdev_ms = NULL;
- }
-}
-
-/*
- * Prepare a virtual device for access.
- */
-int
-vdev_open(vdev_t *vd)
-{
- int error;
- int c;
- uint64_t osize = 0;
- uint64_t asize, psize;
- uint64_t ashift = 0;
-
- ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
- vd->vdev_state == VDEV_STATE_CANT_OPEN ||
- vd->vdev_state == VDEV_STATE_OFFLINE);
-
- if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
- vd->vdev_fault_arg >>= 1;
- else
- vd->vdev_fault_mode = VDEV_FAULT_NONE;
-
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
-
- if (vd->vdev_ops->vdev_op_leaf) {
- vdev_cache_init(vd);
- vdev_queue_init(vd);
- vd->vdev_cache_active = B_TRUE;
- }
-
- if (vd->vdev_offline) {
- ASSERT(vd->vdev_children == 0);
- vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
- return (ENXIO);
- }
-
- error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
-
- if (zio_injection_enabled && error == 0)
- error = zio_handle_device_injection(vd, ENXIO);
-
- dprintf("%s = %d, osize %llu, state = %d\n",
- vdev_description(vd), error, osize, vd->vdev_state);
-
- if (error) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- vd->vdev_stat.vs_aux);
- return (error);
- }
-
- vd->vdev_state = VDEV_STATE_HEALTHY;
-
- for (c = 0; c < vd->vdev_children; c++)
- if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
- VDEV_AUX_NONE);
- break;
- }
-
- osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
-
- if (vd->vdev_children == 0) {
- if (osize < SPA_MINDEVSIZE) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_TOO_SMALL);
- return (EOVERFLOW);
- }
- psize = osize;
- asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
- } else {
- if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
- (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_TOO_SMALL);
- return (EOVERFLOW);
- }
- psize = 0;
- asize = osize;
- }
-
- vd->vdev_psize = psize;
-
- if (vd->vdev_asize == 0) {
- /*
- * This is the first-ever open, so use the computed values.
- * For testing purposes, a higher ashift can be requested.
- */
- vd->vdev_asize = asize;
- vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
- } else {
- /*
- * Make sure the alignment requirement hasn't increased.
- */
- if (ashift > vd->vdev_top->vdev_ashift) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
-
- /*
- * Make sure the device hasn't shrunk.
- */
- if (asize < vd->vdev_asize) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (EINVAL);
- }
-
- /*
- * If all children are healthy and the asize has increased,
- * then we've experienced dynamic LUN growth.
- */
- if (vd->vdev_state == VDEV_STATE_HEALTHY &&
- asize > vd->vdev_asize) {
- vd->vdev_asize = asize;
- }
- }
-
- /*
- * If this is a top-level vdev, compute the raidz-deflation
- * ratio. Note, we hard-code in 128k (1<<17) because it is the
- * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
- * changes, this algorithm must never change, or we will
- * inconsistently account for existing bp's.
- */
- if (vd->vdev_top == vd) {
- vd->vdev_deflate_ratio = (1<<17) /
- (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
- }
-
- /*
- * This allows the ZFS DE to close cases appropriately. If a device
- * goes away and later returns, we want to close the associated case.
- * But it's not enough to simply post this only when a device goes from
- * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
- * back, we also need to close the case (otherwise we will try to replay
- * it). So we have to post this notifier every time. Since this only
- * occurs during pool open or error recovery, this should not be an
- * issue.
- */
- zfs_post_ok(vd->vdev_spa, vd);
-
- return (0);
-}
-
-/*
- * Called once the vdevs are all opened, this routine validates the label
- * contents. This needs to be done before vdev_load() so that we don't
- * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen()
- * won't succeed if the device has been changed underneath.
- *
- * This function will only return failure if one of the vdevs indicates that it
- * has since been destroyed or exported. This is only possible if
- * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
- * will be updated but the function will return 0.
- */
-int
-vdev_validate(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- int c;
- nvlist_t *label;
- uint64_t guid;
- uint64_t state;
-
- for (c = 0; c < vd->vdev_children; c++)
- if (vdev_validate(vd->vdev_child[c]) != 0)
- return (EBADF);
-
- /*
- * If the device has already failed, or was marked offline, don't do
- * any further validation. Otherwise, label I/O will fail and we will
- * overwrite the previous state.
- */
- if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {
-
- if ((label = vdev_label_read_config(vd)) == NULL) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LABEL);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
- &guid) != 0 || guid != spa_guid(spa)) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
- &guid) != 0 || guid != vd->vdev_guid) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (0);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0) {
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (0);
- }
-
- nvlist_free(label);
-
- if (spa->spa_load_state == SPA_LOAD_OPEN &&
- state != POOL_STATE_ACTIVE)
- return (EBADF);
- }
-
- /*
- * If we were able to open and validate a vdev that was previously
- * marked permanently unavailable, clear that state now.
- */
- if (vd->vdev_not_present)
- vd->vdev_not_present = 0;
-
- return (0);
-}
-
-/*
- * Close a virtual device.
- */
-void
-vdev_close(vdev_t *vd)
-{
- vd->vdev_ops->vdev_op_close(vd);
-
- if (vd->vdev_cache_active) {
- vdev_cache_fini(vd);
- vdev_queue_fini(vd);
- vd->vdev_cache_active = B_FALSE;
- }
-
- /*
- * We record the previous state before we close it, so that if we are
- * doing a reopen(), we don't generate FMA ereports if we notice that
- * it's still faulted.
- */
- vd->vdev_prevstate = vd->vdev_state;
-
- if (vd->vdev_offline)
- vd->vdev_state = VDEV_STATE_OFFLINE;
- else
- vd->vdev_state = VDEV_STATE_CLOSED;
- vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
-}
-
-void
-vdev_reopen(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- vdev_close(vd);
- (void) vdev_open(vd);
-
- /*
- * Call vdev_validate() here to make sure we have the same device.
- * Otherwise, a device with an invalid label could be successfully
- * opened in response to vdev_reopen().
- *
- * The downside to this is that if the user is simply experimenting by
- * overwriting an entire disk, we'll fault the device rather than
- * demonstrate self-healing capabilities. On the other hand, with
- * proper FMA integration, the series of errors we'd see from the device
- * would result in a faulted device anyway. Given that this doesn't
- * model any real-world corruption, it's better to catch this here and
- * correctly identify that the device has either changed beneath us, or
- * is corrupted beyond recognition.
- */
- (void) vdev_validate(vd);
-
- /*
- * Reassess root vdev's health.
- */
- vdev_propagate_state(spa->spa_root_vdev);
-}
-
-int
-vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
-{
- int error;
-
- /*
- * Normally, partial opens (e.g. of a mirror) are allowed.
- * For a create, however, we want to fail the request if
- * there are any components we can't open.
- */
- error = vdev_open(vd);
-
- if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
- vdev_close(vd);
- return (error ? error : ENXIO);
- }
-
- /*
- * Recursively initialize all labels.
- */
- if ((error = vdev_label_init(vd, txg, isreplacing ?
- VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
- vdev_close(vd);
- return (error);
- }
-
- return (0);
-}
-
-/*
- * The is the latter half of vdev_create(). It is distinct because it
- * involves initiating transactions in order to do metaslab creation.
- * For creation, we want to try to create all vdevs at once and then undo it
- * if anything fails; this is much harder if we have pending transactions.
- */
-void
-vdev_init(vdev_t *vd, uint64_t txg)
-{
- /*
- * Aim for roughly 200 metaslabs per vdev.
- */
- vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
- vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
-
- /*
- * Initialize the vdev's metaslabs. This can't fail because
- * there's nothing to read when creating all new metaslabs.
- */
- VERIFY(vdev_metaslab_init(vd, txg) == 0);
-}
-
-void
-vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
-{
- ASSERT(vd == vd->vdev_top);
- ASSERT(ISP2(flags));
-
- if (flags & VDD_METASLAB)
- (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
-
- if (flags & VDD_DTL)
- (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
-
- (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
-}
-
-void
-vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
-{
- mutex_enter(sm->sm_lock);
- if (!space_map_contains(sm, txg, size))
- space_map_add(sm, txg, size);
- mutex_exit(sm->sm_lock);
-}
-
-int
-vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
-{
- int dirty;
-
- /*
- * Quick test without the lock -- covers the common case that
- * there are no dirty time segments.
- */
- if (sm->sm_space == 0)
- return (0);
-
- mutex_enter(sm->sm_lock);
- dirty = space_map_contains(sm, txg, size);
- mutex_exit(sm->sm_lock);
-
- return (dirty);
-}
-
-/*
- * Reassess DTLs after a config change or scrub completion.
- */
-void
-vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
-{
- spa_t *spa = vd->vdev_spa;
- int c;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- if (vd->vdev_children == 0) {
- mutex_enter(&vd->vdev_dtl_lock);
- /*
- * We're successfully scrubbed everything up to scrub_txg.
- * Therefore, excise all old DTLs up to that point, then
- * fold in the DTLs for everything we couldn't scrub.
- */
- if (scrub_txg != 0) {
- space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
- space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
- }
- if (scrub_done)
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- mutex_exit(&vd->vdev_dtl_lock);
- if (txg != 0)
- vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
- }
-
- /*
- * Make sure the DTLs are always correct under the scrub lock.
- */
- if (vd == spa->spa_root_vdev)
- mutex_enter(&spa->spa_scrub_lock);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
- space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
- mutex_exit(&vd->vdev_dtl_lock);
-
- for (c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
- space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
- mutex_exit(&vd->vdev_dtl_lock);
- }
-
- if (vd == spa->spa_root_vdev)
- mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-vdev_dtl_load(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- space_map_obj_t *smo = &vd->vdev_dtl;
- objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *db;
- int error;
-
- ASSERT(vd->vdev_children == 0);
-
- if (smo->smo_object == 0)
- return (0);
-
- if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
- return (error);
-
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(db->db_data, smo, db->db_size);
- dmu_buf_rele(db, FTAG);
-
- mutex_enter(&vd->vdev_dtl_lock);
- error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
- mutex_exit(&vd->vdev_dtl_lock);
-
- return (error);
-}
-
-void
-vdev_dtl_sync(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- space_map_obj_t *smo = &vd->vdev_dtl;
- space_map_t *sm = &vd->vdev_dtl_map;
- objset_t *mos = spa->spa_meta_objset;
- space_map_t smsync;
- kmutex_t smlock;
- dmu_buf_t *db;
- dmu_tx_t *tx;
-
- dprintf("%s in txg %llu pass %d\n",
- vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
-
- if (vd->vdev_detached) {
- if (smo->smo_object != 0) {
- int err = dmu_object_free(mos, smo->smo_object, tx);
- ASSERT3U(err, ==, 0);
- smo->smo_object = 0;
- }
- dmu_tx_commit(tx);
- dprintf("detach %s committed in txg %llu\n",
- vdev_description(vd), txg);
- return;
- }
-
- if (smo->smo_object == 0) {
- ASSERT(smo->smo_objsize == 0);
- ASSERT(smo->smo_alloc == 0);
- smo->smo_object = dmu_object_alloc(mos,
- DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
- DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
- ASSERT(smo->smo_object != 0);
- vdev_config_dirty(vd->vdev_top);
- }
-
- mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
-
- space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
- &smlock);
-
- mutex_enter(&smlock);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_walk(sm, space_map_add, &smsync);
- mutex_exit(&vd->vdev_dtl_lock);
-
- space_map_truncate(smo, mos, tx);
- space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
-
- space_map_destroy(&smsync);
-
- mutex_exit(&smlock);
- mutex_destroy(&smlock);
-
- VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db, FTAG);
-
- dmu_tx_commit(tx);
-}
-
-void
-vdev_load(vdev_t *vd)
-{
- int c;
-
- /*
- * Recursively load all children.
- */
- for (c = 0; c < vd->vdev_children; c++)
- vdev_load(vd->vdev_child[c]);
-
- /*
- * If this is a top-level vdev, initialize its metaslabs.
- */
- if (vd == vd->vdev_top &&
- (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
- vdev_metaslab_init(vd, 0) != 0))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
-
- /*
- * If this is a leaf vdev, load its DTL.
- */
- if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
-}
-
-/*
- * This special case of vdev_spare() is used for hot spares. It's sole purpose
- * it to set the vdev state for the associated vdev. To do this, we make sure
- * that we can open the underlying device, then try to read the label, and make
- * sure that the label is sane and that it hasn't been repurposed to another
- * pool.
- */
-int
-vdev_validate_spare(vdev_t *vd)
-{
- nvlist_t *label;
- uint64_t guid, version;
- uint64_t state;
-
- if ((label = vdev_label_read_config(vd)) == NULL) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- return (-1);
- }
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
- version > ZFS_VERSION ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
- guid != vd->vdev_guid ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
- nvlist_free(label);
- return (-1);
- }
-
- spa_spare_add(vd);
-
- /*
- * We don't actually check the pool state here. If it's in fact in
- * use by another pool, we update this fact on the fly when requested.
- */
- nvlist_free(label);
- return (0);
-}
-
-void
-vdev_sync_done(vdev_t *vd, uint64_t txg)
-{
- metaslab_t *msp;
-
- dprintf("%s txg %llu\n", vdev_description(vd), txg);
-
- while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
- metaslab_sync_done(msp, txg);
-}
-
-void
-vdev_sync(vdev_t *vd, uint64_t txg)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *lvd;
- metaslab_t *msp;
- dmu_tx_t *tx;
-
- dprintf("%s txg %llu pass %d\n",
- vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
-
- if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
- ASSERT(vd == vd->vdev_top);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
- ASSERT(vd->vdev_ms_array != 0);
- vdev_config_dirty(vd);
- dmu_tx_commit(tx);
- }
-
- while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
- metaslab_sync(msp, txg);
- (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
- }
-
- while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
- vdev_dtl_sync(lvd, txg);
-
- (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
-}
-
-uint64_t
-vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
-{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
-}
-
-void
-vdev_io_start(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_start(zio);
-}
-
-void
-vdev_io_done(zio_t *zio)
-{
- zio->io_vd->vdev_ops->vdev_op_io_done(zio);
-}
-
-const char *
-vdev_description(vdev_t *vd)
-{
- if (vd == NULL || vd->vdev_ops == NULL)
- return ("<unknown>");
-
- if (vd->vdev_path != NULL)
- return (vd->vdev_path);
-
- if (vd->vdev_parent == NULL)
- return (spa_name(vd->vdev_spa));
-
- return (vd->vdev_ops->vdev_op_type);
-}
-
-int
-vdev_online(spa_t *spa, uint64_t guid)
-{
- vdev_t *rvd, *vd;
- uint64_t txg;
-
- txg = spa_vdev_enter(spa);
-
- rvd = spa->spa_root_vdev;
-
- if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- dprintf("ONLINE: %s\n", vdev_description(vd));
-
- vd->vdev_offline = B_FALSE;
- vd->vdev_tmpoffline = B_FALSE;
- vdev_reopen(vd->vdev_top);
-
- vdev_config_dirty(vd->vdev_top);
-
- (void) spa_vdev_exit(spa, NULL, txg, 0);
-
- VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
- return (0);
-}
-
-int
-vdev_offline(spa_t *spa, uint64_t guid, int istmp)
-{
- vdev_t *rvd, *vd;
- uint64_t txg;
-
- txg = spa_vdev_enter(spa);
-
- rvd = spa->spa_root_vdev;
-
- if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-
- dprintf("OFFLINE: %s\n", vdev_description(vd));
-
- /*
- * If the device isn't already offline, try to offline it.
- */
- if (!vd->vdev_offline) {
- /*
- * If this device's top-level vdev has a non-empty DTL,
- * don't allow the device to be offlined.
- *
- * XXX -- make this more precise by allowing the offline
- * as long as the remaining devices don't have any DTL holes.
- */
- if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * Offline this device and reopen its top-level vdev.
- * If this action results in the top-level vdev becoming
- * unusable, undo it and fail the request.
- */
- vd->vdev_offline = B_TRUE;
- vdev_reopen(vd->vdev_top);
- if (vdev_is_dead(vd->vdev_top)) {
- vd->vdev_offline = B_FALSE;
- vdev_reopen(vd->vdev_top);
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
- }
- }
-
- vd->vdev_tmpoffline = istmp;
-
- vdev_config_dirty(vd->vdev_top);
-
- return (spa_vdev_exit(spa, NULL, txg, 0));
-}
-
-/*
- * Clear the error counts associated with this vdev. Unlike vdev_online() and
- * vdev_offline(), we assume the spa config is locked. We also clear all
- * children. If 'vd' is NULL, then the user wants to clear all vdevs.
- */
-void
-vdev_clear(spa_t *spa, vdev_t *vd)
-{
- int c;
-
- if (vd == NULL)
- vd = spa->spa_root_vdev;
-
- vd->vdev_stat.vs_read_errors = 0;
- vd->vdev_stat.vs_write_errors = 0;
- vd->vdev_stat.vs_checksum_errors = 0;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_clear(spa, vd->vdev_child[c]);
-}
-
-int
-vdev_is_dead(vdev_t *vd)
-{
- return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
-}
-
-int
-vdev_error_inject(vdev_t *vd, zio_t *zio)
-{
- int error = 0;
-
- if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
- return (0);
-
- if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
- return (0);
-
- switch (vd->vdev_fault_mode) {
- case VDEV_FAULT_RANDOM:
- if (spa_get_random(vd->vdev_fault_arg) == 0)
- error = EIO;
- break;
-
- case VDEV_FAULT_COUNT:
- if ((int64_t)--vd->vdev_fault_arg <= 0)
- vd->vdev_fault_mode = VDEV_FAULT_NONE;
- error = EIO;
- break;
- }
-
- if (error != 0) {
- dprintf("returning %d for type %d on %s state %d offset %llx\n",
- error, zio->io_type, vdev_description(vd),
- vd->vdev_state, zio->io_offset);
- }
-
- return (error);
-}
-
-/*
- * Get statistics for the given vdev.
- */
-void
-vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
-{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
- int c, t;
-
- mutex_enter(&vd->vdev_stat_lock);
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
- vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
- vs->vs_state = vd->vdev_state;
- vs->vs_rsize = vdev_get_rsize(vd);
- mutex_exit(&vd->vdev_stat_lock);
-
- /*
- * If we're getting stats on the root vdev, aggregate the I/O counts
- * over all top-level vdevs (i.e. the direct children of the root).
- */
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++) {
- vdev_t *cvd = rvd->vdev_child[c];
- vdev_stat_t *cvs = &cvd->vdev_stat;
-
- mutex_enter(&vd->vdev_stat_lock);
- for (t = 0; t < ZIO_TYPES; t++) {
- vs->vs_ops[t] += cvs->vs_ops[t];
- vs->vs_bytes[t] += cvs->vs_bytes[t];
- }
- vs->vs_read_errors += cvs->vs_read_errors;
- vs->vs_write_errors += cvs->vs_write_errors;
- vs->vs_checksum_errors += cvs->vs_checksum_errors;
- vs->vs_scrub_examined += cvs->vs_scrub_examined;
- vs->vs_scrub_errors += cvs->vs_scrub_errors;
- mutex_exit(&vd->vdev_stat_lock);
- }
- }
-}
-
-void
-vdev_stat_update(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *pvd;
- uint64_t txg = zio->io_txg;
- vdev_stat_t *vs = &vd->vdev_stat;
- zio_type_t type = zio->io_type;
- int flags = zio->io_flags;
-
- if (zio->io_error == 0) {
- if (!(flags & ZIO_FLAG_IO_BYPASS)) {
- mutex_enter(&vd->vdev_stat_lock);
- vs->vs_ops[type]++;
- vs->vs_bytes[type] += zio->io_size;
- mutex_exit(&vd->vdev_stat_lock);
- }
- if ((flags & ZIO_FLAG_IO_REPAIR) &&
- zio->io_delegate_list == NULL) {
- mutex_enter(&vd->vdev_stat_lock);
- if (flags & ZIO_FLAG_SCRUB_THREAD)
- vs->vs_scrub_repaired += zio->io_size;
- else
- vs->vs_self_healed += zio->io_size;
- mutex_exit(&vd->vdev_stat_lock);
- }
- return;
- }
-
- if (flags & ZIO_FLAG_SPECULATIVE)
- return;
-
- if (!vdev_is_dead(vd)) {
- mutex_enter(&vd->vdev_stat_lock);
- if (type == ZIO_TYPE_READ) {
- if (zio->io_error == ECKSUM)
- vs->vs_checksum_errors++;
- else
- vs->vs_read_errors++;
- }
- if (type == ZIO_TYPE_WRITE)
- vs->vs_write_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-
- if (type == ZIO_TYPE_WRITE) {
- if (txg == 0 || vd->vdev_children != 0)
- return;
- if (flags & ZIO_FLAG_SCRUB_THREAD) {
- ASSERT(flags & ZIO_FLAG_IO_REPAIR);
- for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
- vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
- }
- if (!(flags & ZIO_FLAG_IO_REPAIR)) {
- if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
- return;
- vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
- vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
- }
- }
-}
-
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
- int c;
- vdev_stat_t *vs = &vd->vdev_stat;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
- mutex_enter(&vd->vdev_stat_lock);
-
- if (type == POOL_SCRUB_NONE) {
- /*
- * Update completion and end time. Leave everything else alone
- * so we can report what happened during the previous scrub.
- */
- vs->vs_scrub_complete = complete;
- vs->vs_scrub_end = gethrestime_sec();
- } else {
- vs->vs_scrub_type = type;
- vs->vs_scrub_complete = 0;
- vs->vs_scrub_examined = 0;
- vs->vs_scrub_repaired = 0;
- vs->vs_scrub_errors = 0;
- vs->vs_scrub_start = gethrestime_sec();
- vs->vs_scrub_end = 0;
- }
-
- mutex_exit(&vd->vdev_stat_lock);
-}
-
-/*
- * Update the in-core space usage stats for this vdev and the root vdev.
- */
-void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
-{
- ASSERT(vd == vd->vdev_top);
- int64_t dspace_delta = space_delta;
-
- do {
- if (vd->vdev_ms_count) {
- /*
- * If this is a top-level vdev, apply the
- * inverse of its psize-to-asize (ie. RAID-Z)
- * space-expansion factor. We must calculate
- * this here and not at the root vdev because
- * the root vdev's psize-to-asize is simply the
- * max of its childrens', thus not accurate
- * enough for us.
- */
- ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
- dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
- }
-
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_space += space_delta;
- vd->vdev_stat.vs_alloc += alloc_delta;
- vd->vdev_stat.vs_dspace += dspace_delta;
- mutex_exit(&vd->vdev_stat_lock);
- } while ((vd = vd->vdev_parent) != NULL);
-}
-
-/*
- * Mark a top-level vdev's config as dirty, placing it on the dirty list
- * so that it will be written out next time the vdev configuration is synced.
- * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
- */
-void
-vdev_config_dirty(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- vdev_t *rvd = spa->spa_root_vdev;
- int c;
-
- /*
- * The dirty list is protected by the config lock. The caller must
- * either hold the config lock as writer, or must be the sync thread
- * (which holds the lock as reader). There's only one sync thread,
- * so this is sufficient to ensure mutual exclusion.
- */
- ASSERT(spa_config_held(spa, RW_WRITER) ||
- dsl_pool_sync_context(spa_get_dsl(spa)));
-
- if (vd == rvd) {
- for (c = 0; c < rvd->vdev_children; c++)
- vdev_config_dirty(rvd->vdev_child[c]);
- } else {
- ASSERT(vd == vd->vdev_top);
-
- if (!list_link_active(&vd->vdev_dirty_node))
- list_insert_head(&spa->spa_dirty_list, vd);
- }
-}
-
-void
-vdev_config_clean(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
-
- ASSERT(spa_config_held(spa, RW_WRITER) ||
- dsl_pool_sync_context(spa_get_dsl(spa)));
-
- ASSERT(list_link_active(&vd->vdev_dirty_node));
- list_remove(&spa->spa_dirty_list, vd);
-}
-
-void
-vdev_propagate_state(vdev_t *vd)
-{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
- int degraded = 0, faulted = 0;
- int corrupted = 0;
- int c;
- vdev_t *child;
-
- for (c = 0; c < vd->vdev_children; c++) {
- child = vd->vdev_child[c];
- if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
- faulted++;
- else if (child->vdev_state == VDEV_STATE_DEGRADED)
- degraded++;
-
- if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
- corrupted++;
- }
-
- vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
-
- /*
- * Root special: if there is a toplevel vdev that cannot be
- * opened due to corrupted metadata, then propagate the root
- * vdev's aux state as 'corrupt' rather than 'insufficient
- * replicas'.
- */
- if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
- vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_CORRUPT_DATA);
-}
-
-/*
- * Set a vdev's state. If this is during an open, we don't update the parent
- * state, because we're in the process of opening children depth-first.
- * Otherwise, we propagate the change to the parent.
- *
- * If this routine places a device in a faulted state, an appropriate ereport is
- * generated.
- */
-void
-vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
-{
- uint64_t save_state;
-
- if (state == vd->vdev_state) {
- vd->vdev_stat.vs_aux = aux;
- return;
- }
-
- save_state = vd->vdev_state;
-
- vd->vdev_state = state;
- vd->vdev_stat.vs_aux = aux;
-
- /*
- * If we are setting the vdev state to anything but an open state, then
- * always close the underlying device. Otherwise, we keep accessible
- * but invalid devices open forever. We don't call vdev_close() itself,
- * because that implies some extra checks (offline, etc) that we don't
- * want here. This is limited to leaf devices, because otherwise
- * closing the device will affect other children.
- */
- if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
- vd->vdev_ops->vdev_op_close(vd);
-
- if (state == VDEV_STATE_CANT_OPEN) {
- /*
- * If we fail to open a vdev during an import, we mark it as
- * "not available", which signifies that it was never there to
- * begin with. Failure to open such a device is not considered
- * an error.
- */
- if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
- vd->vdev_ops->vdev_op_leaf)
- vd->vdev_not_present = 1;
-
- /*
- * Post the appropriate ereport. If the 'prevstate' field is
- * set to something other than VDEV_STATE_UNKNOWN, it indicates
- * that this is part of a vdev_reopen(). In this case, we don't
- * want to post the ereport if the device was already in the
- * CANT_OPEN state beforehand.
- */
- if (vd->vdev_prevstate != state && !vd->vdev_not_present &&
- vd != vd->vdev_spa->spa_root_vdev) {
- const char *class;
-
- switch (aux) {
- case VDEV_AUX_OPEN_FAILED:
- class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
- break;
- case VDEV_AUX_CORRUPT_DATA:
- class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
- break;
- case VDEV_AUX_NO_REPLICAS:
- class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
- break;
- case VDEV_AUX_BAD_GUID_SUM:
- class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
- break;
- case VDEV_AUX_TOO_SMALL:
- class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
- break;
- case VDEV_AUX_BAD_LABEL:
- class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
- break;
- default:
- class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
- }
-
- zfs_ereport_post(class, vd->vdev_spa,
- vd, NULL, save_state, 0);
- }
- }
-
- if (isopen)
- return;
-
- if (vd->vdev_parent != NULL)
- vdev_propagate_state(vd->vdev_parent);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
deleted file mode 100644
index 4e419b6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-
-/*
- * Virtual device read-ahead caching.
- *
- * This file implements a simple LRU read-ahead cache. When the DMU reads
- * a given block, it will often want other, nearby blocks soon thereafter.
- * We take advantage of this by reading a larger disk region and caching
- * the result. In the best case, this can turn 256 back-to-back 512-byte
- * reads into a single 128k read followed by 255 cache hits; this reduces
- * latency dramatically. In the worst case, it can turn an isolated 512-byte
- * read into a 128k read, which doesn't affect latency all that much but is
- * terribly wasteful of bandwidth. A more intelligent version of the cache
- * could keep track of access patterns and not do read-ahead unless it sees
- * at least two temporally close I/Os to the same region. It could also
- * take advantage of semantic information about the I/O. And it could use
- * something faster than an AVL tree; that was chosen solely for convenience.
- *
- * There are five cache operations: allocate, fill, read, write, evict.
- *
- * (1) Allocate. This reserves a cache entry for the specified region.
- * We separate the allocate and fill operations so that multiple threads
- * don't generate I/O for the same cache miss.
- *
- * (2) Fill. When the I/O for a cache miss completes, the fill routine
- * places the data in the previously allocated cache entry.
- *
- * (3) Read. Read data from the cache.
- *
- * (4) Write. Update cache contents after write completion.
- *
- * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
- * if the total cache size exceeds zfs_vdev_cache_size.
- */
-
-/*
- * These tunables are for performance analysis.
- */
-/*
- * All i/os smaller than zfs_vdev_cache_max will be turned into
- * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
- * track buffer. At most zfs_vdev_cache_size bytes will be kept in each
- * vdev's vdev_cache.
- */
-int zfs_vdev_cache_max = 1<<14;
-int zfs_vdev_cache_size = 10ULL << 20;
-int zfs_vdev_cache_bshift = 16;
-
-SYSCTL_DECL(_vfs_zfs_vdev);
-SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
-TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
- &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size");
-TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
-SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
- &zfs_vdev_cache_size, 0, "Size of VDEV cache");
-
-#define VCBS (1 << zfs_vdev_cache_bshift)
-
-static int
-vdev_cache_offset_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = a1;
- const vdev_cache_entry_t *ve2 = a2;
-
- if (ve1->ve_offset < ve2->ve_offset)
- return (-1);
- if (ve1->ve_offset > ve2->ve_offset)
- return (1);
- return (0);
-}
-
-static int
-vdev_cache_lastused_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = a1;
- const vdev_cache_entry_t *ve2 = a2;
-
- if (ve1->ve_lastused < ve2->ve_lastused)
- return (-1);
- if (ve1->ve_lastused > ve2->ve_lastused)
- return (1);
-
- /*
- * Among equally old entries, sort by offset to ensure uniqueness.
- */
- return (vdev_cache_offset_compare(a1, a2));
-}
-
-/*
- * Evict the specified entry from the cache.
- */
-static void
-vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
-{
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
- ASSERT(ve->ve_data != NULL);
-
- dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
- vc, ve->ve_offset, ve->ve_lastused, LBOLT - ve->ve_lastused,
- ve->ve_hits, ve->ve_missed_update);
-
- avl_remove(&vc->vc_lastused_tree, ve);
- avl_remove(&vc->vc_offset_tree, ve);
- zio_buf_free(ve->ve_data, VCBS);
- kmem_free(ve, sizeof (vdev_cache_entry_t));
-}
-
-/*
- * Allocate an entry in the cache. At the point we don't have the data,
- * we're just creating a placeholder so that multiple threads don't all
- * go off and read the same blocks.
- */
-static vdev_cache_entry_t *
-vdev_cache_allocate(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
- vdev_cache_entry_t *ve;
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
-
- if (zfs_vdev_cache_size == 0)
- return (NULL);
-
- /*
- * If adding a new entry would exceed the cache size,
- * evict the oldest entry (LRU).
- */
- if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
- zfs_vdev_cache_size) {
- ve = avl_first(&vc->vc_lastused_tree);
- if (ve->ve_fill_io != NULL) {
- dprintf("can't evict in %p, still filling\n", vc);
- return (NULL);
- }
- ASSERT(ve->ve_hits != 0);
- vdev_cache_evict(vc, ve);
- }
-
- ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
- ve->ve_offset = offset;
- ve->ve_lastused = LBOLT;
- ve->ve_data = zio_buf_alloc(VCBS);
-
- avl_add(&vc->vc_offset_tree, ve);
- avl_add(&vc->vc_lastused_tree, ve);
-
- return (ve);
-}
-
-static void
-vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
-{
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
-
- if (ve->ve_lastused != LBOLT) {
- avl_remove(&vc->vc_lastused_tree, ve);
- ve->ve_lastused = LBOLT;
- avl_add(&vc->vc_lastused_tree, ve);
- }
-
- ve->ve_hits++;
- bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
-}
-
-/*
- * Fill a previously allocated cache entry with data.
- */
-static void
-vdev_cache_fill(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve = zio->io_private;
- zio_t *dio;
-
- ASSERT(zio->io_size == VCBS);
-
- /*
- * Add data to the cache.
- */
- mutex_enter(&vc->vc_lock);
-
- ASSERT(ve->ve_fill_io == zio);
- ASSERT(ve->ve_offset == zio->io_offset);
- ASSERT(ve->ve_data == zio->io_data);
-
- ve->ve_fill_io = NULL;
-
- /*
- * Even if this cache line was invalidated by a missed write update,
- * any reads that were queued up before the missed update are still
- * valid, so we can satisfy them from this line before we evict it.
- */
- for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
- vdev_cache_hit(vc, ve, dio);
-
- if (zio->io_error || ve->ve_missed_update)
- vdev_cache_evict(vc, ve);
-
- mutex_exit(&vc->vc_lock);
-
- while ((dio = zio->io_delegate_list) != NULL) {
- zio->io_delegate_list = dio->io_delegate_next;
- dio->io_delegate_next = NULL;
- dio->io_error = zio->io_error;
- zio_next_stage(dio);
- }
-}
-
-/*
- * Read data from the cache. Returns 0 on cache hit, errno on a miss.
- */
-int
-vdev_cache_read(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
- zio_t *fio;
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
- return (EINVAL);
-
- if (zio->io_size > zfs_vdev_cache_max)
- return (EOVERFLOW);
-
- /*
- * If the I/O straddles two or more cache blocks, don't cache it.
- */
- if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
- return (EXDEV);
-
- ASSERT(cache_phase + zio->io_size <= VCBS);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = cache_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
-
- if (ve != NULL) {
- if (ve->ve_missed_update) {
- mutex_exit(&vc->vc_lock);
- return (ESTALE);
- }
-
- if ((fio = ve->ve_fill_io) != NULL) {
- zio->io_delegate_next = fio->io_delegate_list;
- fio->io_delegate_list = zio;
- zio_vdev_io_bypass(zio);
- mutex_exit(&vc->vc_lock);
- return (0);
- }
-
- vdev_cache_hit(vc, ve, zio);
- zio_vdev_io_bypass(zio);
-
- mutex_exit(&vc->vc_lock);
- zio_next_stage(zio);
- return (0);
- }
-
- ve = vdev_cache_allocate(zio);
-
- if (ve == NULL) {
- mutex_exit(&vc->vc_lock);
- return (ENOMEM);
- }
-
- fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
- vdev_cache_fill, ve);
-
- ve->ve_fill_io = fio;
- fio->io_delegate_list = zio;
- zio_vdev_io_bypass(zio);
-
- mutex_exit(&vc->vc_lock);
- zio_nowait(fio);
-
- return (0);
-}
-
-/*
- * Update cache contents upon write completion.
- */
-void
-vdev_cache_write(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t io_start = zio->io_offset;
- uint64_t io_end = io_start + zio->io_size;
- uint64_t min_offset = P2ALIGN(io_start, VCBS);
- uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
- avl_index_t where;
-
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = min_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
-
- if (ve == NULL)
- ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
-
- while (ve != NULL && ve->ve_offset < max_offset) {
- uint64_t start = MAX(ve->ve_offset, io_start);
- uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
-
- if (ve->ve_fill_io != NULL) {
- ve->ve_missed_update = 1;
- } else {
- bcopy((char *)zio->io_data + start - io_start,
- ve->ve_data + start - ve->ve_offset, end - start);
- }
- ve = AVL_NEXT(&vc->vc_offset_tree, ve);
- }
- mutex_exit(&vc->vc_lock);
-}
-
-void
-vdev_cache_init(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
-
- mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_offset_node));
-
- avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_lastused_node));
-}
-
-void
-vdev_cache_fini(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve;
-
- mutex_enter(&vc->vc_lock);
- while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
- vdev_cache_evict(vc, ve);
- mutex_exit(&vc->vc_lock);
-
- avl_destroy(&vc->vc_offset_tree);
- avl_destroy(&vc->vc_lastused_tree);
-
- mutex_destroy(&vc->vc_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
deleted file mode 100644
index b965b1c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_disk.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <sys/sunldi.h>
-
-/*
- * Virtual device vector for disks.
- */
-
-extern ldi_ident_t zfs_li;
-
-typedef struct vdev_disk_buf {
- buf_t vdb_buf;
- zio_t *vdb_io;
-} vdev_disk_buf_t;
-
-static int
-vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- vdev_disk_t *dvd;
- struct dk_minfo dkm;
- int error;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
-
- /*
- * When opening a disk device, we want to preserve the user's original
- * intent. We always want to open the device by the path the user gave
- * us, even if it is one of multiple paths to the save device. But we
- * also want to be able to survive disks being removed/recabled.
- * Therefore the sequence of opening devices is:
- *
- * 1. Try opening the device by path. For legacy pools without the
- * 'whole_disk' property, attempt to fix the path by appending 's0'.
- *
- * 2. If the devid of the device matches the stored value, return
- * success.
- *
- * 3. Otherwise, the device may have moved. Try opening the device
- * by the devid instead.
- *
- */
- if (vd->vdev_devid != NULL) {
- if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
- &dvd->vd_minor) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
- }
-
- error = EINVAL; /* presume failure */
-
- if (vd->vdev_path != NULL) {
- ddi_devid_t devid;
-
- if (vd->vdev_wholedisk == -1ULL) {
- size_t len = strlen(vd->vdev_path) + 3;
- char *buf = kmem_alloc(len, KM_SLEEP);
- ldi_handle_t lh;
-
- (void) snprintf(buf, len, "%ss0", vd->vdev_path);
-
- if (ldi_open_by_name(buf, spa_mode, kcred,
- &lh, zfs_li) == 0) {
- spa_strfree(vd->vdev_path);
- vd->vdev_path = buf;
- vd->vdev_wholedisk = 1ULL;
- (void) ldi_close(lh, spa_mode, kcred);
- } else {
- kmem_free(buf, len);
- }
- }
-
- error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
- &dvd->vd_lh, zfs_li);
-
- /*
- * Compare the devid to the stored value.
- */
- if (error == 0 && vd->vdev_devid != NULL &&
- ldi_get_devid(dvd->vd_lh, &devid) == 0) {
- if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
- error = EINVAL;
- (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
- dvd->vd_lh = NULL;
- }
- ddi_devid_free(devid);
- }
-
- /*
- * If we succeeded in opening the device, but 'vdev_wholedisk'
- * is not yet set, then this must be a slice.
- */
- if (error == 0 && vd->vdev_wholedisk == -1ULL)
- vd->vdev_wholedisk = 0;
- }
-
- /*
- * If we were unable to open by path, or the devid check fails, open by
- * devid instead.
- */
- if (error != 0 && vd->vdev_devid != NULL)
- error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
- spa_mode, kcred, &dvd->vd_lh, zfs_li);
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- /*
- * Determine the actual size of the device.
- */
- if (ldi_get_size(dvd->vd_lh, psize) != 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (EINVAL);
- }
-
- /*
- * If we own the whole disk, try to enable disk write caching.
- * We ignore errors because it's OK if we can't do it.
- */
- if (vd->vdev_wholedisk == 1) {
- int wce = 1;
- (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
- FKIOCTL, kcred, NULL);
- }
-
- /*
- * Determine the device's minimum transfer size.
- * If the ioctl isn't supported, assume DEV_BSIZE.
- */
- if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)&dkm,
- FKIOCTL, kcred, NULL) != 0)
- dkm.dki_lbsize = DEV_BSIZE;
-
- *ashift = highbit(MAX(dkm.dki_lbsize, SPA_MINBLOCKSIZE)) - 1;
-
- /*
- * Clear the nowritecache bit, so that on a vdev_reopen() we will
- * try again.
- */
- vd->vdev_nowritecache = B_FALSE;
-
- return (0);
-}
-
-static void
-vdev_disk_close(vdev_t *vd)
-{
- vdev_disk_t *dvd = vd->vdev_tsd;
-
- if (dvd == NULL)
- return;
-
- dprintf("removing disk %s, devid %s\n",
- vd->vdev_path ? vd->vdev_path : "<none>",
- vd->vdev_devid ? vd->vdev_devid : "<none>");
-
- if (dvd->vd_minor != NULL)
- ddi_devid_str_free(dvd->vd_minor);
-
- if (dvd->vd_devid != NULL)
- ddi_devid_free(dvd->vd_devid);
-
- if (dvd->vd_lh != NULL)
- (void) ldi_close(dvd->vd_lh, spa_mode, kcred);
-
- kmem_free(dvd, sizeof (vdev_disk_t));
- vd->vdev_tsd = NULL;
-}
-
-static void
-vdev_disk_io_intr(buf_t *bp)
-{
- vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
- zio_t *zio = vdb->vdb_io;
-
- if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
- zio->io_error = EIO;
-
- kmem_free(vdb, sizeof (vdev_disk_buf_t));
-
- zio_next_stage_async(zio);
-}
-
-static void
-vdev_disk_ioctl_done(void *zio_arg, int error)
-{
- zio_t *zio = zio_arg;
-
- zio->io_error = error;
-
- zio_next_stage_async(zio);
-}
-
-static void
-vdev_disk_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_disk_t *dvd = vd->vdev_tsd;
- vdev_disk_buf_t *vdb;
- buf_t *bp;
- int flags, error;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- zio_vdev_io_bypass(zio);
-
- /* XXPOLICY */
- if (vdev_is_dead(vd)) {
- zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
- }
-
- switch (zio->io_cmd) {
-
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- if (vd->vdev_nowritecache) {
- zio->io_error = ENOTSUP;
- break;
- }
-
- zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
- zio->io_dk_callback.dkc_cookie = zio;
-
- error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
- (uintptr_t)&zio->io_dk_callback,
- FKIOCTL, kcred, NULL);
-
- if (error == 0) {
- /*
- * The ioctl will be done asychronously,
- * and will call vdev_disk_ioctl_done()
- * upon completion.
- */
- return;
- } else if (error == ENOTSUP) {
- /*
- * If we get ENOTSUP, we know that no future
- * attempts will ever succeed. In this case we
- * set a persistent bit so that we don't bother
- * with the ioctl in the future.
- */
- vd->vdev_nowritecache = B_TRUE;
- }
- zio->io_error = error;
-
- break;
-
- default:
- zio->io_error = ENOTSUP;
- }
-
- zio_next_stage_async(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return;
-
- flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
- flags |= B_BUSY | B_NOCACHE;
- if (zio->io_flags & ZIO_FLAG_FAILFAST)
- flags |= B_FAILFAST;
-
- vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
-
- vdb->vdb_io = zio;
- bp = &vdb->vdb_buf;
-
- bioinit(bp);
- bp->b_flags = flags;
- bp->b_bcount = zio->io_size;
- bp->b_un.b_addr = zio->io_data;
- bp->b_lblkno = lbtodb(zio->io_offset);
- bp->b_bufsize = zio->io_size;
- bp->b_iodone = (int (*)())vdev_disk_io_intr;
-
- /* XXPOLICY */
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
- if (error) {
- zio->io_error = error;
- bioerror(bp, error);
- bp->b_resid = bp->b_bcount;
- bp->b_iodone(bp);
- return;
- }
-
- error = ldi_strategy(dvd->vd_lh, bp);
- /* ldi_strategy() will return non-zero only on programming errors */
- ASSERT(error == 0);
-}
-
-static void
-vdev_disk_io_done(zio_t *zio)
-{
- vdev_queue_io_done(zio);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_disk_ops = {
- vdev_disk_open,
- vdev_disk_close,
- vdev_default_asize,
- vdev_disk_io_start,
- vdev_disk_io_done,
- NULL,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
deleted file mode 100644
index b8e79f8..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_file.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for files.
- */
-
-static int
-vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- vdev_file_t *vf;
- vnode_t *vp;
- vattr_t vattr;
- int error;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
-
- /*
- * We always open the files from the root of the global zone, even if
- * we're in a local zone. If the user has gotten to this point, the
- * administrator has already decided that the pool should be available
- * to local zone users, so the underlying devices should be as well.
- */
- ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
- error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
- 0, &vp, 0, 0, rootdir);
-
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- vf->vf_vnode = vp;
-
-#ifdef _KERNEL
- /*
- * Make sure it's a regular file.
- */
- if (vp->v_type != VREG) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (ENODEV);
- }
-#endif
-
- /*
- * Determine the physical size of the file.
- */
- vattr.va_mask = AT_SIZE;
- error = VOP_GETATTR(vp, &vattr, 0, kcred);
- if (error) {
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (error);
- }
-
- *psize = vattr.va_size;
- *ashift = SPA_MINBLOCKSHIFT;
-
- return (0);
-}
-
-static void
-vdev_file_close(vdev_t *vd)
-{
- vdev_file_t *vf = vd->vdev_tsd;
-
- if (vf == NULL)
- return;
-
- if (vf->vf_vnode != NULL) {
- (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred);
- (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred);
- VN_RELE(vf->vf_vnode);
- }
-
- kmem_free(vf, sizeof (vdev_file_t));
- vd->vdev_tsd = NULL;
-}
-
-static void
-vdev_file_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_file_t *vf = vd->vdev_tsd;
- ssize_t resid;
- int error;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- zio_vdev_io_bypass(zio);
-
- /* XXPOLICY */
- if (vdev_is_dead(vd)) {
- zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
- }
-
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
- zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
- kcred);
- dprintf("fsync(%s) = %d\n", vdev_description(vd),
- zio->io_error);
- break;
- default:
- zio->io_error = ENOTSUP;
- }
-
- zio_next_stage_async(zio);
- return;
- }
-
- /*
- * In the kernel, don't bother double-caching, but in userland,
- * we want to test the vdev_cache code.
- */
-#ifndef _KERNEL
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
-#endif
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return;
-
- /* XXPOLICY */
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
- if (error) {
- zio->io_error = error;
- zio_next_stage_async(zio);
- return;
- }
-
- zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
- zio->io_size, zio->io_offset, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, &resid);
-
- if (resid != 0 && zio->io_error == 0)
- zio->io_error = ENOSPC;
-
- zio_next_stage_async(zio);
-}
-
-static void
-vdev_file_io_done(zio_t *zio)
-{
- vdev_queue_io_done(zio);
-
-#ifndef _KERNEL
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-#endif
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_file_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- VDEV_TYPE_FILE, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-/*
- * From userland we access disks just like files.
- */
-#ifndef _KERNEL
-
-vdev_ops_t vdev_disk_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
-
-#endif
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
deleted file mode 100644
index eebc911..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/bio.h>
-#include <sys/disk.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-#include <geom/geom.h>
-#include <geom/geom_int.h>
-
-/*
- * Virtual device vector for GEOM.
- */
-
-struct g_class zfs_vdev_class = {
- .name = "ZFS::VDEV",
- .version = G_VERSION,
-};
-
-DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
-
-typedef struct vdev_geom_ctx {
- struct g_consumer *gc_consumer;
- int gc_state;
- struct bio_queue_head gc_queue;
- struct mtx gc_queue_mtx;
-} vdev_geom_ctx_t;
-
-static void
-vdev_geom_release(vdev_t *vd)
-{
- vdev_geom_ctx_t *ctx;
-
- ctx = vd->vdev_tsd;
- vd->vdev_tsd = NULL;
-
- mtx_lock(&ctx->gc_queue_mtx);
- ctx->gc_state = 1;
- wakeup_one(&ctx->gc_queue);
- while (ctx->gc_state != 2)
- msleep(&ctx->gc_state, &ctx->gc_queue_mtx, 0, "vgeom:w", 0);
- mtx_unlock(&ctx->gc_queue_mtx);
- mtx_destroy(&ctx->gc_queue_mtx);
- kmem_free(ctx, sizeof(*ctx));
-}
-
-static void
-vdev_geom_orphan(struct g_consumer *cp)
-{
- struct g_geom *gp;
- vdev_t *vd;
- int error;
-
- g_topology_assert();
-
- vd = cp->private;
- gp = cp->geom;
- error = cp->provider->error;
-
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- if (cp->acr + cp->acw + cp->ace > 0)
- g_access(cp, -cp->acr, -cp->acw, -cp->ace);
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
- g_detach(cp);
- g_destroy_consumer(cp);
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, error);
- }
- vdev_geom_release(vd);
- /* Both methods below work, but in a bit different way. */
-#if 0
- vd->vdev_reopen_wanted = 1;
-#else
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux);
-#endif
-}
-
-static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp, int write)
-{
- struct g_geom *gp;
- struct g_consumer *cp;
-
- g_topology_assert();
-
- ZFS_LOG(1, "Attaching to %s.", pp->name);
- /* Do we have geom already? No? Create one. */
- LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- if (strcmp(gp->name, "zfs::vdev") != 0)
- continue;
- break;
- }
- if (gp == NULL) {
- gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
- gp->orphan = vdev_geom_orphan;
- cp = g_new_consumer(gp);
- if (g_attach(cp, pp) != 0) {
- g_wither_geom(gp, ENXIO);
- return (NULL);
- }
- if (g_access(cp, 1, write, 1) != 0) {
- g_wither_geom(gp, ENXIO);
- return (NULL);
- }
- ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
- } else {
- /* Check if we are already connected to this provider. */
- LIST_FOREACH(cp, &gp->consumer, consumer) {
- if (cp->provider == pp) {
- ZFS_LOG(1, "Found consumer for %s.", pp->name);
- break;
- }
- }
- if (cp == NULL) {
- cp = g_new_consumer(gp);
- if (g_attach(cp, pp) != 0) {
- g_destroy_consumer(cp);
- return (NULL);
- }
- if (g_access(cp, 1, write, 1) != 0) {
- g_detach(cp);
- g_destroy_consumer(cp);
- return (NULL);
- }
- ZFS_LOG(1, "Created consumer for %s.", pp->name);
- } else {
- if (g_access(cp, 1, cp->acw > 0 ? 0 : write, 1) != 0)
- return (NULL);
- ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
- }
- }
- return (cp);
-}
-
-static void
-vdev_geom_detach(void *arg, int flag __unused)
-{
- struct g_geom *gp;
- struct g_consumer *cp;
-
- g_topology_assert();
- cp = arg;
- gp = cp->geom;
-
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- g_access(cp, -1, 0, -1);
- /* Destroy consumer on last close. */
- if (cp->acr == 0 && cp->ace == 0) {
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
- if (cp->acw > 0)
- g_access(cp, 0, -cp->acw, 0);
- g_detach(cp);
- g_destroy_consumer(cp);
- }
- /* Destroy geom if there are no consumers left. */
- if (LIST_EMPTY(&gp->consumer)) {
- ZFS_LOG(1, "Destroyed geom %s.", gp->name);
- g_wither_geom(gp, ENXIO);
- }
-}
-
-static void
-vdev_geom_worker(void *arg)
-{
- vdev_geom_ctx_t *ctx;
- zio_t *zio;
- struct bio *bp;
-
- ctx = arg;
- for (;;) {
- mtx_lock(&ctx->gc_queue_mtx);
- bp = bioq_takefirst(&ctx->gc_queue);
- if (bp == NULL) {
- if (ctx->gc_state == 1) {
- ctx->gc_state = 2;
- wakeup_one(&ctx->gc_state);
- mtx_unlock(&ctx->gc_queue_mtx);
- kproc_exit(0);
- }
- msleep(&ctx->gc_queue, &ctx->gc_queue_mtx,
- PRIBIO | PDROP, "vgeom:io", 0);
- continue;
- }
- mtx_unlock(&ctx->gc_queue_mtx);
- zio = bp->bio_caller1;
- zio->io_error = bp->bio_error;
- if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
- vdev_t *vd;
-
- /*
- * If we get ENOTSUP, we know that no future
- * attempts will ever succeed. In this case we
- * set a persistent bit so that we don't bother
- * with the ioctl in the future.
- */
- vd = zio->io_vd;
- vd->vdev_nowritecache = B_TRUE;
- }
- g_destroy_bio(bp);
- zio_next_stage_async(zio);
- }
-}
-
-static char *
-vdev_geom_get_id(struct g_consumer *cp)
-{
- char *id;
- int len;
-
- g_topology_assert_not();
- len = DISK_IDENT_SIZE;
- id = kmem_zalloc(len, KM_SLEEP);
- if (g_io_getattr("GEOM::ident", cp, &len, id) != 0) {
- kmem_free(id, DISK_IDENT_SIZE);
- return (NULL);
- }
- return (id);
-}
-
-static void
-vdev_geom_free_id(char *id)
-{
-
- if (id != NULL)
- kmem_free(id, DISK_IDENT_SIZE);
-}
-
-struct vdev_geom_find {
- const char *id;
- int write;
- struct g_consumer *cp;
-};
-
-static void
-vdev_geom_taste_orphan(struct g_consumer *cp)
-{
-
- KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
- cp->provider->name));
-}
-
-static void
-vdev_geom_attach_by_id_event(void *arg, int flags __unused)
-{
- struct vdev_geom_find *ap;
- struct g_class *mp;
- struct g_geom *gp, *zgp;
- struct g_provider *pp;
- struct g_consumer *zcp;
- char *id;
-
- g_topology_assert();
-
- ap = arg;
-
- zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
- /* This orphan function should be never called. */
- zgp->orphan = vdev_geom_taste_orphan;
- zcp = g_new_consumer(zgp);
-
- LIST_FOREACH(mp, &g_classes, class) {
- if (mp == &zfs_vdev_class)
- continue;
- LIST_FOREACH(gp, &mp->geom, geom) {
- if (gp->flags & G_GEOM_WITHER)
- continue;
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (pp->flags & G_PF_WITHER)
- continue;
- g_attach(zcp, pp);
- if (g_access(zcp, 1, 0, 0) != 0) {
- g_detach(zcp);
- continue;
- }
- g_topology_unlock();
- id = vdev_geom_get_id(zcp);
- g_topology_lock();
- g_access(zcp, -1, 0, 0);
- g_detach(zcp);
- if (id == NULL || strcmp(id, ap->id) != 0) {
- vdev_geom_free_id(id);
- continue;
- }
- vdev_geom_free_id(id);
- ap->cp = vdev_geom_attach(pp, ap->write);
- if (ap->cp == NULL) {
- printf("ZFS WARNING: Cannot open %s "
- "for writting.\n", pp->name);
- continue;
- }
- goto end;
- }
- }
- }
- ap->cp = NULL;
-end:
- g_destroy_consumer(zcp);
- g_destroy_geom(zgp);
-}
-
-static struct g_consumer *
-vdev_geom_attach_by_id(const char *id, int write)
-{
- struct vdev_geom_find *ap;
- struct g_consumer *cp;
-
- ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
- ap->id = id;
- ap->write = write;
- g_waitfor_event(vdev_geom_attach_by_id_event, ap, M_WAITOK, NULL);
- cp = ap->cp;
- kmem_free(ap, sizeof(*ap));
- return (cp);
-}
-
-static int
-vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
- vdev_geom_ctx_t *ctx;
- struct g_provider *pp;
- struct g_consumer *cp;
- char *id = NULL;
- int owned;
-
- /*
- * We must have a pathname, and it must be absolute.
- */
- if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- if ((owned = mtx_owned(&Giant)))
- mtx_unlock(&Giant);
- cp = NULL;
- g_topology_lock();
- pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
- if (pp != NULL) {
- ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
- cp = vdev_geom_attach(pp, !!(spa_mode & FWRITE));
- if (cp != NULL && vd->vdev_devid != NULL) {
- g_topology_unlock();
- id = vdev_geom_get_id(cp);
- g_topology_lock();
- if (id == NULL || strcmp(id, vd->vdev_devid) != 0) {
- vdev_geom_detach(cp, 0);
- cp = NULL;
- ZFS_LOG(1, "ID mismatch for provider %s: "
- "[%s]!=[%s].", vd->vdev_path,
- vd->vdev_devid, id);
- goto next;
- }
- ZFS_LOG(1, "ID match for provider %s.", vd->vdev_path);
- }
- }
-next:
- g_topology_unlock();
- vdev_geom_free_id(id);
- if (cp == NULL && vd->vdev_devid != NULL) {
- ZFS_LOG(1, "Searching by ID [%s].", vd->vdev_devid);
- cp = vdev_geom_attach_by_id(vd->vdev_devid,
- !!(spa_mode & FWRITE));
- if (cp != NULL) {
- size_t len = strlen(cp->provider->name) + 6; /* 6 == strlen("/dev/") + 1 */
- char *buf = kmem_alloc(len, KM_SLEEP);
-
- snprintf(buf, len, "/dev/%s", cp->provider->name);
- spa_strfree(vd->vdev_path);
- vd->vdev_path = buf;
-
- ZFS_LOG(1, "Attach by ID [%s] succeeded, provider %s.",
- vd->vdev_devid, vd->vdev_path);
- }
- }
- if (owned)
- mtx_lock(&Giant);
- if (cp == NULL) {
- ZFS_LOG(1, "Provider %s (id=[%s]) not found.", vd->vdev_path,
- vd->vdev_devid);
- vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
- return (EACCES);
- }
- pp = cp->provider;
-
- /*
- * Determine the actual size of the device.
- */
- *psize = pp->mediasize;
-
- /*
- * Determine the device's minimum transfer size.
- */
- *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
-
- /*
- * Clear the nowritecache bit, so that on a vdev_reopen() we will
- * try again.
- */
- vd->vdev_nowritecache = B_FALSE;
-
- cp->private = vd;
-
- ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP);
- bioq_init(&ctx->gc_queue);
- mtx_init(&ctx->gc_queue_mtx, "zfs:vdev:geom:queue", NULL, MTX_DEF);
- ctx->gc_consumer = cp;
- ctx->gc_state = 0;
-
- vd->vdev_tsd = ctx;
-
- kproc_create(vdev_geom_worker, ctx, NULL, 0, 0, "vdev:worker %s",
- pp->name);
-
- return (0);
-}
-
-static void
-vdev_geom_close(vdev_t *vd)
-{
- vdev_geom_ctx_t *ctx;
- struct g_consumer *cp;
-
- if ((ctx = vd->vdev_tsd) == NULL)
- return;
- if ((cp = ctx->gc_consumer) == NULL)
- return;
- vdev_geom_release(vd);
- g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
-}
-
-static void
-vdev_geom_io_intr(struct bio *bp)
-{
- vdev_geom_ctx_t *ctx;
- zio_t *zio;
-
- zio = bp->bio_caller1;
- ctx = zio->io_vd->vdev_tsd;
-
- mtx_lock(&ctx->gc_queue_mtx);
- bioq_insert_tail(&ctx->gc_queue, bp);
- wakeup_one(&ctx->gc_queue);
- mtx_unlock(&ctx->gc_queue_mtx);
-}
-
-static void
-vdev_geom_io_start(zio_t *zio)
-{
- vdev_t *vd;
- vdev_geom_ctx_t *ctx;
- struct g_consumer *cp;
- struct bio *bp;
- int error;
-
- cp = NULL;
-
- vd = zio->io_vd;
- ctx = vd->vdev_tsd;
- if (ctx != NULL)
- cp = ctx->gc_consumer;
-
- if (zio->io_type == ZIO_TYPE_IOCTL) {
- zio_vdev_io_bypass(zio);
-
- /* XXPOLICY */
- if (vdev_is_dead(vd)) {
- zio->io_error = ENXIO;
- zio_next_stage_async(zio);
- return;
- }
-
- switch (zio->io_cmd) {
-
- case DKIOCFLUSHWRITECACHE:
- if (vd->vdev_nowritecache) {
- zio->io_error = ENOTSUP;
- break;
- }
-
- goto sendreq;
- default:
- zio->io_error = ENOTSUP;
- }
-
- zio_next_stage_async(zio);
- return;
- }
-
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return;
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return;
-
-sendreq:
-
- error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
- if (error == 0 && cp == NULL)
- error = ENXIO;
- if (error) {
- zio->io_error = error;
- zio_next_stage_async(zio);
- return;
- }
-
- bp = g_alloc_bio();
- bp->bio_caller1 = zio;
- switch (zio->io_type) {
- case ZIO_TYPE_READ:
- case ZIO_TYPE_WRITE:
- bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
- bp->bio_data = zio->io_data;
- bp->bio_offset = zio->io_offset;
- bp->bio_length = zio->io_size;
- break;
- case ZIO_TYPE_IOCTL:
- bp->bio_cmd = BIO_FLUSH;
- bp->bio_data = NULL;
- bp->bio_offset = cp->provider->mediasize;
- bp->bio_length = 0;
- break;
- }
- bp->bio_done = vdev_geom_io_intr;
-
- g_io_request(bp, cp);
-}
-
-static void
-vdev_geom_io_done(zio_t *zio)
-{
- vdev_queue_io_done(zio);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
-
- if (zio_injection_enabled && zio->io_error == 0)
- zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
-
- zio_next_stage(zio);
-}
-
-vdev_ops_t vdev_geom_ops = {
- vdev_geom_open,
- vdev_geom_close,
- vdev_default_asize,
- vdev_geom_io_start,
- vdev_geom_io_done,
- NULL,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
deleted file mode 100644
index 9d9f555..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ /dev/null
@@ -1,1011 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Virtual Device Labels
- * ---------------------
- *
- * The vdev label serves several distinct purposes:
- *
- * 1. Uniquely identify this device as part of a ZFS pool and confirm its
- * identity within the pool.
- *
- * 2. Verify that all the devices given in a configuration are present
- * within the pool.
- *
- * 3. Determine the uberblock for the pool.
- *
- * 4. In case of an import operation, determine the configuration of the
- * toplevel vdev of which it is a part.
- *
- * 5. If an import operation cannot find all the devices in the pool,
- * provide enough information to the administrator to determine which
- * devices are missing.
- *
- * It is important to note that while the kernel is responsible for writing the
- * label, it only consumes the information in the first three cases. The
- * latter information is only consumed in userland when determining the
- * configuration to import a pool.
- *
- *
- * Label Organization
- * ------------------
- *
- * Before describing the contents of the label, it's important to understand how
- * the labels are written and updated with respect to the uberblock.
- *
- * When the pool configuration is altered, either because it was newly created
- * or a device was added, we want to update all the labels such that we can deal
- * with fatal failure at any point. To this end, each disk has two labels which
- * are updated before and after the uberblock is synced. Assuming we have
- * labels and an uberblock with the following transaction groups:
- *
- * L1 UB L2
- * +------+ +------+ +------+
- * | | | | | |
- * | t10 | | t10 | | t10 |
- * | | | | | |
- * +------+ +------+ +------+
- *
- * In this stable state, the labels and the uberblock were all updated within
- * the same transaction group (10). Each label is mirrored and checksummed, so
- * that we can detect when we fail partway through writing the label.
- *
- * In order to identify which labels are valid, the labels are written in the
- * following manner:
- *
- * 1. For each vdev, update 'L1' to the new label
- * 2. Update the uberblock
- * 3. For each vdev, update 'L2' to the new label
- *
- * Given arbitrary failure, we can determine the correct label to use based on
- * the transaction group. If we fail after updating L1 but before updating the
- * UB, we will notice that L1's transaction group is greater than the uberblock,
- * so L2 must be valid. If we fail after writing the uberblock but before
- * writing L2, we will notice that L2's transaction group is less than L1, and
- * therefore L1 is valid.
- *
- * Another added complexity is that not every label is updated when the config
- * is synced. If we add a single device, we do not want to have to re-write
- * every label for every device in the pool. This means that both L1 and L2 may
- * be older than the pool uberblock, because the necessary information is stored
- * on another vdev.
- *
- *
- * On-disk Format
- * --------------
- *
- * The vdev label consists of two distinct parts, and is wrapped within the
- * vdev_label_t structure. The label includes 8k of padding to permit legacy
- * VTOC disk labels, but is otherwise ignored.
- *
- * The first half of the label is a packed nvlist which contains pool wide
- * properties, per-vdev properties, and configuration information. It is
- * described in more detail below.
- *
- * The latter half of the label consists of a redundant array of uberblocks.
- * These uberblocks are updated whenever a transaction group is committed,
- * or when the configuration is updated. When a pool is loaded, we scan each
- * vdev for the 'best' uberblock.
- *
- *
- * Configuration Information
- * -------------------------
- *
- * The nvlist describing the pool and vdev contains the following elements:
- *
- * version ZFS on-disk version
- * name Pool name
- * state Pool state
- * txg Transaction group in which this label was written
- * pool_guid Unique identifier for this pool
- * vdev_tree An nvlist describing vdev tree.
- *
- * Each leaf device label also contains the following:
- *
- * top_guid Unique ID for top-level vdev in which this is contained
- * guid Unique ID for the leaf vdev
- *
- * The 'vs' configuration follows the format described in 'spa_config.c'.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/uberblock_impl.h>
-#include <sys/metaslab.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Basic routines to read and write from a vdev label.
- * Used throughout the rest of this file.
- */
-/*
- * Convert 'offset' within label number 'l' into an offset on a device of
- * 'psize' bytes. Labels with an index below VDEV_LABELS / 2 are laid out
- * consecutively from the start of the device; the remaining labels are
- * placed so that the full set of VDEV_LABELS labels ends at 'psize'.
- * 'offset' must lie inside a single vdev_label_t.
- */
-uint64_t
-vdev_label_offset(uint64_t psize, int l, uint64_t offset)
-{
- uint64_t label_start;
-
- ASSERT(offset < sizeof (vdev_label_t));
-
- label_start = l * sizeof (vdev_label_t);
- if (l >= VDEV_LABELS / 2)
- label_start += psize - VDEV_LABELS * sizeof (vdev_label_t);
-
- return (label_start + offset);
-}
-
-/*
- * Hand zio_nowait() a physical read of 'size' bytes into 'buf', starting at
- * 'offset' within label 'l' of 'vd'. 'vd' must have no children. 'done'
- * and 'private' are passed through to zio_read_phys() unchanged.
- */
-static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private)
-{
- uint64_t dev_offset;
-
- ASSERT(vd->vdev_children == 0);
-
- dev_offset = vdev_label_offset(vd->vdev_psize, l, offset);
-
- zio_nowait(zio_read_phys(zio, vd, dev_offset, size, buf,
- ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
-}
-
-/*
- * Hand zio_nowait() a physical write of 'size' bytes from 'buf', starting at
- * 'offset' within label 'l' of 'vd'. 'vd' must have no children. 'done'
- * and 'private' are passed through to zio_write_phys() unchanged.
- */
-static void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private)
-{
- uint64_t dev_offset;
-
- ASSERT(vd->vdev_children == 0);
-
- dev_offset = vdev_label_offset(vd->vdev_psize, l, offset);
-
- zio_nowait(zio_write_phys(zio, vd, dev_offset, size, buf,
- ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
-}
-
-/*
- * Generate the nvlist representing this vdev's config.
- *
- * spa - pool the vdev belongs to; only consulted for spa_version()
- * vd - vdev to describe; interior vdevs are described recursively
- * getstats - when set, the vdev_stat_t from vdev_get_stats() is attached
- * isspare - when set, the vdev id and the top-level metaslab/ashift/asize
- * entries are left out
- *
- * Returns a newly allocated nvlist. Every nvlist call is wrapped in
- * VERIFY(), so there is no failure return. The recursive calls below free
- * the lists they receive with nvlist_free().
- */
-nvlist_t *
-vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare)
-{
- nvlist_t *nv = NULL;
-
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
- vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
- == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
-
- /* Path and devid are optional and only recorded when set. */
- if (vd->vdev_path != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
- vd->vdev_path) == 0);
-
- if (vd->vdev_devid != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
- vd->vdev_devid) == 0);
-
- if (vd->vdev_nparity != 0) {
- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
- VDEV_TYPE_RAIDZ) == 0);
-
- /*
- * Make sure someone hasn't managed to sneak a fancy new vdev
- * into a crufty old storage pool.
- */
- ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity == 2 &&
- spa_version(spa) >= ZFS_VERSION_RAID6));
-
- /*
- * Note that we'll add the nparity tag even on storage pools
- * that only support a single parity device -- older software
- * will just ignore it.
- */
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
- vd->vdev_nparity) == 0);
- }
-
- /* -1ULL is treated as "not set" for vdev_wholedisk. */
- if (vd->vdev_wholedisk != -1ULL)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- vd->vdev_wholedisk) == 0);
-
- if (vd->vdev_not_present)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
-
- if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
-
- /* Allocation geometry is recorded on top-level vdevs only. */
- if (!isspare && vd == vd->vdev_top) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- vd->vdev_ms_array) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- vd->vdev_ms_shift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
- vd->vdev_ashift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
- vd->vdev_asize) == 0);
- }
-
- if (vd->vdev_dtl.smo_object != 0)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
- vd->vdev_dtl.smo_object) == 0);
-
- if (getstats) {
- vdev_stat_t vs;
- vdev_get_stats(vd, &vs);
- /* The stats structure is exported as a flat array of uint64_t. */
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf) {
- nvlist_t **child;
- int c;
-
- child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
- KM_SLEEP);
-
- for (c = 0; c < vd->vdev_children; c++)
- child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare);
-
- VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, vd->vdev_children) == 0);
-
- /*
- * The per-child lists and the temporary array are released
- * once they have been added to nv.
- */
- for (c = 0; c < vd->vdev_children; c++)
- nvlist_free(child[c]);
-
- kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
-
- } else {
- /* A temporary offline (vdev_tmpoffline) is not recorded. */
- if (vd->vdev_offline && !vd->vdev_tmpoffline)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- B_TRUE) == 0);
- else
- (void) nvlist_remove(nv, ZPOOL_CONFIG_OFFLINE,
- DATA_TYPE_UINT64);
- }
-
- return (nv);
-}
-
-/*
- * Read and unpack the config nvlist kept in the labels of 'vd'. Labels are
- * tried in index order; the first one that both reads without error and
- * unpacks cleanly is returned. Returns NULL if the vdev is dead or if no
- * label produced a usable config. A non-NULL result is released with
- * nvlist_free() by the caller.
- */
-nvlist_t *
-vdev_label_read_config(vdev_t *vd)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *config = NULL;
- vdev_phys_t *vp;
- int l;
-
- ASSERT(spa_config_held(spa, RW_READER));
-
- if (vdev_is_dead(vd))
- return (NULL);
-
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
-
- for (l = 0; l < VDEV_LABELS; l++) {
- zio_t *zio;
- int err;
-
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
-
- vdev_label_read(zio, vd, l, vp,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t), NULL, NULL);
-
- /* Only attempt the unpack when the read itself succeeded. */
- err = zio_wait(zio);
- if (err == 0)
- err = nvlist_unpack(vp->vp_nvlist,
- sizeof (vp->vp_nvlist), &config, 0);
- if (err == 0)
- break;
-
- /* Drop anything a failed attempt may have left behind. */
- if (config != NULL) {
- nvlist_free(config);
- config = NULL;
- }
- }
-
- zio_buf_free(vp, sizeof (vdev_phys_t));
-
- return (config);
-}
-
-/*
- * Determine if a device is in use. The 'spare_guid' parameter will be filled
- * in with the device guid if this spare is active elsewhere on the system.
- *
- * vd - leaf vdev whose on-disk label is examined
- * crtxg - creation txg of the operation being performed
- * reason - why the label is about to be (re)written
- * spare_guid - optional out parameter; reset to 0 on entry when non-NULL
- *
- * Returns B_FALSE when no label can be read or when the label lacks the
- * entries looked up below.
- */
-static boolean_t
-vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
- uint64_t *spare_guid)
-{
- spa_t *spa = vd->vdev_spa;
- uint64_t state, pool_guid, device_guid, txg, spare_pool;
- uint64_t vdtxg = 0;
- nvlist_t *label;
-
- if (spare_guid)
- *spare_guid = 0ULL;
-
- /*
- * Read the label, if any, and perform some basic sanity checks.
- */
- if ((label = vdev_label_read_config(vd)) == NULL)
- return (B_FALSE);
-
- /* The create txg is optional; vdtxg stays 0 when it is absent. */
- (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
- &vdtxg);
-
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
- &device_guid) != 0) {
- nvlist_free(label);
- return (B_FALSE);
- }
-
- /*
- * pool_guid and txg are only looked up for non-spare labels, and every
- * later use of them is guarded by the same state != POOL_STATE_SPARE
- * test.
- */
- if (state != POOL_STATE_SPARE &&
- (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
- &pool_guid) != 0 ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
- &txg) != 0)) {
- nvlist_free(label);
- return (B_FALSE);
- }
-
- nvlist_free(label);
-
- /*
- * Check to see if this device indeed belongs to the pool it claims to
- * be a part of. The only way this is allowed is if the device is a hot
- * spare (which we check for later on).
- */
- if (state != POOL_STATE_SPARE &&
- !spa_guid_exists(pool_guid, device_guid) &&
- !spa_spare_exists(device_guid, NULL))
- return (B_FALSE);
-
- /*
- * If the transaction group is zero, then this is an initialized (but
- * unused) label. This is only an error if the create transaction
- * on-disk is the same as the one we're using now, in which case the
- * user has attempted to add the same vdev multiple times in the same
- * transaction.
- */
- if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg)
- return (B_TRUE);
-
- /*
- * Check to see if this is a spare device. We do an explicit check for
- * spa_has_spare() here because it may be on our pending list of spares
- * to add.
- *
- * NOTE(review): spare_pool is read in the VDEV_LABEL_REPLACE case even
- * when only spa_has_spare() matched; this presumably relies on
- * spa_spare_exists() storing a value on a miss -- confirm.
- */
- if (spa_spare_exists(device_guid, &spare_pool) ||
- spa_has_spare(spa, device_guid)) {
- if (spare_guid)
- *spare_guid = device_guid;
-
- /* Any other 'reason' falls through to the ACTIVE check below. */
- switch (reason) {
- case VDEV_LABEL_CREATE:
- return (B_TRUE);
-
- case VDEV_LABEL_REPLACE:
- return (!spa_has_spare(spa, device_guid) ||
- spare_pool != 0ULL);
-
- case VDEV_LABEL_SPARE:
- return (spa_has_spare(spa, device_guid));
- }
- }
-
- /*
- * If the device is marked ACTIVE, then this device is in use by another
- * pool on the system.
- */
- return (state == POOL_STATE_ACTIVE);
-}
-
-/*
- * Initialize a vdev label. We check to make sure each leaf device is not in
- * use, and writable. We put down an initial label which we will later
- * overwrite with a complete label. Note that it's important to do this
- * sequentially, not in parallel, so that we catch cases of multiple use of the
- * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
- * itself.
- *
- * vd - vdev to label; interior vdevs recurse into their children
- * crtxg - creation txg recorded in the label (ZPOOL_CONFIG_CREATE_TXG)
- * reason - why the label is being written
- *
- * Returns 0 on success, EIO for a dead leaf, EBUSY if the leaf is in use,
- * ENAMETOOLONG or EINVAL if the config cannot be packed, or the error from
- * the label writes.
- */
-int
-vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
-{
- spa_t *spa = vd->vdev_spa;
- nvlist_t *label;
- vdev_phys_t *vp;
- vdev_boot_header_t *vb;
- uberblock_t *ub;
- zio_t *zio;
- int l, c, n;
- char *buf;
- size_t buflen;
- int error;
- uint64_t spare_guid;
-
- ASSERT(spa_config_held(spa, RW_WRITER));
-
- /* Children are handled one at a time; the first failure aborts. */
- for (c = 0; c < vd->vdev_children; c++)
- if ((error = vdev_label_init(vd->vdev_child[c],
- crtxg, reason)) != 0)
- return (error);
-
- if (!vd->vdev_ops->vdev_op_leaf)
- return (0);
-
- /*
- * Dead vdevs cannot be initialized.
- */
- if (vdev_is_dead(vd))
- return (EIO);
-
- /*
- * Determine if the vdev is in use.
- *
- * spare_guid is only assigned by this call, i.e. when reason is not
- * VDEV_LABEL_REMOVE; the test further down that reads it is guarded by
- * the same condition.
- */
- if (reason != VDEV_LABEL_REMOVE &&
- vdev_inuse(vd, crtxg, reason, &spare_guid))
- return (EBUSY);
-
- /* When removing, the device is expected to carry an in-use label. */
- ASSERT(reason != VDEV_LABEL_REMOVE ||
- vdev_inuse(vd, crtxg, reason, NULL));
-
- /*
- * If this is a request to add or replace a spare that is in use
- * elsewhere on the system, then we must update the guid (which was
- * initialized to a random value) to reflect the actual GUID (which is
- * shared between multiple pools).
- */
- if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) {
- vdev_t *pvd = vd->vdev_parent;
-
- /* Keep every ancestor's guid sum consistent with the new guid. */
- for (; pvd != NULL; pvd = pvd->vdev_parent) {
- pvd->vdev_guid_sum -= vd->vdev_guid;
- pvd->vdev_guid_sum += spare_guid;
- }
-
- vd->vdev_guid = vd->vdev_guid_sum = spare_guid;
-
- /*
- * If this is a replacement, then we want to fallthrough to the
- * rest of the code. If we're adding a spare, then it's already
- * labelled appropriately and we can just return.
- */
- if (reason == VDEV_LABEL_SPARE)
- return (0);
- ASSERT(reason == VDEV_LABEL_REPLACE);
- }
-
- /*
- * Initialize its label.
- */
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
-
- /*
- * Generate a label describing the pool and our top-level vdev.
- * We mark it as being from txg 0 to indicate that it's not
- * really part of an active pool just yet. The labels will
- * be written again with a meaningful txg by spa_sync().
- */
- if (reason == VDEV_LABEL_SPARE ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
- /*
- * For inactive hot spares, we generate a special label that
- * identifies as a mutually shared hot spare. We write the
- * label if we are adding a hot spare, or if we are removing an
- * active hot spare (in which case we want to revert the
- * labels).
- */
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- POOL_STATE_SPARE) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
- } else {
- label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
-
- /*
- * Add our creation time. This allows us to detect multiple
- * vdev uses as described above, and automatically expires if we
- * fail.
- */
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
- crtxg) == 0);
- }
-
- /* Pack the config directly into the vp_nvlist area of the buffer. */
- buf = vp->vp_nvlist;
- buflen = sizeof (vp->vp_nvlist);
-
- error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
- if (error != 0) {
- nvlist_free(label);
- zio_buf_free(vp, sizeof (vdev_phys_t));
- /* EFAULT means nvlist_pack ran out of room */
- return (error == EFAULT ? ENAMETOOLONG : EINVAL);
- }
-
- /*
- * Initialize boot block header.
- */
- vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
- bzero(vb, sizeof (vdev_boot_header_t));
- vb->vb_magic = VDEV_BOOT_MAGIC;
- vb->vb_version = VDEV_BOOT_VERSION;
- vb->vb_offset = VDEV_BOOT_OFFSET;
- vb->vb_size = VDEV_BOOT_SIZE;
-
- /*
- * Initialize uberblock template.
- */
- ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ub, VDEV_UBERBLOCK_SIZE(vd));
- *ub = spa->spa_uberblock;
- ub->ub_txg = 0;
-
- /*
- * Write everything in parallel.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
-
- /* Each label gets the config, the boot header and every uberblock slot. */
- for (l = 0; l < VDEV_LABELS; l++) {
-
- vdev_label_write(zio, vd, l, vp,
- offsetof(vdev_label_t, vl_vdev_phys),
- sizeof (vdev_phys_t), NULL, NULL);
-
- vdev_label_write(zio, vd, l, vb,
- offsetof(vdev_label_t, vl_boot_header),
- sizeof (vdev_boot_header_t), NULL, NULL);
-
- for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd), NULL, NULL);
- }
- }
-
- error = zio_wait(zio);
-
- /* The buffers are released only after zio_wait() has returned. */
- nvlist_free(label);
- zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
- zio_buf_free(vb, sizeof (vdev_boot_header_t));
- zio_buf_free(vp, sizeof (vdev_phys_t));
-
- /*
- * If this vdev hasn't been previously identified as a spare, then we
- * mark it as such only if a) we are labelling it as a spare, or b) it
- * exists as a spare elsewhere in the system.
- */
- if (error == 0 && !vd->vdev_isspare &&
- (reason == VDEV_LABEL_SPARE ||
- spa_spare_exists(vd->vdev_guid, NULL)))
- spa_spare_add(vd);
-
- return (error);
-}
-
-/*
- * ==========================================================================
- * uberblock load/sync
- * ==========================================================================
- */
-
-/*
- * Consider the following situation: txg is safely synced to disk. We've
- * written the first uberblock for txg + 1, and then we lose power. When we
- * come back up, we fail to see the uberblock for txg + 1 because, say,
- * it was on a mirrored device and the replica to which we wrote txg + 1
- * is now offline. If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a few seconds we'll have two
- * conflicting uberblocks on disk with the same txg. The solution is simple:
- * among uberblocks with equal txg, choose the one with the latest timestamp.
- *
- * Returns -1, 0 or 1 when ub1 orders before, equal to or after ub2, with
- * ub_txg as the primary key and ub_timestamp as the tie breaker.
- */
-static int
-vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
-{
- if (ub1->ub_txg != ub2->ub_txg)
- return (ub1->ub_txg < ub2->ub_txg ? -1 : 1);
-
- if (ub1->ub_timestamp != ub2->ub_timestamp)
- return (ub1->ub_timestamp < ub2->ub_timestamp ? -1 : 1);
-
- return (0);
-}
-
-/*
- * Done callback for the uberblock reads issued by vdev_uberblock_load().
- * If the read succeeded and the uberblock verifies, it replaces the current
- * best candidate (zio->io_private) when it compares greater; the update is
- * made under spa_uberblock_lock. The read buffer is always freed here.
- */
-static void
-vdev_uberblock_load_done(zio_t *zio)
-{
- uberblock_t *candidate = zio->io_data;
- uberblock_t *best = zio->io_private;
- spa_t *spa = zio->io_spa;
-
- ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
-
- if (zio->io_error == 0) {
- if (uberblock_verify(candidate) == 0) {
- mutex_enter(&spa->spa_uberblock_lock);
- if (vdev_uberblock_compare(candidate, best) > 0)
- *best = *candidate;
- mutex_exit(&spa->spa_uberblock_lock);
- }
- }
-
- zio_buf_free(zio->io_data, zio->io_size);
-}
-
-/*
- * Issue reads for every uberblock slot in every label of every live leaf
- * below 'vd'. Each read gets a freshly allocated buffer that
- * vdev_uberblock_load_done() frees; 'ubbest' is handed to that callback as
- * the running best candidate.
- */
-void
-vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
-{
- int l, c, n;
-
- for (c = 0; c < vd->vdev_children; c++) {
- vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
- }
-
- /* Only leaves that are not dead have labels worth reading. */
- if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
- return;
-
- for (l = 0; l < VDEV_LABELS; l++) {
- for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
- vdev_label_read(zio, vd, l,
- zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_load_done, ubbest);
- }
- }
-}
-
-/*
- * Write the uberblock to every label of all leaves of the specified vdev.
- * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
- *
- * This is the done callback for those writes: a write is counted in the
- * root zio's good-write counter only if it succeeded and the top-level vdev
- * has a non-zero vdev_ms_array.
- */
-static void
-vdev_uberblock_sync_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_root->io_private;
-
- if (zio->io_error != 0)
- return;
-
- if (zio->io_vd->vdev_top->vdev_ms_array == 0)
- return;
-
- atomic_add_64(good_writes, 1);
-}
-
-/*
- * Recursively queue writes of 'ub' to every label of every live leaf below
- * 'vd'. The slot within the uberblock array is chosen from the low bits of
- * 'txg'; 'ub' must already carry that txg.
- */
-static void
-vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, uint64_t txg)
-{
- int l, c, n;
-
- for (c = 0; c < vd->vdev_children; c++) {
- vdev_uberblock_sync(zio, ub, vd->vdev_child[c], txg);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
- return;
-
- /* Slot index is txg masked by VDEV_UBERBLOCK_COUNT(vd) - 1. */
- n = txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
-
- ASSERT(ub->ub_txg == txg);
-
- for (l = 0; l < VDEV_LABELS; l++) {
- vdev_label_write(zio, vd, l, ub,
- VDEV_UBERBLOCK_OFFSET(vd, n),
- VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_sync_done, NULL);
- }
-
- dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
-}
-
-/*
- * Write 'ub' to all leaves below 'vd' and wait for the result. The
- * uberblock is first copied into a zeroed I/O buffer. Returns 0 if at
- * least one write was counted as good, EIO if nothing was counted and no
- * error was reported, and the zio error otherwise.
- */
-static int
-vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *vd, uint64_t txg)
-{
- uberblock_t *ubbuf;
- size_t size;
- uint64_t *good_writes;
- zio_t *zio;
- int error;
-
- if (vd->vdev_top)
- size = VDEV_UBERBLOCK_SIZE(vd);
- else
- size = SPA_MAXBLOCKSIZE;
-
- ubbuf = zio_buf_alloc(size);
- bzero(ubbuf, size);
- *ubbuf = *ub;
-
- /* Counter bumped by vdev_uberblock_sync_done() via the root zio. */
- good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
-
- zio = zio_root(spa, NULL, good_writes,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
-
- vdev_uberblock_sync(zio, ubbuf, vd, txg);
-
- error = zio_wait(zio);
-
- if (*good_writes != 0) {
- /* Some writes landed, so an error is only a partial failure. */
- if (error) {
- dprintf("partial success: good_writes = %llu\n",
- *good_writes);
- error = 0;
- }
- } else if (error == 0) {
- /*
- * It's possible to have no good writes and no error if every
- * vdev is in the CANT_OPEN state.
- */
- error = EIO;
- }
-
- kmem_free(good_writes, sizeof (uint64_t));
- zio_buf_free(ubbuf, size);
-
- return (error);
-}
-
-/*
- * Sync out an individual vdev.
- */
-
-/*
- * Done callback for label writes: counts each error-free write in the
- * counter hung off the root zio's io_private.
- */
-static void
-vdev_sync_label_done(zio_t *zio)
-{
- uint64_t *good_writes = zio->io_root->io_private;
-
- if (zio->io_error != 0)
- return;
-
- atomic_add_64(good_writes, 1);
-}
-
-/*
- * Recursively queue a write of label 'l' for every live leaf below 'vd',
- * using a config generated for 'txg'. If the config cannot be packed, no
- * write is issued for that leaf and no error is reported from here.
- */
-static void
-vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
-{
- nvlist_t *label;
- vdev_phys_t *vp;
- char *buf;
- size_t buflen;
- int c;
-
- for (c = 0; c < vd->vdev_children; c++) {
- vdev_sync_label(zio, vd->vdev_child[c], l, txg);
- }
-
- if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
- return;
-
- /*
- * Generate a label describing the top-level config to which we belong.
- */
- label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
-
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
-
- buf = vp->vp_nvlist;
- buflen = sizeof (vp->vp_nvlist);
-
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
- vdev_label_write(zio, vd, l, vp,
- offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
- vdev_sync_label_done, NULL);
- }
-
- /*
- * NOTE(review): vp is freed as soon as the write has been handed off;
- * this presumably relies on the write path keeping its own copy of
- * the data -- confirm.
- */
- zio_buf_free(vp, sizeof (vdev_phys_t));
- nvlist_free(label);
-
- dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
-}
-
-/*
- * Write label 'l' to every leaf below the top-level vdev 'vd' and wait for
- * the result. Returns 0 if at least one write succeeded, ENODEV if none
- * succeeded and no error was reported, and the zio error otherwise.
- */
-static int
-vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
-{
- uint64_t *good_writes;
- zio_t *zio;
- int error;
-
- ASSERT(vd == vd->vdev_top);
-
- /* Counter bumped by vdev_sync_label_done() via the root zio. */
- good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
-
- zio = zio_root(vd->vdev_spa, NULL, good_writes,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
-
- /*
- * Recursively kick off writes to all labels.
- */
- vdev_sync_label(zio, vd, l, txg);
-
- error = zio_wait(zio);
-
- if (*good_writes != 0) {
- /* Some writes landed, so an error is only a partial failure. */
- if (error) {
- dprintf("partial success: good_writes = %llu\n",
- *good_writes);
- error = 0;
- }
- } else if (error == 0) {
- error = ENODEV;
- }
-
- kmem_free(good_writes, sizeof (uint64_t));
-
- return (error);
-}
-
-/*
- * Sync the entire vdev configuration.
- *
- * The order of operations is carefully crafted to ensure that
- * if the system panics or loses power at any time, the state on disk
- * is still transactionally consistent. The in-line comments below
- * describe the failure semantics at each stage.
- *
- * Moreover, it is designed to be idempotent: if vdev_config_sync() fails
- * at any time, you can just call it again, and it will resume its work.
- *
- * uvd - root of the vdev tree that receives the new uberblock
- * txg - transaction group being synced
- *
- * Returns 0 on success or when there is nothing to do, otherwise the first
- * error from a label or uberblock sync. Errors from the cache-flush ioctls
- * are ignored.
- */
-int
-vdev_config_sync(vdev_t *uvd, uint64_t txg)
-{
- spa_t *spa = uvd->vdev_spa;
- uberblock_t *ub = &spa->spa_uberblock;
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- zio_t *zio;
- int l, error;
-
- ASSERT(ub->ub_txg <= txg);
-
- /*
- * If this isn't a resync due to I/O errors, and nothing changed
- * in this transaction group, and the vdev configuration hasn't changed,
- * then there's nothing to do.
- */
- if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
- list_is_empty(&spa->spa_dirty_list)) {
- dprintf("nothing to sync in %s in txg %llu\n",
- spa_name(spa), txg);
- return (0);
- }
-
- /* Nothing is written for txgs beyond the pool's freeze txg. */
- if (txg > spa_freeze_txg(spa))
- return (0);
-
- ASSERT(txg <= spa->spa_final_txg);
-
- dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
-
- /*
- * Flush the write cache of every disk that's been written to
- * in this transaction group. This ensures that all blocks
- * written in this txg will be committed to stable storage
- * before any uberblock that references them.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
- for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
- vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
- zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- }
- (void) zio_wait(zio);
-
- /*
- * Sync out the even labels (L0, L2) for every dirty vdev. If the
- * system dies in the middle of this process, that's OK: all of the
- * even labels that made it to disk will be newer than any uberblock,
- * and will therefore be considered invalid. The odd labels (L1, L3),
- * which have not yet been touched, will still be valid.
- */
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- for (l = 0; l < VDEV_LABELS; l++) {
- if (l & 1)
- continue;
- if ((error = vdev_sync_labels(vd, l, txg)) != 0)
- return (error);
- }
- }
-
- /*
- * Flush the new labels to disk. This ensures that all even-label
- * updates are committed to stable storage before the uberblock update.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- }
- (void) zio_wait(zio);
-
- /*
- * Sync the uberblocks to all vdevs in the tree specified by uvd.
- * If the system dies in the middle of this step, there are two cases
- * to consider, and the on-disk state is consistent either way:
- *
- * (1) If none of the new uberblocks made it to disk, then the
- * previous uberblock will be the newest, and the odd labels
- * (which had not yet been touched) will be valid with respect
- * to that uberblock.
- *
- * (2) If one or more new uberblocks made it to disk, then they
- * will be the newest, and the even labels (which had all
- * been successfully committed) will be valid with respect
- * to the new uberblocks.
- */
- if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
- return (error);
-
- /*
- * Flush the uberblocks to disk. This ensures that the odd labels
- * are no longer needed (because the new uberblocks and the even
- * labels are safely on disk), so it is safe to overwrite them.
- */
- (void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
-
- /*
- * Sync out odd labels for every dirty vdev. If the system dies
- * in the middle of this process, the even labels and the new
- * uberblocks will suffice to open the pool. The next time
- * the pool is opened, the first thing we'll do -- before any
- * user data is modified -- is mark every vdev dirty so that
- * all labels will be brought up to date.
- */
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- for (l = 0; l < VDEV_LABELS; l++) {
- if ((l & 1) == 0)
- continue;
- if ((error = vdev_sync_labels(vd, l, txg)) != 0)
- return (error);
- }
- }
-
- /*
- * Flush the new labels to disk. This ensures that all odd-label
- * updates are committed to stable storage before the next
- * transaction group begins.
- */
- zio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
- for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
- vd = list_next(&spa->spa_dirty_list, vd)) {
- zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- }
- (void) zio_wait(zio);
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
deleted file mode 100644
index 73d1a83..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for mirroring.
- */
-
-/*
- * Per-child state for one mirror I/O. One entry exists for every child
- * (or, for a root map, every DVA) covered by the enclosing mirror_map_t.
- */
-typedef struct mirror_child {
- vdev_t *mc_vd; /* vdev the child I/O is issued to */
- uint64_t mc_offset; /* offset used for I/O on mc_vd */
- int mc_error; /* result recorded for this child (0 if none) */
- short mc_tried; /* set once an I/O completed or child was dead */
- short mc_skipped; /* set when child was passed over (dead or DTL) */
-} mirror_child_t;
-
-/*
- * Map of all children taking part in one mirror I/O; stored in zio->io_vsd.
- * mm_child[] is declared with one element but the map is allocated with
- * room for mm_children entries (see vdev_mirror_map_alloc()).
- */
-typedef struct mirror_map {
- int mm_children; /* number of valid entries in mm_child[] */
- int mm_replacing; /* vdev is a replacing or spare vdev */
- int mm_preferred; /* index of the child to try first for reads */
- int mm_root; /* B_TRUE if built from the bp's DVAs (no io_vd) */
- mirror_child_t mm_child[1]; /* really mm_children entries */
-} mirror_map_t;
-
-/*
- * For a plain (non-replacing) mirror the preferred read child is
- * (io_offset >> vdev_mirror_shift) % children, so with the default of 21
- * the preferred child changes every 2^21 bytes of offset.
- */
-int vdev_mirror_shift = 21;
-
-/*
- * Build the mirror_map_t for this zio and attach it to zio->io_vsd.
- *
- * When the zio has a vdev, the map has one entry per child of that vdev,
- * each using the zio's own offset. When it has none, the map has one
- * entry per DVA of the block pointer, each resolved to its top-level vdev
- * and DVA offset. The map is zero-filled and sized for exactly
- * mm_children entries; vdev_mirror_map_free() releases it.
- */
-static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
-{
-	vdev_t *vd = zio->io_vd;
-	mirror_map_t *mm;
-	mirror_child_t *mc;
-	int nchild, i;
-
-	if (vd != NULL) {
-		nchild = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchild]),
-		    KM_SLEEP);
-		mm->mm_children = nchild;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-
-		/* Replacing/spare vdevs always start with child 0. */
-		if (mm->mm_replacing)
-			mm->mm_preferred = 0;
-		else
-			mm->mm_preferred =
-			    (zio->io_offset >> vdev_mirror_shift) % nchild;
-		mm->mm_root = B_FALSE;
-
-		for (i = 0; i < mm->mm_children; i++) {
-			mc = &mm->mm_child[i];
-			mc->mc_vd = vd->vdev_child[i];
-			mc->mc_offset = zio->io_offset;
-		}
-	} else {
-		dva_t *dva = zio->io_bp->blk_dva;
-		spa_t *spa = zio->io_spa;
-		int picked;
-
-		nchild = BP_GET_NDVAS(zio->io_bp);
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[nchild]),
-		    KM_SLEEP);
-		mm->mm_children = nchild;
-		mm->mm_replacing = B_FALSE;
-		mm->mm_preferred = spa_get_random(nchild);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Walk the lower-index DVAs. Any of them that lives on the
-		 * same vdev as the randomly picked one is preferred instead:
-		 * it was likely allocated from the primary metaslab in use
-		 * at the time and so has better locality with single-copy
-		 * data. The lowest matching index wins.
-		 */
-		picked = mm->mm_preferred;
-		for (i = picked - 1; i >= 0; i--) {
-			if (DVA_GET_VDEV(&dva[i]) == DVA_GET_VDEV(&dva[picked]))
-				mm->mm_preferred = i;
-		}
-
-		for (i = 0; i < mm->mm_children; i++) {
-			mc = &mm->mm_child[i];
-
-			mc->mc_vd = vdev_lookup_top(spa,
-			    DVA_GET_VDEV(&dva[i]));
-			mc->mc_offset = DVA_GET_OFFSET(&dva[i]);
-		}
-	}
-
-	zio->io_vsd = mm;
-	return (mm);
-}
-
-/*
- * Release the mirror map hanging off zio->io_vsd and clear the pointer.
- * The size passed to kmem_free() mirrors the one used at allocation time.
- */
-static void
-vdev_mirror_map_free(zio_t *zio)
-{
-	mirror_map_t *map = zio->io_vsd;
-	size_t map_size = offsetof(mirror_map_t, mm_child[map->mm_children]);
-
-	kmem_free(map, map_size);
-	zio->io_vsd = NULL;
-}
-
-/*
- * Open every child of a mirror vdev.
- *
- * On return *asize has been lowered to the smallest asize among the
- * children that opened (a starting value of 0 acts as "no limit" thanks
- * to the unsigned "- 1 ... + 1" arithmetic) and *ashift raised to the
- * largest child ashift.
- *
- * Returns EINVAL if the vdev has no children, the last child error if
- * every child failed to open, and 0 otherwise.
- */
-static int
-vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
-{
-	vdev_t *child;
-	uint64_t i;
-	int failed = 0;
-	int err, last_err = 0;
-
-	if (vd->vdev_children == 0) {
-		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		return (EINVAL);
-	}
-
-	for (i = 0; i < vd->vdev_children; i++) {
-		child = vd->vdev_child[i];
-
-		err = vdev_open(child);
-		if (err == 0) {
-			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
-			*ashift = MAX(*ashift, child->vdev_ashift);
-		} else {
-			last_err = err;
-			failed++;
-		}
-	}
-
-	if (failed == vd->vdev_children) {
-		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
-		return (last_err);
-	}
-
-	return (0);
-}
-
-/*
- * Close every child of a mirror vdev, in index order.
- */
-static void
-vdev_mirror_close(vdev_t *vd)
-{
-	uint64_t i = 0;
-
-	while (i < vd->vdev_children) {
-		vdev_close(vd->vdev_child[i]);
-		i++;
-	}
-}
-
-/*
- * Completion callback for an ordinary mirror child I/O: note that the
- * child was tried (not skipped) and remember the error it finished with.
- */
-static void
-vdev_mirror_child_done(zio_t *zio)
-{
-	mirror_child_t *child = zio->io_private;
-
-	child->mc_tried = 1;
-	child->mc_skipped = 0;
-	child->mc_error = zio->io_error;
-}
-
-/*
- * Completion callback for a scrub read of one mirror child.
- *
- * Each scrub child reads into its own buffer. A child that succeeded
- * copies its data into the parent's buffer under the parent's io_lock.
- * The private buffer is then freed and the child's result recorded.
- */
-static void
-vdev_mirror_scrub_done(zio_t *zio)
-{
-	mirror_child_t *child = zio->io_private;
-	int error = zio->io_error;
-
-	if (error == 0) {
-		zio_t *parent = zio->io_parent;
-
-		mutex_enter(&parent->io_lock);
-		ASSERT3U(zio->io_size, >=, parent->io_size);
-		bcopy(zio->io_data, parent->io_data, parent->io_size);
-		mutex_exit(&parent->io_lock);
-	}
-
-	zio_buf_free(zio->io_data, zio->io_size);
-
-	child->mc_error = error;
-	child->mc_tried = 1;
-	child->mc_skipped = 0;
-}
-
-/*
- * Completion callback for the null zio that parents all repair writes.
- * Its private pointer is the original mirror zio (also its parent), whose
- * map can be freed now that the repairs are done.
- */
-static void
-vdev_mirror_repair_done(zio_t *zio)
-{
-	zio_t *mirror_zio = zio->io_private;
-
-	ASSERT(mirror_zio == zio->io_parent);
-	vdev_mirror_map_free(mirror_zio);
-}
-
-/*
- * Choose the child to read from.
- *
- * First pass: starting at mm_preferred and wrapping around, return the
- * first live child that has been neither tried nor skipped and whose DTL
- * does not contain the block's txg. Dead children are marked ENXIO,
- * tried and skipped; children whose DTL covers the txg are marked ESTALE
- * and skipped.
- *
- * Second pass: return the lowest-index child that has not been tried.
- *
- * Returns the child index, or -1 when nothing is left to try.
- */
-static int
-vdev_mirror_child_select(zio_t *zio)
-{
-	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
-	uint64_t txg = zio->io_txg;
-	int remaining, idx;
-
-	ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
-
-	idx = mm->mm_preferred;
-	for (remaining = mm->mm_children; remaining > 0; remaining--, idx++) {
-		if (idx >= mm->mm_children)
-			idx = 0;
-		mc = &mm->mm_child[idx];
-
-		if (mc->mc_tried || mc->mc_skipped)
-			continue;
-
-		if (vdev_is_dead(mc->mc_vd)) {
-			/* Known to be inaccessible: don't even try. */
-			mc->mc_error = ENXIO;
-			mc->mc_tried = 1;	/* don't even try */
-			mc->mc_skipped = 1;
-			continue;
-		}
-
-		if (vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) {
-			mc->mc_error = ESTALE;
-			mc->mc_skipped = 1;
-			continue;
-		}
-
-		return (idx);
-	}
-
-	/*
-	 * Every device is either missing or has this txg in its DTL.
-	 * Fall back to any child that has not actually been tried yet.
-	 */
-	for (idx = 0; idx < mm->mm_children; idx++) {
-		if (!mm->mm_child[idx].mc_tried)
-			return (idx);
-	}
-
-	/*
-	 * Every child failed. There's no place left to look.
-	 */
-	return (-1);
-}
-
-/*
- * Start a mirror I/O: build the child map, then issue the child I/Os.
- *
- * - Scrub reads (except on replacing/spare vdevs) read every child, each
- * into its own freshly allocated buffer.
- * - Other reads go to the single child chosen by
- * vdev_mirror_child_select(), or to no child if it returns -1.
- * - Writes go to every child, except that a resilver write to a
- * replacing/spare vdev goes only to the last child when the first
- * child's DTL does not contain the txg.
- *
- * In every case the function ends by calling zio_wait_children_done().
- */
-static void
-vdev_mirror_io_start(zio_t *zio)
-{
- mirror_map_t *mm;
- mirror_child_t *mc;
- int c, children;
-
- mm = vdev_mirror_map_alloc(zio);
-
- if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
- /*
- * For scrubbing reads we need to allocate a read
- * buffer for each child and issue reads to all
- * children. If any child succeeds, it will copy its
- * data into zio->io_data in vdev_mirror_scrub_done.
- */
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- zio_buf_alloc(zio->io_size), zio->io_size,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL,
- vdev_mirror_scrub_done, mc));
- }
- zio_wait_children_done(zio);
- return;
- }
- /*
- * For normal reads just pick one child.
- */
- c = vdev_mirror_child_select(zio);
- /* One child to issue, or none when the selection returned -1. */
- children = (c >= 0);
- } else {
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-
- /*
- * If this is a resilvering I/O to a replacing vdev,
- * only the last child should be written -- unless the
- * first child happens to have a DTL entry here as well.
- * All other writes go to all children.
- */
- if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
- !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
- zio->io_txg, 1)) {
- c = mm->mm_children - 1;
- children = 1;
- } else {
- c = 0;
- children = mm->mm_children;
- }
- }
-
- /* Issue 'children' consecutive child I/Os starting at index c. */
- while (children--) {
- mc = &mm->mm_child[c];
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
- zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
- ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
- c++;
- }
-
- zio_wait_children_done(zio);
-}
-
-/*
- * Completion handler for a mirror I/O.
- *
- * Tallies the per-child results and then:
- * - for writes, frees the map and advances; one good copy is enough to
- * clear the error;
- * - for reads with no good copy, reissues the read to another child if
- * vdev_mirror_child_select() still finds one;
- * - for reads with a good copy, when spa_mode has FWRITE set and there
- * were unexpected errors (or this is a resilver, or a scrub of a
- * replacing/spare vdev), writes the good data back to the children
- * that need it before the map is freed;
- * - otherwise frees the map and advances to the next stage.
- */
-static void
-vdev_mirror_io_done(zio_t *zio)
-{
- mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
- int c;
- int good_copies = 0;
- int unexpected_errors = 0;
-
- zio->io_error = 0;
- zio->io_numerrors = 0;
-
- for (c = 0; c < mm->mm_children; c++) {
- mc = &mm->mm_child[c];
-
- if (mc->mc_tried && mc->mc_error == 0) {
- good_copies++;
- continue;
- }
-
- /*
- * We preserve any EIOs because those may be worth retrying;
- * whereas ECKSUM and ENXIO are more likely to be persistent.
- */
- if (mc->mc_error) {
- if (zio->io_error != EIO)
- zio->io_error = mc->mc_error;
- /* Errors on skipped children were anticipated. */
- if (!mc->mc_skipped)
- unexpected_errors++;
- zio->io_numerrors++;
- }
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * XXX -- for now, treat partial writes as success.
- * XXX -- For a replacing vdev, we need to make sure the
- * new child succeeds.
- */
- /* XXPOLICY */
- if (good_copies != 0)
- zio->io_error = 0;
- vdev_mirror_map_free(zio);
- zio_next_stage(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- /*
- * If we don't have a good copy yet, keep trying other children.
- */
- /* XXPOLICY */
- if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
- ASSERT(c >= 0 && c < mm->mm_children);
- mc = &mm->mm_child[c];
- dprintf("retrying i/o (err=%d) on child %s\n",
- zio->io_error, vdev_description(mc->mc_vd));
- zio->io_error = 0;
- zio_vdev_io_redone(zio);
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
- ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_mirror_child_done, mc));
- zio_wait_children_done(zio);
- return;
- }
-
- /* XXPOLICY */
- if (good_copies)
- zio->io_error = 0;
- else
- ASSERT(zio->io_error != 0);
-
- if (good_copies && (spa_mode & FWRITE) &&
- (unexpected_errors ||
- (zio->io_flags & ZIO_FLAG_RESILVER) ||
- ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
- zio_t *rio;
-
- /*
- * Use the good data we have in hand to repair damaged children.
- *
- * We issue all repair I/Os as children of 'rio' to arrange
- * that vdev_mirror_map_free(zio) will be invoked after all
- * repairs complete, but before we advance to the next stage.
- */
- rio = zio_null(zio, zio->io_spa,
- vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
-
- for (c = 0; c < mm->mm_children; c++) {
- /*
- * Don't rewrite known good children.
- * Not only is it unnecessary, it could
- * actually be harmful: if the system lost
- * power while rewriting the only good copy,
- * there would be no good copies left!
- */
- mc = &mm->mm_child[c];
-
- if (mc->mc_error == 0) {
- if (mc->mc_tried)
- continue;
- /*
- * Untried child with no recorded error: outside a
- * scrub, leave it alone unless its DTL covers this
- * txg.
- */
- if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
- !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
- zio->io_txg, 1))
- continue;
- mc->mc_error = ESTALE;
- }
-
- dprintf("resilvered %s @ 0x%llx error %d\n",
- vdev_description(mc->mc_vd), mc->mc_offset,
- mc->mc_error);
-
- zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
- mc->mc_offset, zio->io_data, zio->io_size,
- ZIO_TYPE_WRITE, zio->io_priority,
- ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
- }
-
- zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
- }
-
- vdev_mirror_map_free(zio);
- zio_next_stage(zio);
-}
-
-/*
- * Derive the mirror's state from its children's counts:
- * all children faulted -> CANT_OPEN (no replicas);
- * any child faulted or degraded -> DEGRADED;
- * otherwise -> HEALTHY.
- */
-static void
-vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
-{
-	if (faulted == vd->vdev_children) {
-		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
-		    VDEV_AUX_NO_REPLICAS);
-		return;
-	}
-
-	if (degraded + faulted == 0) {
-		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-		return;
-	}
-
-	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
-}
-
-/*
- * Operations vector for mirror vdevs. The initializer is positional;
- * judging by the function names the slots are open, close, asize,
- * io_start, io_done and state_change, followed by the type name and the
- * leaf flag -- confirm against the vdev_ops_t definition.
- */
-vdev_ops_t vdev_mirror_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- VDEV_TYPE_MIRROR, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
-
-/*
- * Operations vector for replacing vdevs. It reuses every mirror routine;
- * only the type name differs. vdev_mirror_map_alloc() recognizes this
- * vector by address and sets mm_replacing for it.
- */
-vdev_ops_t vdev_replacing_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- VDEV_TYPE_REPLACING, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
-
-/*
- * Operations vector for spare vdevs. It reuses every mirror routine;
- * only the type name differs. vdev_mirror_map_alloc() recognizes this
- * vector by address and sets mm_replacing for it, just as it does for
- * replacing vdevs.
- */
-vdev_ops_t vdev_spare_ops = {
- vdev_mirror_open,
- vdev_mirror_close,
- vdev_default_asize,
- vdev_mirror_io_start,
- vdev_mirror_io_done,
- vdev_mirror_state_change,
- VDEV_TYPE_SPARE, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
deleted file mode 100644
index b35f4a5..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * The 'missing' vdev is a special vdev type used only during import. It
- * signifies a placeholder in the root vdev for some vdev that we know is
- * missing. We pass it down to the kernel to allow the rest of the
- * configuration to be parsed and an attempt made to open all available devices.
- * Because its GUID is always 0, we know that the guid sum will mismatch and we
- * won't be able to open the pool anyway.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zio.h>
-
-/*
- * "Open" a missing vdev by reporting the minimum device size and block
- * shift and claiming success.
- *
- * Failing here would be more honest, but it would leave the root vdev
- * faulted with VDEV_AUX_NO_REPLICAS when the state we actually want is
- * VDEV_AUX_BAD_GUID_SUM. Pretending to succeed is safe because the GUID
- * sum check fails before any attempt is made to open the pool.
- */
-/* ARGSUSED */
-static int
-vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
-	*ashift = SPA_MINBLOCKSHIFT;
-	*psize = SPA_MINDEVSIZE;
-
-	return (0);
-}
-
-/*
- * Close a missing vdev. vdev_missing_open() acquires nothing, so there
- * is nothing to release.
- */
-/* ARGSUSED */
-static void
-vdev_missing_close(vdev_t *vd)
-{
-}
-
-/*
- * I/O to a missing vdev is never performed: mark the zio as failed with
- * ENOTSUP and hand it to the next pipeline stage asynchronously.
- */
-/* ARGSUSED */
-static void
-vdev_missing_io_start(zio_t *zio)
-{
- zio->io_error = ENOTSUP;
- zio_next_stage_async(zio);
-}
-
-/*
- * Nothing to finish for a missing vdev; just advance the zio to its next
- * pipeline stage.
- */
-/* ARGSUSED */
-static void
-vdev_missing_io_done(zio_t *zio)
-{
- zio_next_stage(zio);
-}
-
-/*
- * Operations vector for the 'missing' placeholder vdev. The initializer
- * is positional; the NULL slot sits where the mirror vectors supply their
- * state-change routine, so presumably no state-change hook is installed
- * for this type -- confirm against the vdev_ops_t definition.
- */
-vdev_ops_t vdev_missing_ops = {
- vdev_missing_open,
- vdev_missing_close,
- vdev_default_asize,
- vdev_missing_io_start,
- vdev_missing_io_done,
- NULL,
- VDEV_TYPE_MISSING, /* name of this vdev type */
- B_TRUE /* leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
deleted file mode 100644
index 8ef524f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/avl.h>
-
-/*
- * These tunables are for performance analysis.
- */
-/*
- * zfs_vdev_max_pending is the maximum number of i/os concurrently
- * pending to each device. zfs_vdev_min_pending is the initial number
- * of i/os pending to each device (before it starts ramping up to
- * max_pending).
- *
- * In this file, vdev_queue_io() issues new i/os against the min_pending
- * limit, while vdev_queue_io_done() issues against the max_pending limit.
- */
-int zfs_vdev_max_pending = 35;
-int zfs_vdev_min_pending = 4;
-
-/*
- * deadline = pri + (LBOLT >> time_shift)
- *
- * vdev_queue_io() computes io_deadline as
- * (io_timestamp >> zfs_vdev_time_shift) + io_priority.
- */
-int zfs_vdev_time_shift = 6;
-
-/*
- * exponential I/O issue ramp-up rate: each completed i/o lets
- * vdev_queue_io_done() issue up to this many queued i/os.
- */
-int zfs_vdev_ramp_rate = 2;
-
-/*
- * i/os will be aggregated into a single large i/o up to
- * zfs_vdev_aggregation_limit bytes long.
- */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
-
-/*
- * Virtual device vector for disk I/O scheduling.
- */
-/*
- * AVL comparator for the deadline tree: order zios by deadline, then by
- * offset, and finally by address so that distinct zios never compare
- * equal.
- */
-int
-vdev_queue_deadline_compare(const void *x1, const void *x2)
-{
-	const zio_t *a = x1;
-	const zio_t *b = x2;
-
-	if (a->io_deadline != b->io_deadline)
-		return (a->io_deadline < b->io_deadline ? -1 : 1);
-
-	if (a->io_offset != b->io_offset)
-		return (a->io_offset < b->io_offset ? -1 : 1);
-
-	if (a != b)
-		return (a < b ? -1 : 1);
-
-	return (0);
-}
-
-/*
- * AVL comparator for the read, write and pending trees: order zios by
- * offset, breaking ties by address so that distinct zios never compare
- * equal.
- */
-int
-vdev_queue_offset_compare(const void *x1, const void *x2)
-{
-	const zio_t *a = x1;
-	const zio_t *b = x2;
-
-	if (a->io_offset != b->io_offset)
-		return (a->io_offset < b->io_offset ? -1 : 1);
-
-	if (a != b)
-		return (a < b ? -1 : 1);
-
-	return (0);
-}
-
-/*
- * Set up a vdev's I/O queue: its lock, the deadline-ordered tree, and the
- * three offset-ordered trees (read, write, pending). The offset trees
- * all link zios through the same io_offset_node.
- */
-void
-vdev_queue_init(vdev_t *vd)
-{
-	vdev_queue_t *queue = &vd->vdev_queue;
-	const size_t deadline_link = offsetof(struct zio, io_deadline_node);
-	const size_t offset_link = offsetof(struct zio, io_offset_node);
-
-	mutex_init(&queue->vq_lock, NULL, MUTEX_DEFAULT, NULL);
-
-	avl_create(&queue->vq_deadline_tree, vdev_queue_deadline_compare,
-	    sizeof (zio_t), deadline_link);
-
-	avl_create(&queue->vq_read_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offset_link);
-
-	avl_create(&queue->vq_write_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offset_link);
-
-	avl_create(&queue->vq_pending_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offset_link);
-}
-
-/*
- * Tear down a vdev's I/O queue: destroy the four trees created by
- * vdev_queue_init(), then the lock.
- */
-void
-vdev_queue_fini(vdev_t *vd)
-{
-	vdev_queue_t *queue = &vd->vdev_queue;
-
-	avl_destroy(&queue->vq_deadline_tree);
-	avl_destroy(&queue->vq_read_tree);
-	avl_destroy(&queue->vq_write_tree);
-	avl_destroy(&queue->vq_pending_tree);
-
-	mutex_destroy(&queue->vq_lock);
-}
-
-/*
- * Queue a zio: insert it into the deadline tree and into the offset tree
- * it was assigned (zio->io_vdev_tree, set by vdev_queue_io()).
- */
-static void
-vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
-{
- avl_add(&vq->vq_deadline_tree, zio);
- avl_add(zio->io_vdev_tree, zio);
-}
-
-/*
- * Dequeue a zio: remove it from the deadline tree and from its assigned
- * offset tree. This is the inverse of vdev_queue_io_add().
- */
-static void
-vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
-{
- avl_remove(&vq->vq_deadline_tree, zio);
- avl_remove(zio->io_vdev_tree, zio);
-}
-
-/*
- * Completion callback for an aggregate I/O.
- *
- * Walks the delegate list in order. For reads, each delegate receives its
- * slice of the aggregate buffer. Every delegate inherits the aggregate's
- * error, is unlinked from the list and advanced to its next stage. The
- * aggregate buffer is freed at the end.
- */
-static void
-vdev_queue_agg_io_done(zio_t *aio)
-{
-	uint64_t consumed = 0;
-	zio_t *dio;
-
-	for (;;) {
-		dio = aio->io_delegate_list;
-		if (dio == NULL)
-			break;
-
-		if (aio->io_type == ZIO_TYPE_READ) {
-			bcopy((char *)aio->io_data + consumed, dio->io_data,
-			    dio->io_size);
-		}
-		consumed += dio->io_size;
-
-		/* Unlink the delegate before handing it on. */
-		aio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		dio->io_error = aio->io_error;
-		zio_next_stage(dio);
-	}
-	ASSERT3U(consumed, ==, aio->io_size);
-
-	zio_buf_free(aio->io_data, aio->io_size);
-}
-
-/*
- * True when 'nio' starts exactly where 'io' ends, i.e. the two i/os cover
- * contiguous offsets with no gap or overlap. Note that 'io' is evaluated
- * twice.
- */
-#define IS_ADJACENT(io, nio) \
- ((io)->io_offset + (io)->io_size == (nio)->io_offset)
-
-/*
- * Signature of the routine used to issue a zio picked by
- * vdev_queue_io_to_issue() (zio_nowait or zio_next_stage in this file).
- */
-typedef void zio_issue_func_t(zio_t *);
-
-/*
- * Pick the next i/o to issue from the queue.
- *
- * Returns NULL (with *funcp set to NULL) when the pending tree already
- * holds pending_limit entries or when nothing is queued.
- *
- * Otherwise the candidate is the first entry of the deadline tree.
- * Neighbours in the candidate's offset tree that are exactly adjacent to
- * it are chained through io_delegate_next, first backwards and then
- * forwards, for as long as the combined size stays within
- * zfs_vdev_aggregation_limit.
- *
- * If anything was chained, a single aggregate child zio covering the
- * whole range is created. The delegates are removed from the queue and
- * bypassed, and the aggregate is returned with *funcp == zio_nowait.
- * If nothing was chained, the candidate itself is dequeued and returned
- * with *funcp == zio_next_stage.
- *
- * In both cases the returned zio has been added to the pending tree.
- * The caller must hold vq->vq_lock.
- */
-static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
- zio_issue_func_t **funcp)
-{
- zio_t *fio, *lio, *aio, *dio;
- avl_tree_t *tree;
- uint64_t size;
-
- ASSERT(MUTEX_HELD(&vq->vq_lock));
-
- *funcp = NULL;
-
- if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
- avl_numnodes(&vq->vq_deadline_tree) == 0)
- return (NULL);
-
- /* fio/lio track the first and last i/o of the run being built. */
- fio = lio = avl_first(&vq->vq_deadline_tree);
-
- tree = fio->io_vdev_tree;
- size = fio->io_size;
-
- /* Extend the run backwards over adjacent lower-offset i/os. */
- while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
- size + dio->io_size <= zfs_vdev_aggregation_limit) {
- dio->io_delegate_next = fio;
- fio = dio;
- size += dio->io_size;
- }
-
- /* Extend the run forwards over adjacent higher-offset i/os. */
- while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
- size + dio->io_size <= zfs_vdev_aggregation_limit) {
- lio->io_delegate_next = dio;
- lio = dio;
- size += dio->io_size;
- }
-
- if (fio != lio) {
- char *buf = zio_buf_alloc(size);
- uint64_t offset = 0;
- int nagg = 0;
-
- ASSERT(size <= zfs_vdev_aggregation_limit);
-
- aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
- fio->io_offset, buf, size, fio->io_type,
- ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_NOBOOKMARK,
- vdev_queue_agg_io_done, NULL);
-
- aio->io_delegate_list = fio;
-
- /*
- * For writes, gather each delegate's data into the aggregate
- * buffer; for reads the copy happens on completion in
- * vdev_queue_agg_io_done().
- */
- for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
- ASSERT(dio->io_type == aio->io_type);
- ASSERT(dio->io_vdev_tree == tree);
- if (dio->io_type == ZIO_TYPE_WRITE)
- bcopy(dio->io_data, buf + offset, dio->io_size);
- offset += dio->io_size;
- vdev_queue_io_remove(vq, dio);
- zio_vdev_io_bypass(dio);
- nagg++;
- }
-
- ASSERT(offset == size);
-
- dprintf("%5s T=%llu off=%8llx agg=%3d "
- "old=%5llx new=%5llx\n",
- zio_type_name[fio->io_type],
- fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
-
- avl_add(&vq->vq_pending_tree, aio);
-
- *funcp = zio_nowait;
- return (aio);
- }
-
- ASSERT(fio->io_vdev_tree == tree);
- vdev_queue_io_remove(vq, fio);
-
- avl_add(&vq->vq_pending_tree, fio);
-
- *funcp = zio_next_stage;
-
- return (fio);
-}
-
-/*
- * Enqueue a read or write zio on its vdev's queue and decide what, if
- * anything, the caller should issue now.
- *
- * A zio flagged ZIO_FLAG_DONT_QUEUE is returned untouched. Otherwise the
- * zio is flagged DONT_CACHE | DONT_QUEUE, given a deadline and queued,
- * and one i/o is selected against the zfs_vdev_min_pending limit.
- *
- * Returns the zio the caller should proceed with. Returns NULL when
- * nothing may be issued yet, or when the selection was an aggregate that
- * has already been issued here.
- */
-zio_t *
-vdev_queue_io(zio_t *zio)
-{
-	vdev_queue_t *queue = &zio->io_vd->vdev_queue;
-	zio_issue_func_t *issue;
-	zio_t *next;
-
-	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
-
-	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
-		return (zio);
-
-	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
-
-	zio->io_vdev_tree = (zio->io_type == ZIO_TYPE_READ) ?
-	    &queue->vq_read_tree : &queue->vq_write_tree;
-
-	mutex_enter(&queue->vq_lock);
-
-	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
-	    zio->io_priority;
-
-	vdev_queue_io_add(queue, zio);
-
-	next = vdev_queue_io_to_issue(queue, zfs_vdev_min_pending, &issue);
-
-	mutex_exit(&queue->vq_lock);
-
-	/* Aggregates are issued right here; anything else goes to the caller. */
-	if (next != NULL && issue == zio_nowait) {
-		issue(next);
-		return (NULL);
-	}
-
-	return (next);
-}
-
-/*
- * Called when a queued i/o completes. Removes it from the pending tree,
- * then issues up to zfs_vdev_ramp_rate further i/os selected against the
- * zfs_vdev_max_pending limit. The queue lock is dropped around each
- * issue and retaken afterwards.
- */
-void
-vdev_queue_io_done(zio_t *zio)
-{
-	vdev_queue_t *queue = &zio->io_vd->vdev_queue;
-	zio_issue_func_t *issue;
-	zio_t *next;
-	int issued = 0;
-
-	mutex_enter(&queue->vq_lock);
-
-	avl_remove(&queue->vq_pending_tree, zio);
-
-	while (issued < zfs_vdev_ramp_rate) {
-		next = vdev_queue_io_to_issue(queue, zfs_vdev_max_pending,
-		    &issue);
-		if (next == NULL)
-			break;
-
-		mutex_exit(&queue->vq_lock);
-		if (issue == zio_next_stage)
-			zio_vdev_io_reissue(next);
-		issue(next);
-		mutex_enter(&queue->vq_lock);
-
-		issued++;
-	}
-
-	mutex_exit(&queue->vq_lock);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
deleted file mode 100644
index 0c86630..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ /dev/null
@@ -1,1237 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/fs/zfs.h>
-#include <sys/fm/fs/zfs.h>
-
-/*
- * Virtual device vector for RAID-Z.
- *
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressible in a single byte. Briefly, the operations on
- * the field are defined as follows:
- *
- * o addition (+) is represented by a bitwise XOR
- * o subtraction (-) is therefore identical to addition: A + B = A - B
- * o multiplication of A by 2 is defined by the following bitwise expression:
- * (A * 2)_7 = A_6
- * (A * 2)_6 = A_5
- * (A * 2)_5 = A_4
- * (A * 2)_4 = A_3 + A_7
- * (A * 2)_3 = A_2 + A_7
- * (A * 2)_2 = A_1 + A_7
- * (A * 2)_1 = A_0
- * (A * 2)_0 = A_7
- *
- * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
- *
- * Observe that any number in the field (except for 0) can be expressed as a
- * power of 2 -- a generator for the field. We store a table of the powers of
- * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
- * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
- *
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
- *
- * P = D_0 + D_1 + ... + D_n-2 + D_n-1
- * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
- * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
- *
- * See the reconstruction code below for how P and Q can be used individually or
- * in concert to recover missing data columns.
- */
-
-/*
- * One column of a RAID-Z I/O: which child device it targets, where on
- * that device, how much data, the buffer used, and the outcome of the I/O.
- */
-typedef struct raidz_col {
- uint64_t rc_devidx; /* child device index for I/O */
- uint64_t rc_offset; /* device offset */
- uint64_t rc_size; /* I/O size */
- void *rc_data; /* I/O data */
- int rc_error; /* I/O error for this device */
- uint8_t rc_tried; /* Did we attempt this I/O column? */
- uint8_t rc_skipped; /* Did we skip this I/O column? */
-} raidz_col_t;
-
-/*
- * Layout of one RAID-Z I/O across columns; stored in zio->io_vsd.
- * Columns [0, rm_firstdatacol) are parity, the rest are data. rm_col[]
- * is declared with one element but the map is allocated with room for
- * rm_cols entries (see vdev_raidz_map_alloc()).
- */
-typedef struct raidz_map {
- uint64_t rm_cols; /* Column count */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
-} raidz_map_t;
-
-/* Presumably the rm_col[] indexes of the P and Q parity columns. */
-#define VDEV_RAIDZ_P 0
-#define VDEV_RAIDZ_Q 1
-
-/* Largest parity count handled here (P alone, or P and Q). */
-#define VDEV_RAIDZ_MAXPARITY 2
-
-/*
- * Multiply a by 2 in the Galois field described above: shift left by one
- * and, if the high bit (0x80) was set, XOR in 0x1d. The argument is
- * evaluated twice, and the result is not truncated to eight bits here.
- */
-#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
-
-/*
- * These two tables represent powers and logs of 2 in the Galois field defined
- * above. These values were computed by repeatedly multiplying by 2 as above.
- *
- * vdev_raidz_pow2[i] is 2^i in the field. The sequence has period 255, so
- * entry 255 wraps around to 0x01, the same value as entry 0.
- */
-static const uint8_t vdev_raidz_pow2[256] = {
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
- 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
- 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
- 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
- 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
- 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
- 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
- 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
- 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
- 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
- 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
- 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
- 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
- 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
- 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
- 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
- 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
- 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
- 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
- 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
- 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
- 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
- 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
- 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
- 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
- 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
- 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
- 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
- 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
- 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
- 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
- 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
-};
-/*
- * vdev_raidz_log2[a] is the exponent i for which vdev_raidz_pow2[i] == a.
- * Zero has no logarithm; entry 0 is a 0x00 placeholder and callers such as
- * vdev_raidz_exp2() handle a == 0 before consulting the table.
- */
-static const uint8_t vdev_raidz_log2[256] = {
- 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
- 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
- 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
- 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
- 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
- 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
- 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
- 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
- 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
- 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
- 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
- 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
- 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
- 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
- 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
- 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
- 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
- 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
- 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
- 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
- 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
- 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
- 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
- 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
- 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
- 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
- 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
- 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
- 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
- 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
- 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
- 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
-};
-
-/*
- * Return a * 2^exp in the Galois field, computed through the log and
- * power tables as 2^(log2(a) + exp). Zero times anything is zero. The
- * exponent sum is reduced by 255 (the period of the power table) when it
- * exceeds 255.
- */
-static uint8_t
-vdev_raidz_exp2(uint_t a, int exp)
-{
-	int sum;
-
-	if (a == 0)
-		return (0);
-
-	ASSERT(exp >= 0);
-	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
-
-	sum = exp + vdev_raidz_log2[a];
-
-	return (vdev_raidz_pow2[sum > 255 ? sum - 255 : sum]);
-}
-
-/*
- * Build the raidz_map_t describing how this zio is spread over columns
- * and store it in zio->io_vsd.
- *
- * unit_shift is the log2 of the unit used to convert io_offset and
- * io_size into unit counts. dcols is the total number of columns
- * (presumably the child count of the raidz vdev -- confirm against the
- * callers), and nparity is how many of those hold parity.
- *
- * Parity columns get freshly allocated buffers. Data columns point at
- * consecutive slices of zio->io_data. The returned map is allocated with
- * kmem_alloc(), so every field used later is set explicitly below.
- */
-static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
-{
- raidz_map_t *rm;
- /* b: starting unit; s: size in units; f: first column; o: row offset */
- uint64_t b = zio->io_offset >> unit_shift;
- uint64_t s = zio->io_size >> unit_shift;
- uint64_t f = b % dcols;
- uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, coff, devidx;
-
- /*
- * q: units every column gets; r: leftover data units;
- * bc: number of columns (parity included) that get one extra unit.
- */
- q = s / (dcols - nparity);
- r = s - q * (dcols - nparity);
- bc = (r == 0 ? 0 : r + nparity);
-
- /* An I/O shorter than one full row only touches bc columns. */
- acols = (q == 0 ? bc : dcols);
-
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
-
- rm->rm_cols = acols;
- rm->rm_bigcols = bc;
- rm->rm_asize = 0;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
- rm->rm_firstdatacol = nparity;
-
- for (c = 0; c < acols; c++) {
- col = f + c;
- coff = o;
- /* Wrapping past the last device moves to the next row. */
- if (col >= dcols) {
- col -= dcols;
- coff += 1ULL << unit_shift;
- }
- rm->rm_col[c].rc_devidx = col;
- rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
- rm->rm_col[c].rc_data = NULL;
- rm->rm_col[c].rc_error = 0;
- rm->rm_col[c].rc_tried = 0;
- rm->rm_col[c].rc_skipped = 0;
- rm->rm_asize += rm->rm_col[c].rc_size;
- }
-
- rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
-
- for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
-
- /* c now indexes the first data column, which starts at io_data. */
- rm->rm_col[c].rc_data = zio->io_data;
-
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
-
- /*
- * If all data stored spans all columns, there's a danger that parity
- * will always be on the same device and, since parity isn't read
- * during normal operation, that that device's I/O bandwidth won't be
- * used effectively. We therefore switch the parity every 1MB.
- *
- * ... at least that was, ostensibly, the theory. As a practical
- * matter unless we juggle the parity between all devices evenly, we
- * won't see any benefit. Further, occasional writes that aren't a
- * multiple of the LCM of the number of children and the minimum
- * stripe width are sufficient to avoid pessimal behavior.
- * Unfortunately, this decision created an implicit on-disk format
- * requirement that we need to support for all eternity, but only
- * for single-parity RAID-Z.
- */
- ASSERT(rm->rm_cols >= 2);
- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
-
- /*
- * Single parity only: when bit 20 of the I/O offset is set, swap the
- * device and offset of columns 0 and 1 (buffers and sizes stay put).
- */
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
- devidx = rm->rm_col[0].rc_devidx;
- o = rm->rm_col[0].rc_offset;
- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- rm->rm_col[1].rc_devidx = devidx;
- rm->rm_col[1].rc_offset = o;
- }
-
- zio->io_vsd = rm;
- return (rm);
-}
-
-static void
-vdev_raidz_map_free(zio_t *zio)
-{
- raidz_map_t *rm = zio->io_vsd;
- int c;
-
- for (c = 0; c < rm->rm_firstdatacol; c++)
- zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
-
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
- zio->io_vsd = NULL;
-}
-
-static void
-vdev_raidz_generate_parity_p(raidz_map_t *rm)
-{
- uint64_t *p, *src, pcount, ccount, i;
- int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
- if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
- *p = *src;
- }
- } else {
- ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, p++, src++) {
- *p ^= *src;
- }
- }
- }
-}
-
-static void
-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
-{
- uint64_t *q, *p, *src, pcount, ccount, mask, i;
- int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
- if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount || ccount == 0);
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- *q = *src;
- *p = *src;
- }
- for (; i < pcount; i++, p++, q++, src++) {
- *q = 0;
- *p = 0;
- }
- } else {
- ASSERT(ccount <= pcount);
-
- /*
- * Rather than multiplying each byte individually (as
- * described above), we are able to handle 8 at once
- * by generating a mask based on the high bit in each
- * byte and using that to conditionally XOR in 0x1d.
- */
- for (i = 0; i < ccount; i++, p++, q++, src++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- *q ^= *src;
- *p ^= *src;
- }
-
- /*
- * Treat short columns as though they are full of 0s.
- */
- for (; i < pcount; i++, q++) {
- mask = *q & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- }
- }
- }
-}
-
-static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
-{
- uint64_t *dst, *src, xcount, ccount, count, i;
- int c;
-
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
- ASSERT(xcount > 0);
-
- src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- dst = rm->rm_col[x].rc_data;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst = *src;
- }
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- continue;
-
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
-
- for (i = 0; i < count; i++, dst++, src++) {
- *dst ^= *src;
- }
- }
-}
-
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
-{
- uint64_t *dst, *src, xcount, ccount, count, mask, i;
- uint8_t *b;
- int c, j, exp;
-
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- ccount = 0;
- else
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
-
- count = MIN(ccount, xcount);
-
- if (c == rm->rm_firstdatacol) {
- for (i = 0; i < count; i++, dst++, src++) {
- *dst = *src;
- }
- for (; i < xcount; i++, dst++) {
- *dst = 0;
- }
-
- } else {
- /*
- * For an explanation of this, see the comment in
- * vdev_raidz_generate_parity_pq() above.
- */
- for (i = 0; i < count; i++, dst++, src++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- *dst ^= *src;
- }
-
- for (; i < xcount; i++, dst++) {
- mask = *dst & 0x8080808080808080ULL;
- mask = (mask << 1) - (mask >> 7);
- *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
- (mask & 0x1d1d1d1d1d1d1d1dULL);
- }
- }
- }
-
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- dst = rm->rm_col[x].rc_data;
- exp = 255 - (rm->rm_cols - 1 - x);
-
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst ^= *src;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, exp);
- }
- }
-}
-
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
-{
- uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
- void *pdata, *qdata;
- uint64_t xsize, ysize, i;
-
- ASSERT(x < y);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(y < rm->rm_cols);
-
- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
-
- /*
- * Move the parity data aside -- we're going to compute parity as
- * though columns x and y were full of zeros -- Pxy and Qxy. We want to
- * reuse the parity generation mechanism without trashing the actual
- * parity so we make those columns appear to be full of zeros by
- * setting their lengths to zero.
- */
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xsize = rm->rm_col[x].rc_size;
- ysize = rm->rm_col[y].rc_size;
-
- rm->rm_col[VDEV_RAIDZ_P].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
- rm->rm_col[VDEV_RAIDZ_Q].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
- rm->rm_col[x].rc_size = 0;
- rm->rm_col[y].rc_size = 0;
-
- vdev_raidz_generate_parity_pq(rm);
-
- rm->rm_col[x].rc_size = xsize;
- rm->rm_col[y].rc_size = ysize;
-
- p = pdata;
- q = qdata;
- pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xd = rm->rm_col[x].rc_data;
- yd = rm->rm_col[y].rc_data;
-
- /*
- * We now have:
- * Pxy = P + D_x + D_y
- * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
- *
- * We can then solve for D_x:
- * D_x = A * (P + Pxy) + B * (Q + Qxy)
- * where
- * A = 2^(x - y) * (2^(x - y) + 1)^-1
- * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
- *
- * With D_x in hand, we can easily solve for D_y:
- * D_y = P + Pxy + D_x
- */
-
- a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
- tmp = 255 - vdev_raidz_log2[a ^ 1];
-
- aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
- bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
-
- for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
- vdev_raidz_exp2(*q ^ *qxy, bexp);
-
- if (i < ysize)
- *yd = *p ^ *pxy ^ *xd;
- }
-
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
- rm->rm_col[VDEV_RAIDZ_P].rc_size);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
-
- /*
- * Restore the saved parity data.
- */
- rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
-}
-
-
-static int
-vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
-{
- vdev_t *cvd;
- uint64_t nparity = vd->vdev_nparity;
- int c, error;
- int lasterror = 0;
- int numerrors = 0;
-
- ASSERT(nparity > 0);
-
- if (nparity > VDEV_RAIDZ_MAXPARITY ||
- vd->vdev_children < nparity + 1) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
-
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
- numerrors++;
- continue;
- }
-
- *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
- }
-
- *asize *= vd->vdev_children;
-
- if (numerrors > nparity) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- return (0);
-}
-
-static void
-vdev_raidz_close(vdev_t *vd)
-{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize)
-{
- uint64_t asize;
- uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
-
- asize = ((psize - 1) >> ashift) + 1;
- asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
- asize = roundup(asize, nparity + 1) << ashift;
-
- return (asize);
-}
-
-static void
-vdev_raidz_child_done(zio_t *zio)
-{
- raidz_col_t *rc = zio->io_private;
-
- rc->rc_error = zio->io_error;
- rc->rc_tried = 1;
- rc->rc_skipped = 0;
-}
-
-static void
-vdev_raidz_repair_done(zio_t *zio)
-{
- ASSERT(zio->io_private == zio->io_parent);
- vdev_raidz_map_free(zio->io_private);
-}
-
-static void
-vdev_raidz_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- blkptr_t *bp = zio->io_bp;
- raidz_map_t *rm;
- raidz_col_t *rc;
- int c;
-
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
-
- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * Generate RAID parity in the first virtual columns.
- */
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
- zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_raidz_child_done, rc));
- }
- zio_wait_children_done(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
-
- /*
- * Iterate over the columns in reverse order so that we hit the parity
- * last -- any errors along the way will force us to read the parity
- * data.
- */
- for (c = rm->rm_cols - 1; c >= 0; c--) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- if (vdev_is_dead(cvd)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
- else
- rm->rm_missingparity++;
- rc->rc_error = ENXIO;
- rc->rc_tried = 1; /* don't even try */
- rc->rc_skipped = 1;
- continue;
- }
- if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
- else
- rm->rm_missingparity++;
- rc->rc_error = ESTALE;
- rc->rc_skipped = 1;
- continue;
- }
- if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
- (zio->io_flags & ZIO_FLAG_SCRUB)) {
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
- zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_raidz_child_done, rc));
- }
- }
-
- zio_wait_children_done(zio);
-}
-
-/*
- * Report a checksum error for a child of a RAID-Z device.
- */
-static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
-{
- vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
- dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
- vdev_description(vd));
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
-}
-
-/*
- * Generate the parity from the data columns. If we tried and were able to
- * read the parity without error, verify that the generated parity matches the
- * data we read. If it doesn't, we fire off a checksum error. Return the
- * number such failures.
- */
-static int
-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
-{
- void *orig[VDEV_RAIDZ_MAXPARITY];
- int c, ret = 0;
- raidz_col_t *rc;
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
- if (!rc->rc_tried || rc->rc_error != 0)
- continue;
- orig[c] = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig[c], rc->rc_size);
- }
-
- if (rm->rm_firstdatacol == 1)
- vdev_raidz_generate_parity_p(rm);
- else
- vdev_raidz_generate_parity_pq(rm);
-
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
- if (!rc->rc_tried || rc->rc_error != 0)
- continue;
- if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- ret++;
- }
- zio_buf_free(orig[c], rc->rc_size);
- }
-
- return (ret);
-}
-
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
-
-static void
-vdev_raidz_io_done(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc, *rc1;
- int unexpected_errors = 0;
- int parity_errors = 0;
- int parity_untried = 0;
- int data_errors = 0;
- int n, c, c1;
-
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
-
- zio->io_error = 0;
- zio->io_numerrors = 0;
-
- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
-
- /*
- * We preserve any EIOs because those may be worth retrying;
- * whereas ECKSUM and ENXIO are more likely to be persistent.
- */
- if (rc->rc_error) {
- if (zio->io_error != EIO)
- zio->io_error = rc->rc_error;
-
- if (c < rm->rm_firstdatacol)
- parity_errors++;
- else
- data_errors++;
-
- if (!rc->rc_skipped)
- unexpected_errors++;
-
- zio->io_numerrors++;
- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
- parity_untried++;
- }
- }
-
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * If this is not a failfast write, and we were able to
- * write enough columns to reconstruct the data, good enough.
- */
- /* XXPOLICY */
- if (zio->io_numerrors <= rm->rm_firstdatacol &&
- !(zio->io_flags & ZIO_FLAG_FAILFAST))
- zio->io_error = 0;
-
- vdev_raidz_map_free(zio);
- zio_next_stage(zio);
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
- /*
- * There are three potential phases for a read:
- * 1. produce valid data from the columns read
- * 2. read all disks and try again
- * 3. perform combinatorial reconstruction
- *
- * Each phase is progressively both more expensive and less likely to
- * occur. If we encounter more errors than we can repair or all phases
- * fail, we have no choice but to return an error.
- */
-
- /*
- * If the number of errors we saw was correctable -- less than or equal
- * to the number of parity disks read -- attempt to produce data that
- * has a valid checksum. Naturally, this case applies in the absence of
- * any errors.
- */
- if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
- switch (data_errors) {
- case 0:
- if (zio_checksum_error(zio) == 0) {
- zio->io_error = 0;
-
- /*
- * If we read parity information (unnecessarily
- * as it happens since no reconstruction was
- * needed) regenerate and verify the parity.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors + parity_untried <
- rm->rm_firstdatacol ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
- goto done;
- }
- break;
-
- case 1:
- /*
- * We either attempt to read all the parity columns or
- * none of them. If we didn't try to read parity, we
- * wouldn't be here in the correctable case. There must
- * also have been fewer parity errors than parity
- * columns or, again, we wouldn't be in this code path.
- */
- ASSERT(parity_untried == 0);
- ASSERT(parity_errors < rm->rm_firstdatacol);
-
- /*
- * Find the column that reported the error.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- vdev_raidz_reconstruct_p(rm, c);
- } else {
- ASSERT(rm->rm_firstdatacol > 1);
- vdev_raidz_reconstruct_q(rm, c);
- }
-
- if (zio_checksum_error(zio) == 0) {
- zio->io_error = 0;
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
- atomic_inc_64(&raidz_corrected_p);
- else
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If there's more than one parity disk that
- * was successfully read, confirm that the
- * other parity disk produced the correct data.
- * This routine is suboptimal in that it
- * regenerates both the parity we wish to test
- * as well as the parity we just used to
- * perform the reconstruction, but this should
- * be a relatively uncommon case, and can be
- * optimized if it becomes a problem.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors < rm->rm_firstdatacol - 1 ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
-
- goto done;
- }
- break;
-
- case 2:
- /*
- * Two data column errors require double parity.
- */
- ASSERT(rm->rm_firstdatacol == 2);
-
- /*
- * Find the two columns that reported errors.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- for (c1 = c++; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0)
- break;
- }
- ASSERT(c != rm->rm_cols);
- ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
- rc->rc_error == ESTALE);
-
- vdev_raidz_reconstruct_pq(rm, c1, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_pq);
-
- goto done;
- }
- break;
-
- default:
- ASSERT(rm->rm_firstdatacol <= 2);
- ASSERT(0);
- }
- }
-
- /*
- * This isn't a typical situation -- either we got a read error or
- * a child silently returned bad data. Read every block so we can
- * try again with as much data and parity as we can track down. If
- * we've already been through once before, all children will be marked
- * as tried so we'll proceed to combinatorial reconstruction.
- */
- unexpected_errors = 1;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
-
- for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
- continue;
-
- zio->io_error = 0;
- zio_vdev_io_redone(zio);
- do {
- rc = &rm->rm_col[c];
- if (rc->rc_tried)
- continue;
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_data, rc->rc_size,
- zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
- vdev_raidz_child_done, rc));
- } while (++c < rm->rm_cols);
- dprintf("rereading\n");
- zio_wait_children_done(zio);
- return;
- }
-
- /*
- * At this point we've attempted to reconstruct the data given the
- * errors we detected, and we've attempted to read all columns. There
- * must, therefore, be one or more additional problems -- silent errors
- * resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. Before we attempt combinatorial reconstruction make
- * sure we have a chance of coming up with the right answer.
- */
- if (zio->io_numerrors >= rm->rm_firstdatacol) {
- ASSERT(zio->io_error != 0);
- goto done;
- }
-
- if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from parity P.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_p(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_p);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from parity Q.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- void *orig;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
- vdev_raidz_reconstruct_q(rm, c);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_q);
-
- /*
- * If this child didn't know that it returned
- * bad data, inform it.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- rc->rc_error = ECKSUM;
- goto done;
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- if (rm->rm_firstdatacol > 1 &&
- rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
- rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
- /*
- * Attempt to reconstruct the data from both P and Q.
- */
- for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
- void *orig, *orig1;
- rc = &rm->rm_col[c];
-
- orig = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig, rc->rc_size);
-
- for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
- rc1 = &rm->rm_col[c1];
-
- orig1 = zio_buf_alloc(rc1->rc_size);
- bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
- vdev_raidz_reconstruct_pq(rm, c, c1);
-
- if (zio_checksum_error(zio) == 0) {
- zio_buf_free(orig, rc->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- zio->io_error = 0;
- atomic_inc_64(&raidz_corrected_pq);
-
- /*
- * If these children didn't know they
- * returned bad data, inform them.
- */
- if (rc->rc_tried && rc->rc_error == 0)
- raidz_checksum_error(zio, rc);
- if (rc1->rc_tried && rc1->rc_error == 0)
- raidz_checksum_error(zio, rc1);
-
- rc->rc_error = ECKSUM;
- rc1->rc_error = ECKSUM;
-
- goto done;
- }
-
- bcopy(orig1, rc1->rc_data, rc1->rc_size);
- zio_buf_free(orig1, rc1->rc_size);
- }
-
- bcopy(orig, rc->rc_data, rc->rc_size);
- zio_buf_free(orig, rc->rc_size);
- }
- }
-
- /*
- * All combinations failed to checksum. Generate checksum ereports for
- * all children.
- */
- zio->io_error = ECKSUM;
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
- rc->rc_offset, rc->rc_size);
- }
- }
-
-done:
- zio_checksum_verified(zio);
-
- if (zio->io_error == 0 && (spa_mode & FWRITE) &&
- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
- zio_t *rio;
-
- /*
- * Use the good data we have in hand to repair damaged children.
- *
- * We issue all repair I/Os as children of 'rio' to arrange
- * that vdev_raidz_map_free(zio) will be invoked after all
- * repairs complete, but before we advance to the next stage.
- */
- rio = zio_null(zio, zio->io_spa,
- vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- if (rc->rc_error == 0)
- continue;
-
- dprintf("%s resilvered %s @ 0x%llx error %d\n",
- vdev_description(vd),
- vdev_description(cvd),
- zio->io_offset, rc->rc_error);
-
- zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
- ZIO_TYPE_WRITE, zio->io_priority,
- ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_CANFAIL, NULL, NULL));
- }
-
- zio_nowait(rio);
- zio_wait_children_done(zio);
- return;
- }
-
- vdev_raidz_map_free(zio);
- zio_next_stage(zio);
-}
-
-static void
-vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted > vd->vdev_nparity)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-}
-
-vdev_ops_t vdev_raidz_ops = {
- vdev_raidz_open,
- vdev_raidz_close,
- vdev_raidz_asize,
- vdev_raidz_io_start,
- vdev_raidz_io_done,
- vdev_raidz_state_change,
- VDEV_TYPE_RAIDZ, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
deleted file mode 100644
index 0e8752c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/fs/zfs.h>
-
-/*
- * Virtual device vector for the pool's root vdev.
- */
-
-/*
- * We should be able to tolerate one failure with absolutely no damage
- * to our metadata. Two failures will take out space maps, a bunch of
- * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
- * place to live. When we get smarter, we can liberalize this policy.
- * e.g. If we haven't lost two consecutive top-level vdevs, then we are
- * probably fine. Adding bean counters during alloc/free can make this
- * future guesswork more accurate.
- */
-/*ARGSUSED*/
-static int
-too_many_errors(vdev_t *vd, int numerrors)
-{
- return (numerrors > 0);
-}
-
-static int
-vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
-{
- vdev_t *cvd;
- int c, error;
- int lasterror = 0;
- int numerrors = 0;
-
- if (vd->vdev_children == 0) {
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
- return (EINVAL);
- }
-
- for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
-
- if ((error = vdev_open(cvd)) != 0) {
- lasterror = error;
- numerrors++;
- continue;
- }
- }
-
- if (too_many_errors(vd, numerrors)) {
- vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
- return (lasterror);
- }
-
- *asize = 0;
- *ashift = 0;
-
- return (0);
-}
-
-static void
-vdev_root_close(vdev_t *vd)
-{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
-}
-
-static void
-vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (too_many_errors(vd, faulted))
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
-}
-
-vdev_ops_t vdev_root_ops = {
- vdev_root_open,
- vdev_root_close,
- vdev_default_asize,
- NULL, /* io_start - not applicable to the root */
- NULL, /* io_done - not applicable to the root */
- vdev_root_state_change,
- VDEV_TYPE_ROOT, /* name of this vdev type */
- B_FALSE /* not a leaf vdev */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c
deleted file mode 100644
index 4246ec0..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ /dev/null
@@ -1,1071 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
-/*
- * This file contains the top half of the zfs directory structure
- * implementation. The bottom half is in zap_leaf.c.
- *
- * The zdir is an extendable hash data structure. There is a table of
- * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
- * each a constant size and hold a variable number of directory entries.
- * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
- *
- * The pointer table holds a power-of-2 number of pointers
- * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
- * by the pointer at index i in the table holds entries whose hash value
- * has a zd_prefix_len-bit prefix equal to i.
- */
-
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/zfs_znode.h>
-
-/* Default fat-zap block shift: 1 << 14 bytes per block. */
-int fzap_default_block_shift = 14; /* 16k blocksize */
-
-/* Forward declarations for helpers that are used before their definition. */
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
-static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
-
-
-/*
- * Byteswap one fat-zap block.  The first 64-bit word of the buffer is the
- * block type: a leaf block (seen in either byte order) is handed to
- * zap_leaf_byteswap(); every other block is swapped as a plain array of
- * 64-bit words.
- */
-void
-fzap_byteswap(void *vbuf, size_t size)
-{
-	uint64_t tag = *(uint64_t *)vbuf;
-
-	if (tag != ZBT_LEAF && tag != BSWAP_64(ZBT_LEAF)) {
-		/* it's a ptrtbl block */
-		byteswap_uint64_array(vbuf, size);
-		return;
-	}
-
-	zap_leaf_byteswap(vbuf, size);
-}
-
-/*
- * Turn the zap behind 'zap' into a fat zap in place.
- *
- * The caller must hold zap_rwlock as writer.  The header buffer is zeroed
- * and rewritten as a ZBT_HEADER block whose embedded pointer table points
- * every entry at block 1; block 1 is then initialized as the first (empty)
- * leaf.  Existing contents of the header buffer are discarded, not migrated.
- */
-void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
-{
- dmu_buf_t *db;
- zap_leaf_t *l;
- int i;
- zap_phys_t *zp;
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- zap->zap_ismicro = FALSE;
-
- /* Re-register the buffer user so zap_f.zap_phys tracks the header data. */
- (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
- &zap->zap_f.zap_phys, zap_evict);
-
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL, MUTEX_DEFAULT, 0);
- /* Block shift is derived from the size of the existing header buffer. */
- zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
-
- zp = zap->zap_f.zap_phys;
- /*
- * explicitly zero it since it might be coming from an
- * initialized microzap
- */
- bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
- zp->zap_block_type = ZBT_HEADER;
- zp->zap_magic = ZAP_MAGIC;
-
- zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
-
- zp->zap_freeblk = 2; /* block 1 will be the first leaf */
- zp->zap_num_leafs = 1;
- zp->zap_num_entries = 0;
- zp->zap_salt = zap->zap_salt;
-
- /* block 1 will be the first leaf */
- for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
- ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
-
- /*
- * set up block 1 - the first leaf
- */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
- dmu_buf_will_dirty(db, tx);
-
- /* Temporary in-core leaf, used only to run zap_leaf_init() on block 1. */
- l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
- l->l_dbuf = db;
- l->l_phys = db->db_data;
-
- zap_leaf_init(l);
-
- kmem_free(l, sizeof (zap_leaf_t));
- dmu_buf_rele(db, FTAG);
-}
-
-/*
- * Try to end up holding zap_rwlock as writer without blocking.
- *
- * Returns 1 if the lock was already write-held or was upgraded (in which
- * case the header buffer is also dirtied in tx), 0 if the upgrade failed.
- */
-static int
-zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
-{
-	/* Already the writer: nothing to do. */
-	if (RW_WRITE_HELD(&zap->zap_rwlock))
-		return (1);
-
-	if (!rw_tryupgrade(&zap->zap_rwlock))
-		return (0);
-
-	dmu_buf_will_dirty(zap->zap_dbuf, tx);
-	return (1);
-}
-
-/*
- * Generic routines for dealing with the pointer & cookie tables.
- */
-
-/*
- * Perform one step of doubling the on-disk table 'tbl'.
- *
- * The first call of a grow cycle allocates twice zt_numblks new blocks and
- * records their start in zt_nextblk.  Every call then copies one old block
- * (index zt_blks_copied) into two new blocks via transfer_func, half of the
- * old entries per new block.  When the last old block has been copied, the
- * old range is freed and the table is switched to the new blocks.
- *
- * The caller must hold zap_rwlock as writer.  Returns 0, or the error from
- * holding the old block; failures to hold the new blocks trip VERIFY.
- */
-static int
-zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
- void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
- dmu_tx_t *tx)
-{
- uint64_t b, newblk;
- dmu_buf_t *db_old, *db_new;
- int err;
- int bs = FZAP_BLOCK_SHIFT(zap);
- int hepb = 1<<(bs-4);
- /* hepb = half the number of entries in a block */
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(tbl->zt_blk != 0);
- ASSERT(tbl->zt_numblks > 0);
-
- if (tbl->zt_nextblk != 0) {
- /* A grow cycle is already in progress; continue into its blocks. */
- newblk = tbl->zt_nextblk;
- } else {
- newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
- tbl->zt_nextblk = newblk;
- ASSERT3U(tbl->zt_blks_copied, ==, 0);
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs);
- }
-
- /*
- * Copy the ptrtbl from the old to new location.
- */
-
- b = tbl->zt_blks_copied;
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs, FTAG, &db_old);
- if (err)
- return (err);
-
- /* first half of entries in old[b] go to new[2*b+0] */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs, FTAG, &db_new));
- dmu_buf_will_dirty(db_new, tx);
- transfer_func(db_old->db_data, db_new->db_data, hepb);
- dmu_buf_rele(db_new, FTAG);
-
- /* second half of entries in old[b] go to new[2*b+1] */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs, FTAG, &db_new));
- dmu_buf_will_dirty(db_new, tx);
- transfer_func((uint64_t *)db_old->db_data + hepb,
- db_new->db_data, hepb);
- dmu_buf_rele(db_new, FTAG);
-
- dmu_buf_rele(db_old, FTAG);
-
- tbl->zt_blks_copied++;
-
- dprintf("copied block %llu of %llu\n",
- tbl->zt_blks_copied, tbl->zt_numblks);
-
- if (tbl->zt_blks_copied == tbl->zt_numblks) {
- /* All old blocks copied: free them and switch to the new table. */
- (void) dmu_free_range(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
-
- tbl->zt_blk = newblk;
- tbl->zt_numblks *= 2;
- tbl->zt_shift++;
- tbl->zt_nextblk = 0;
- tbl->zt_blks_copied = 0;
-
- /*
- * NOTE(review): 1<<(...) has type int but is printed with %llu;
- * confirm whether dprintf tolerates this.
- */
- dprintf("finished; numblocks now %llu (%lluk entries)\n",
- tbl->zt_numblks, 1<<(tbl->zt_shift-10));
- }
-
- return (0);
-}
-
-/*
- * Store 'val' at entry 'idx' of the on-disk table 'tbl'.
- *
- * While a grow cycle is in progress (zt_nextblk != 0) the value is also
- * written to the two entries, 2*idx and 2*idx+1, that replace this entry in
- * the new table, so both copies stay in sync.
- *
- * Returns 0, or the error from holding either block.  When the new-table
- * block cannot be held, the old-table entry is left unwritten.
- */
-static int
-zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
- dmu_tx_t *tx)
-{
- int err;
- uint64_t blk, off;
- int bs = FZAP_BLOCK_SHIFT(zap);
- dmu_buf_t *db;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT(tbl->zt_blk != 0);
-
- dprintf("storing %llx at index %llx\n", val, idx);
-
- /* Each block holds 1 << (bs-3) eight-byte entries. */
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs, FTAG, &db);
- if (err)
- return (err);
- dmu_buf_will_dirty(db, tx);
-
- if (tbl->zt_nextblk != 0) {
- uint64_t idx2 = idx * 2;
- uint64_t blk2 = idx2 >> (bs-3);
- uint64_t off2 = idx2 & ((1<<(bs-3))-1);
- dmu_buf_t *db2;
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
- if (err) {
- dmu_buf_rele(db, FTAG);
- return (err);
- }
- dmu_buf_will_dirty(db2, tx);
- ((uint64_t *)db2->db_data)[off2] = val;
- ((uint64_t *)db2->db_data)[off2+1] = val;
- dmu_buf_rele(db2, FTAG);
- }
-
- ((uint64_t *)db->db_data)[off] = val;
- dmu_buf_rele(db, FTAG);
-
- return (0);
-}
-
-/*
- * Load entry 'idx' of the on-disk table 'tbl' into *valp.
- *
- * While a grow cycle is in progress (zt_nextblk != 0) the matching block of
- * the new table is held as well, purely so that an i/o error which would
- * later hit zap_table_store() is reported here instead.
- *
- * The caller must hold zap_rwlock.  Returns 0 or the error from
- * dmu_buf_hold(); *valp may already have been written when the second hold
- * fails.
- */
-static int
-zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
-{
-	uint64_t blk, off;
-	int err;
-	dmu_buf_t *db;
-	int bs = FZAP_BLOCK_SHIFT(zap);
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	/* Each block holds 1 << (bs-3) eight-byte entries. */
-	blk = idx >> (bs-3);
-	off = idx & ((1<<(bs-3))-1);
-
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    (tbl->zt_blk + blk) << bs, FTAG, &db);
-	if (err)
-		return (err);
-	*valp = ((uint64_t *)db->db_data)[off];
-	dmu_buf_rele(db, FTAG);
-
-	if (tbl->zt_nextblk != 0) {
-		/*
-		 * read the nextblk for the sake of i/o error checking,
-		 * so that zap_table_load() will catch errors for
-		 * zap_table_store.
-		 */
-		blk = (idx*2) >> (bs-3);
-
-		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-		    (tbl->zt_nextblk + blk) << bs, FTAG, &db);
-		/*
-		 * Only drop the hold if we actually got one; on failure db
-		 * still refers to the buffer that was released above.
-		 */
-		if (err == 0)
-			dmu_buf_rele(db, FTAG);
-	}
-	return (err);
-}
-
-/*
- * Routines for growing the ptrtbl.
- */
-
-/*
- * Expand n pointer-table entries from src into 2*n entries at dst by
- * writing every source entry twice, into adjacent destination slots.
- */
-static void
-zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
-{
-	int remaining;
-
-	for (remaining = n; remaining > 0; remaining--) {
-		uint64_t ent = *src++;
-
-		*dst++ = ent;
-		*dst++ = ent;
-	}
-}
-
-/*
- * Grow the pointer table by one step.
- *
- * If the table is still embedded in the header block (zt_numblks == 0) it is
- * copied, doubled, into a freshly allocated block of its own.  Otherwise one
- * incremental step of zap_table_grow() is performed.
- *
- * Returns ENOSPC once zt_shift has reached ZAP_HASHBITS-2, the error from
- * holding the new block, or the result of zap_table_grow().
- */
-static int
-zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
-{
- /* In case things go horribly wrong. */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
- return (ENOSPC);
-
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
- /*
- * We are outgrowing the "embedded" ptrtbl (the one
- * stored in the header block). Give it its own entire
- * block, which will double the size of the ptrtbl.
- */
- uint64_t newblk;
- dmu_buf_t *db_new;
- int err;
-
- ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
- ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
-
- /*
- * NOTE(review): if the hold below fails, the block taken from
- * zap_freeblk here is not given back.
- */
- newblk = zap_allocate_blocks(zap, 1);
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
- if (err)
- return (err);
- dmu_buf_will_dirty(db_new, tx);
- zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
- db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- dmu_buf_rele(db_new, FTAG);
-
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
-
- /* The doubled table must exactly fill the one new block. */
- ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
- (FZAP_BLOCK_SHIFT(zap)-3));
-
- return (0);
- } else {
- return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- zap_ptrtbl_transfer, tx));
- }
-}
-
-/*
- * Add 'delta' (which may be negative) to the zap's entry count.  The header
- * buffer is dirtied in tx first; the update itself runs under
- * zap_num_entries_mtx.
- */
-static void
-zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
-{
-	zap_phys_t *zp;
-
-	dmu_buf_will_dirty(zap->zap_dbuf, tx);
-
-	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-	zp = zap->zap_f.zap_phys;
-	/* The count must never be driven below zero. */
-	ASSERT(delta > 0 || zp->zap_num_entries >= -delta);
-	zp->zap_num_entries += delta;
-	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
-}
-
-/*
- * Reserve 'nblocks' consecutive block ids by advancing zap_freeblk.
- * Returns the first reserved id.  The caller must hold zap_rwlock as writer.
- */
-static uint64_t
-zap_allocate_blocks(zap_t *zap, int nblocks)
-{
-	zap_phys_t *zp = zap->zap_f.zap_phys;
-	uint64_t first;
-
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	first = zp->zap_freeblk;
-	zp->zap_freeblk += nblocks;
-
-	return (first);
-}
-
-/*
- * Allocate a new block for a leaf and build its in-core zap_leaf_t.
- *
- * The leaf is returned with l_rwlock held as writer and with a hold on its
- * dbuf (taken with a NULL tag, matching the release in zap_put_leaf()).
- * The caller must hold zap_rwlock as writer.
- */
-static zap_leaf_t *
-zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
-{
- void *winner;
- zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
-
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = zap_allocate_blocks(zap, 1);
- l->l_dbuf = NULL;
- l->l_phys = NULL;
-
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
- /* Attach the leaf to the dbuf; zap_leaf_pageout() frees it on eviction. */
- winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
- /* The block was just allocated, so no other user is expected on it. */
- ASSERT(winner == NULL);
- dmu_buf_will_dirty(l->l_dbuf, tx);
-
- zap_leaf_init(l);
-
- zap->zap_f.zap_phys->zap_num_leafs++;
-
- return (l);
-}
-
-/*
- * Report the number of entries in a fat zap through *count.
- * Always returns 0.
- */
-int
-fzap_count(zap_t *zap, uint64_t *count)
-{
-	uint64_t entries;
-
-	ASSERT(!zap->zap_ismicro);
-
-	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-	entries = zap->zap_f.zap_phys->zap_num_entries;
-	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
-
-	*count = entries;
-	return (0);
-}
-
-/*
- * Routines for obtaining zap_leaf_t's
- */
-
-/*
- * Release a leaf: drop its l_rwlock, then drop the NULL-tagged hold on its
- * dbuf.
- */
-void
-zap_put_leaf(zap_leaf_t *l)
-{
- rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf, NULL);
-}
-
-/*
- * Destroy an in-core leaf: tear down its rwlock and free the zap_leaf_t.
- * Registered through dmu_buf_set_user() as the dbuf user callback, and also
- * called directly by zap_open_leaf() with a NULL db; db is unused.
- */
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
- zap_leaf_t *l = vl;
-
- rw_destroy(&l->l_rwlock);
- kmem_free(l, sizeof (zap_leaf_t));
-}
-
-/*
- * Build the in-core zap_leaf_t for the leaf block held in 'db' and attach
- * it to the dbuf as its user.  If another thread attached one first, ours
- * is destroyed and the winner is returned instead.  The returned leaf's
- * l_rwlock is not held.
- */
-static zap_leaf_t *
-zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
-{
- zap_leaf_t *l, *winner;
-
- ASSERT(blkid != 0);
-
- l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
- rw_init(&l->l_rwlock, NULL, RW_DEFAULT, 0);
- /* Hold the lock as writer while the leaf is being published. */
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = blkid;
- l->l_bs = highbit(db->db_size)-1;
- l->l_dbuf = db;
- l->l_phys = NULL;
-
- winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
-
- rw_exit(&l->l_rwlock);
- if (winner != NULL) {
- /* someone else set it first */
- zap_leaf_pageout(NULL, l);
- l = winner;
- }
-
- /*
- * lh_pad1 was previously used for the next leaf in the leaf
- * chain. There should be no chained leafs (as we have removed
- * support for them).
- */
- ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
-
- /*
- * There should be more hash entries than there can be
- * chunks to put in the hash table
- */
- ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
-
- /* The chunks should begin at the end of the hash table */
- ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
- &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
-
- /* The chunks should end at the end of the block */
- ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
- (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
-
- return (l);
-}
-
-/*
- * Hold leaf block 'blkid' and return its in-core leaf through *lp, with
- * l_rwlock entered in mode 'lt'.  For RW_WRITER the dbuf is also dirtied in
- * tx.  The caller must hold zap_rwlock and releases the leaf with
- * zap_put_leaf().
- *
- * Returns 0, or the error from dmu_buf_hold() (in which case *lp is not
- * written).
- */
-static int
-zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
- zap_leaf_t **lp)
-{
- dmu_buf_t *db;
- zap_leaf_t *l;
- int bs = FZAP_BLOCK_SHIFT(zap);
- int err;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- blkid << bs, NULL, &db);
- if (err)
- return (err);
-
- ASSERT3U(db->db_object, ==, zap->zap_object);
- ASSERT3U(db->db_offset, ==, blkid << bs);
- ASSERT3U(db->db_size, ==, 1 << bs);
- ASSERT(blkid != 0);
-
- /* Reuse the leaf already attached to this dbuf, if there is one. */
- l = dmu_buf_get_user(db);
-
- if (l == NULL)
- l = zap_open_leaf(blkid, db);
-
- rw_enter(&l->l_rwlock, lt);
- /*
- * Must lock before dirtying, otherwise l->l_phys could change,
- * causing ASSERT below to fail.
- */
- if (lt == RW_WRITER)
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(l->l_blkid, ==, blkid);
- ASSERT3P(l->l_dbuf, ==, db);
- ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
- ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- *lp = l;
- return (0);
-}
-
-/*
- * Translate pointer-table index 'idx' into a leaf block id, stored in *valp.
- * The embedded table is read directly; an external table goes through
- * zap_table_load(), whose error is returned.
- */
-static int
-zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
-{
-	zap_table_phys_t *ptrtbl = &zap->zap_f.zap_phys->zap_ptrtbl;
-
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	if (ptrtbl->zt_numblks != 0)
-		return (zap_table_load(zap, ptrtbl, idx, valp));
-
-	/* Table is still embedded in the header block. */
-	ASSERT3U(idx, <, (1ULL << ptrtbl->zt_shift));
-	*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
-	return (0);
-}
-
-/*
- * Point pointer-table entry 'idx' at leaf block 'blk'.  The embedded table
- * (zt_blk == 0) is written directly; an external table goes through
- * zap_table_store(), whose error is returned.  The caller must hold
- * zap_rwlock as writer.
- */
-static int
-zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
-{
-	zap_table_phys_t *ptrtbl = &zap->zap_f.zap_phys->zap_ptrtbl;
-
-	ASSERT(tx != NULL);
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	if (ptrtbl->zt_blk != 0)
-		return (zap_table_store(zap, ptrtbl, idx, blk, tx));
-
-	/* Table is still embedded in the header block. */
-	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
-	return (0);
-}
-
-/*
- * Find the leaf responsible for hash value 'h' and return it through *lp,
- * locked in mode 'lt' (see zap_get_leaf_byblk()).
- *
- * Returns 0, or the error from the pointer-table lookup or the leaf hold.
- */
-static int
-zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
-{
- uint64_t idx, blk;
- int err;
-
- ASSERT(zap->zap_dbuf == NULL ||
- zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
- ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
- /* The pointer-table index is derived from h and the table's zt_shift. */
- idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- err = zap_idx_to_blk(zap, idx, &blk);
- if (err != 0)
- return (err);
- err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
-
- /* On success the leaf's own prefix must match the hash. */
- ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
- (*lp)->l_phys->l_hdr.lh_prefix);
- return (err);
-}
-
-/*
- * Split the full leaf 'l' (which covers 'hash') into two leaves and return,
- * through *lp, whichever of the two now covers 'hash'.
- *
- * If the directory lock cannot be upgraded to writer, or the leaf already
- * uses every pointer-table bit, the leaf and the directory lock are dropped,
- * the directory is re-locked as writer, the pointer table is grown as
- * needed and the leaf is looked up again.  If the leaf was split by someone
- * else in the meantime it is returned as-is.
- *
- * Returns 0 on success or an error from re-locking, growing or i/o.
- *
- * NOTE(review): on the error returns inside the re-lock branch, 'l' has
- * already been released and *lp is not updated, yet fzap_add_cd() and
- * fzap_update() still pass their leaf to zap_put_leaf_maybe_grow_ptrtbl()
- * afterwards -- confirm this cannot release the leaf twice.  The 'zap'
- * returned by zap_lockdir() is also only stored in the local parameter.
- */
-static int
-zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
- zap_leaf_t **lp)
-{
- zap_leaf_t *nl;
- int prefix_diff, i, err;
- uint64_t sibling;
- int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
-
- ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
- ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- l->l_phys->l_hdr.lh_prefix);
-
- if (zap_tryupgradedir(zap, tx) == 0 ||
- old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
- /* We failed to upgrade, or need to grow the pointer table */
- objset_t *os = zap->zap_objset;
- uint64_t object = zap->zap_object;
-
- zap_put_leaf(l);
- zap_unlockdir(zap);
- err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
- if (err)
- return (err);
- ASSERT(!zap->zap_ismicro);
-
- /* Grow until the table has at least one bit more than the leaf uses. */
- while (old_prefix_len ==
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
- err = zap_grow_ptrtbl(zap, tx);
- if (err)
- return (err);
- }
-
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err)
- return (err);
-
- if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
- /* it split while our locks were down */
- *lp = l;
- return (0);
- }
- }
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- l->l_phys->l_hdr.lh_prefix);
-
- /*
- * 'sibling' is the first pointer-table index that will belong to the
- * new leaf; it owns 1 << prefix_diff consecutive entries.
- */
- prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
- (old_prefix_len + 1);
- sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
-
- /* check for i/o errors before doing zap_leaf_split */
- for (i = 0; i < (1ULL<<prefix_diff); i++) {
- uint64_t blk;
- err = zap_idx_to_blk(zap, sibling+i, &blk);
- if (err)
- return (err);
- ASSERT3U(blk, ==, l->l_blkid);
- }
-
- nl = zap_create_leaf(zap, tx);
- zap_leaf_split(l, nl);
-
- /* set sibling pointers */
- for (i = 0; i < (1ULL<<prefix_diff); i++) {
- err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
- ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
- }
-
- /* Pick the half by the hash bit that the longer prefix now examines. */
- if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
- /* we want the sibling */
- zap_put_leaf(l);
- *lp = nl;
- } else {
- zap_put_leaf(nl);
- *lp = l;
- }
-
- return (0);
-}
-
-/*
- * Release leaf 'l' and, if worthwhile, take one pointer-table growth step.
- *
- * A step is attempted when a grow cycle is already in progress
- * (zt_nextblk != 0) or when the leaf already uses every pointer-table bit
- * and has fewer than ZAP_LEAF_LOW_WATER free chunks.  All errors are
- * swallowed; the function returns nothing.
- *
- * NOTE(review): when zap_lockdir() fails after zap_unlockdir(), this returns
- * with the directory lock dropped and no indication to the caller --
- * confirm callers tolerate that.
- */
-static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
-{
- int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
- /* Sample the leaf's state before it is released below. */
- int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
- l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
-
- zap_put_leaf(l);
-
- if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
- int err;
-
- /*
- * We are in the middle of growing the pointer table, or
- * this leaf will soon make us grow it.
- */
- if (zap_tryupgradedir(zap, tx) == 0) {
- objset_t *os = zap->zap_objset;
- uint64_t zapobj = zap->zap_object;
-
- zap_unlockdir(zap);
- err = zap_lockdir(os, zapobj, tx,
- RW_WRITER, FALSE, &zap);
- if (err)
- return;
- }
-
- /* could have finished growing while our locks were down */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
- (void) zap_grow_ptrtbl(zap, tx);
- }
-}
-
-
-/*
- * Validate the name and value dimensions of an attribute.
- *
- * Returns E2BIG if 'name' (when non-NULL) is longer than ZAP_MAXNAMELEN or
- * if the value would exceed ZAP_MAXVALUELEN bytes, EINVAL if integer_size
- * is not 1, 2, 4 or 8, and 0 otherwise.
- */
-static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
-{
-	if (name && strlen(name) > ZAP_MAXNAMELEN)
-		return (E2BIG);
-
-	/* Only integer sizes supported by C */
-	switch (integer_size) {
-	case 1:
-	case 2:
-	case 4:
-	case 8:
-		break;
-	default:
-		return (EINVAL);
-	}
-
-	/*
-	 * Compare by division rather than multiplying: the product
-	 * integer_size * num_integers can wrap around in 64 bits and make
-	 * a huge request look small.  integer_size is non-zero here.
-	 */
-	if (num_integers > ZAP_MAXVALUELEN / integer_size)
-		return (E2BIG);
-
-	return (0);
-}
-
-/*
- * Routines for manipulating attributes.
- */
-/*
- * Look up 'name' in a fat zap and copy its value into 'buf' as
- * num_integers integers of integer_size bytes each.
- *
- * Returns 0 on success, the fzap_checksize() error for bad arguments, or
- * the error from finding the leaf, finding the entry, or reading it.
- */
-int
-fzap_lookup(zap_t *zap, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	zap_entry_handle_t zeh;
-	zap_leaf_t *leaf;
-	uint64_t hash;
-	int rc;
-
-	rc = fzap_checksize(name, integer_size, num_integers);
-	if (rc != 0)
-		return (rc);
-
-	hash = zap_hash(zap, name);
-	rc = zap_deref_leaf(zap, hash, NULL, RW_READER, &leaf);
-	if (rc != 0)
-		return (rc);
-
-	rc = zap_leaf_lookup(leaf, name, hash, &zeh);
-	if (rc != 0) {
-		zap_put_leaf(leaf);
-		return (rc);
-	}
-
-	rc = zap_entry_read(&zeh, integer_size, num_integers, buf);
-	zap_put_leaf(leaf);
-	return (rc);
-}
-
-/*
- * Add a new entry 'name' with collision differentiator 'cd' to a fat zap.
- *
- * Returns EEXIST if the name is already present, 0 on success, or an error
- * from the leaf lookup, entry creation or leaf expansion.  When the leaf is
- * full (EAGAIN from zap_entry_create()) it is expanded and the insert is
- * retried.  Arguments are assumed to have passed fzap_checksize().
- *
- * NOTE(review): if zap_expand_leaf() fails after it has already released
- * the leaf, 'l' is still handed to zap_put_leaf_maybe_grow_ptrtbl() at
- * 'out' -- confirm.
- */
-int
-fzap_add_cd(zap_t *zap, const char *name,
- uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- uint64_t hash;
- int err;
- zap_entry_handle_t zeh;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- ASSERT(!zap->zap_ismicro);
- ASSERT(fzap_checksize(name, integer_size, num_integers) == 0);
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-retry:
- err = zap_leaf_lookup(l, name, hash, &zeh);
- if (err == 0) {
- err = EEXIST;
- goto out;
- }
- if (err != ENOENT)
- goto out;
-
- err = zap_entry_create(l, name, hash, cd,
- integer_size, num_integers, val, &zeh);
-
- if (err == 0) {
- zap_increment_num_entries(zap, 1, tx);
- } else if (err == EAGAIN) {
- /* Leaf is full: split it, then retry on the leaf that covers hash. */
- err = zap_expand_leaf(zap, l, hash, tx, &l);
- if (err == 0)
- goto retry;
- }
-
-out:
- zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
- return (err);
-}
-
-/*
- * Add a new entry to a fat zap after validating the name and value sizes.
- * The entry is created through fzap_add_cd() with ZAP_MAXCD as the
- * collision differentiator.
- */
-int
-fzap_add(zap_t *zap, const char *name,
-    uint64_t integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
-{
-	int rc;
-
-	rc = fzap_checksize(name, integer_size, num_integers);
-	if (rc == 0) {
-		rc = fzap_add_cd(zap, name, integer_size, num_integers,
-		    val, ZAP_MAXCD, tx);
-	}
-
-	return (rc);
-}
-
-/*
- * Set the value of 'name' in a fat zap, creating the entry if it does not
- * exist yet.
- *
- * Returns 0 on success, the fzap_checksize() error for bad arguments, or an
- * error from the leaf lookup, entry create/update or leaf expansion.  When
- * the leaf is full (EAGAIN) it is expanded and the operation is retried.
- *
- * NOTE(review): if zap_expand_leaf() fails after it has already released
- * the leaf, 'l' is still handed to zap_put_leaf_maybe_grow_ptrtbl() --
- * confirm.
- */
-int
-fzap_update(zap_t *zap, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
- zap_leaf_t *l;
- uint64_t hash;
- int err, create;
- zap_entry_handle_t zeh;
-
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = fzap_checksize(name, integer_size, num_integers);
- if (err != 0)
- return (err);
-
- hash = zap_hash(zap, name);
- err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err != 0)
- return (err);
-retry:
- err = zap_leaf_lookup(l, name, hash, &zeh);
- /* A missing entry means we create it instead of updating it. */
- create = (err == ENOENT);
- ASSERT(err == 0 || err == ENOENT);
-
- /* XXX If this leaf is chained, split it if we can. */
-
- if (create) {
- err = zap_entry_create(l, name, hash, ZAP_MAXCD,
- integer_size, num_integers, val, &zeh);
- if (err == 0)
- zap_increment_num_entries(zap, 1, tx);
- } else {
- err = zap_entry_update(&zeh, integer_size, num_integers, val);
- }
-
- if (err == EAGAIN) {
- /* Leaf is full: split it, then retry on the leaf that covers hash. */
- err = zap_expand_leaf(zap, l, hash, tx, &l);
- if (err == 0)
- goto retry;
- }
-
- zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
- return (err);
-}
-
-/*
- * Report the integer size and integer count of entry 'name' in a fat zap.
- * Either output pointer may be NULL if the caller does not need that value.
- *
- * Returns 0 on success, or the error from finding the leaf or the entry.
- */
-int
-fzap_length(zap_t *zap, const char *name,
-    uint64_t *integer_size, uint64_t *num_integers)
-{
-	zap_entry_handle_t zeh;
-	zap_leaf_t *leaf;
-	uint64_t hash;
-	int rc;
-
-	hash = zap_hash(zap, name);
-	rc = zap_deref_leaf(zap, hash, NULL, RW_READER, &leaf);
-	if (rc != 0)
-		return (rc);
-
-	rc = zap_leaf_lookup(leaf, name, hash, &zeh);
-	if (rc == 0) {
-		if (integer_size != NULL)
-			*integer_size = zeh.zeh_integer_size;
-		if (num_integers != NULL)
-			*num_integers = zeh.zeh_num_integers;
-	}
-
-	zap_put_leaf(leaf);
-	return (rc);
-}
-
-/*
- * Remove entry 'name' from a fat zap and decrement the entry count.
- *
- * Returns 0 on success, or the error from finding the leaf or the entry.
- */
-int
-fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
-{
-	zap_entry_handle_t zeh;
-	zap_leaf_t *leaf;
-	uint64_t hash = zap_hash(zap, name);
-	int rc;
-
-	rc = zap_deref_leaf(zap, hash, tx, RW_WRITER, &leaf);
-	if (rc != 0)
-		return (rc);
-
-	rc = zap_leaf_lookup(leaf, name, hash, &zeh);
-	if (rc == 0) {
-		zap_entry_remove(&zeh);
-		zap_increment_num_entries(zap, -1, tx);
-	}
-
-	zap_put_leaf(leaf);
-	dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
-	    zap->zap_objset, zap->zap_object, name, rc);
-	return (rc);
-}
-
-/*
- * Walk every attribute of zap object 'zapobj' and copy into 'name' the name
- * of the first one whose ZFS_DIRENT_OBJ(first integer) equals 'value'.
- *
- * Returns 0 when a match was found, otherwise the error that ended the
- * cursor walk.
- */
-int
-zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
-{
-	zap_attribute_t *attr;
-	zap_cursor_t cursor;
-	int rc;
-
-	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-	zap_cursor_init(&cursor, os, zapobj);
-
-	while ((rc = zap_cursor_retrieve(&cursor, attr)) == 0) {
-		if (ZFS_DIRENT_OBJ(attr->za_first_integer) == value) {
-			/*
-			 * NOTE(review): unbounded copy; presumably callers
-			 * pass a buffer at least as large as za_name --
-			 * confirm.
-			 */
-			(void) strcpy(name, attr->za_name);
-			break;
-		}
-		zap_cursor_advance(&cursor);
-	}
-
-	zap_cursor_fini(&cursor);
-	kmem_free(attr, sizeof (zap_attribute_t));
-	return (rc);
-}
-
-
-/*
- * Routines for iterating over the attributes.
- */
-
-/*
- * Fetch into 'za' the entry at or after the cursor position
- * (zc_hash, zc_cd), advancing across leaves as needed.
- *
- * The cursor caches its current leaf in zc_leaf; the dbuf hold is kept
- * across calls while the leaf's l_rwlock is only held inside this function.
- *
- * Returns 0 with 'za' filled in, ENOENT when there are no more entries, or
- * an error from looking up a leaf.
- */
-int
-fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
-{
- int err = ENOENT;
- zap_entry_handle_t zeh;
- zap_leaf_t *l;
-
- /* retrieve the next entry at or after zc_hash/zc_cd */
- /* if no entry, return ENOENT */
-
- /* Drop a cached leaf whose prefix no longer covers zc_hash. */
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
- zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
- /* zap_put_leaf() exits the lock, so it must be entered first. */
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- }
-
-again:
- if (zc->zc_leaf == NULL) {
- err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
- &zc->zc_leaf);
- if (err != 0)
- return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- }
- l = zc->zc_leaf;
-
- err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
-
- if (err == ENOENT) {
- /* Nothing left in this leaf: move to the first hash past its range. */
- uint64_t nocare =
- (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
- zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
- zc->zc_cd = 0;
- if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
- /* Only leaf, or the hash wrapped: park the cursor at the end. */
- zc->zc_hash = -1ULL;
- } else {
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- goto again;
- }
- }
-
- if (err == 0) {
- zc->zc_hash = zeh.zeh_hash;
- zc->zc_cd = zeh.zeh_cd;
- za->za_integer_length = zeh.zeh_integer_size;
- za->za_num_integers = zeh.zeh_num_integers;
- if (zeh.zeh_num_integers == 0) {
- za->za_first_integer = 0;
- } else {
- err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
- ASSERT(err == 0 || err == EOVERFLOW);
- }
- /* err is overwritten here, so an EOVERFLOW above is not returned. */
- err = zap_entry_read_name(&zeh,
- sizeof (za->za_name), za->za_name);
- ASSERT(err == 0);
- }
- /* Keep the dbuf hold in zc_leaf; only the leaf lock is dropped. */
- rw_exit(&zc->zc_leaf->l_rwlock);
- return (err);
-}
-
-
-/*
- * Accumulate leaf statistics into 'zs' for the 'len' pointer-table entries
- * in 'tbl'.  Runs of identical consecutive entries are visited once; leaves
- * that cannot be held are silently skipped.
- */
-static void
-zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
-{
-	uint64_t prev = 0;
-	int i;
-
-	/*
-	 * NB: if a leaf has more pointers than an entire ptrtbl block
-	 * can hold, then it'll be accounted for more than once, since
-	 * we won't have lastblk.
-	 */
-	for (i = 0; i < len; i++) {
-		uint64_t blk = tbl[i];
-		zap_leaf_t *leaf;
-
-		if (blk == prev)
-			continue;
-		prev = blk;
-
-		if (zap_get_leaf_byblk(zap, blk, NULL, RW_READER, &leaf) != 0)
-			continue;
-
-		zap_leaf_stats(zap, leaf, zs);
-		zap_put_leaf(leaf);
-	}
-}
-
-/*
- * Fill 'zs' with statistics for a fat zap: the header and pointer-table
- * fields are copied from zap_phys, then every leaf reachable through the
- * pointer table is fed to zap_stats_ptrtbl().  Pointer-table blocks that
- * cannot be held are skipped without reporting an error.
- */
-void
-fzap_get_stats(zap_t *zap, zap_stats_t *zs)
-{
- int bs = FZAP_BLOCK_SHIFT(zap);
- zs->zs_blocksize = 1ULL << bs;
-
- /*
- * Set zap_phys_t fields
- */
- zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
- zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
- zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
- zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
- zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
- zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
-
- /*
- * Set zap_ptrtbl fields
- */
- zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
- zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
- zs->zs_ptrtbl_blks_copied =
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
- zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
- zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
- zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
- /* the ptrtbl is entirely in the header block. */
- zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
- 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
- } else {
- int b;
-
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
-
- /* Walk the external pointer table one block at a time. */
- for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
- b++) {
- dmu_buf_t *db;
- int err;
-
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
- FTAG, &db);
- if (err == 0) {
- /* Each block holds 1 << (bs-3) eight-byte entries. */
- zap_stats_ptrtbl(zap, db->db_data,
- 1<<(bs-3), zs);
- dmu_buf_rele(db, FTAG);
- }
- }
- }
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
deleted file mode 100644
index 5dff514..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ /dev/null
@@ -1,741 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * The 512-byte leaf is broken into 32 16-byte chunks.
- * chunk number n means l_chunk[n], even though the header precedes it.
- * the names are stored null-terminated.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-
-#define CHAIN_END 0xffff /* end of the chunk chain */
-
-/* half the (current) minimum block size */
-#define MAX_ARRAY_BYTES (8<<10)
-
-/*
- * Hash-table slot for hash h in leaf l: shift h right so that the
- * ZAP_LEAF_HASH_SHIFT(l) bits sitting just below the top lh_prefix_len
- * bits become the low bits, then mask to the table size.  The mask is
- * NUMENTRIES-1, which presumably relies on the table size being a power
- * of two.
- */
-#define LEAF_HASH(l, h) \
- ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
- ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
-
-/* Address of the l_hash[] slot that heads the chain for hash h. */
-#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
-
-
-/*
- * Byte-wise fill: store c (converted to char) into each of the n bytes
- * starting at a.
- */
-static void
-zap_memset(void *a, int c, size_t n)
-{
-	char *dst = a;
-	size_t off;
-
-	for (off = 0; off < n; off++)
-		dst[off] = c;
-}
-
-/*
- * Store `value' at addr as an unsigned integer that is len bytes wide.
- * len must be 1, 2, 4 or 8; any other width trips the ASSERT and stores
- * nothing.
- */
-static void
-stv(int len, void *addr, uint64_t value)
-{
-	if (len == 1) {
-		*(uint8_t *)addr = value;
-	} else if (len == 2) {
-		*(uint16_t *)addr = value;
-	} else if (len == 4) {
-		*(uint32_t *)addr = value;
-	} else if (len == 8) {
-		*(uint64_t *)addr = value;
-	} else {
-		ASSERT(!"bad int len");
-	}
-}
-
-/*
- * Load the len-byte unsigned integer at addr and return it widened to
- * 64 bits.  len must be 1, 2, 4 or 8; any other width trips the ASSERT
- * and yields a recognizable garbage pattern.
- */
-static uint64_t
-ldv(int len, const void *addr)
-{
-	if (len == 1)
-		return (*(const uint8_t *)addr);
-	if (len == 2)
-		return (*(const uint16_t *)addr);
-	if (len == 4)
-		return (*(const uint32_t *)addr);
-	if (len == 8)
-		return (*(const uint64_t *)addr);
-
-	ASSERT(!"bad int len");
-	return (0xFEEDFACEDEADBEEFULL);
-}
-
-/*
- * Byteswap a leaf block of `size' bytes in place: first the header
- * fields, then every hash-table slot, then each chunk according to the
- * type recorded in the chunk itself.
- */
-void
-zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
-{
-	zap_leaf_t l;
-	int slot, c;
-
-	/* Scratch leaf handle, so the ZAP_LEAF_* macros can be applied. */
-	l.l_bs = highbit(size)-1;
-	l.l_phys = buf;
-
-	buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
-	buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
-	buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
-	buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
-	buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
-	buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
-	buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
-
-	for (slot = 0; slot < ZAP_LEAF_HASH_NUMENTRIES(&l); slot++)
-		buf->l_hash[slot] = BSWAP_16(buf->l_hash[slot]);
-
-	c = 0;
-	while (c < ZAP_LEAF_NUMCHUNKS(&l)) {
-		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, c);
-		int type = lc->l_free.lf_type;
-
-		if (type == ZAP_CHUNK_ENTRY) {
-			struct zap_leaf_entry *le = &lc->l_entry;
-
-			le->le_type = BSWAP_8(le->le_type);
-			le->le_int_size = BSWAP_8(le->le_int_size);
-			le->le_next = BSWAP_16(le->le_next);
-			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
-			le->le_name_length = BSWAP_16(le->le_name_length);
-			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
-			le->le_value_length = BSWAP_16(le->le_value_length);
-			le->le_cd = BSWAP_32(le->le_cd);
-			le->le_hash = BSWAP_64(le->le_hash);
-		} else if (type == ZAP_CHUNK_FREE) {
-			lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
-			lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
-		} else if (type == ZAP_CHUNK_ARRAY) {
-			lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
-			lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
-			/* la_array doesn't need swapping */
-		} else {
-			ASSERT(!"bad leaf type");
-		}
-		c++;
-	}
-}
-
-/*
- * Format the block behind l as an empty leaf: zeroed header, empty hash
- * chains, and every chunk linked onto the free list.
- */
-void
-zap_leaf_init(zap_leaf_t *l)
-{
-	int c;
-
-	l->l_bs = highbit(l->l_dbuf->db_size)-1;
-
-	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
-	/*
-	 * zap_memset() fills bytes, so CHAIN_END (0xffff) goes in as 0xff
-	 * per byte; the length is two bytes per hash slot.
-	 */
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
-
-	/*
-	 * Chunk c links to chunk c+1.  lh_freelist was zeroed above, so
-	 * the list starts at chunk 0.
-	 */
-	c = 0;
-	while (c < ZAP_LEAF_NUMCHUNKS(l)) {
-		ZAP_LEAF_CHUNK(l, c).l_free.lf_type = ZAP_CHUNK_FREE;
-		ZAP_LEAF_CHUNK(l, c).l_free.lf_next = c+1;
-		c++;
-	}
-	/* The last chunk terminates the list. */
-	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
-
-	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
-	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
-	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
-}
-
-/*
- * Routines which manipulate leaf chunks (l_chunk[]).
- */
-
-/*
- * Pop the first chunk off the leaf's free list and return its number.
- * The caller must already know that a free chunk exists; this is only
- * asserted, not checked.
- */
-static uint16_t
-zap_leaf_chunk_alloc(zap_leaf_t *l)
-{
-	int head;
-
-	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
-
-	head = l->l_phys->l_hdr.lh_freelist;
-	ASSERT3U(head, <, ZAP_LEAF_NUMCHUNKS(l));
-	ASSERT3U(ZAP_LEAF_CHUNK(l, head).l_free.lf_type, ==, ZAP_CHUNK_FREE);
-
-	/* Unlink it: the free list now starts at its successor. */
-	l->l_phys->l_hdr.lh_nfree--;
-	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, head).l_free.lf_next;
-
-	return (head);
-}
-
-/*
- * Return `chunk' to the leaf's free list by pushing it on the front.
- * The chunk must currently be in use (asserted).
- */
-static void
-zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
-{
-	struct zap_leaf_free *freed = &ZAP_LEAF_CHUNK(l, chunk).l_free;
-
-	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
-	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-	ASSERT(freed->lf_type != ZAP_CHUNK_FREE);
-
-	freed->lf_type = ZAP_CHUNK_FREE;
-	bzero(freed->lf_pad, sizeof (freed->lf_pad)); /* help it to compress */
-
-	/* Push onto the head of the free list. */
-	freed->lf_next = l->l_phys->l_hdr.lh_freelist;
-	l->l_phys->l_hdr.lh_freelist = chunk;
-	l->l_phys->l_hdr.lh_nfree++;
-}
-
-/*
- * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
- */
-
-/*
- * Copy num_integers integers, each integer_size bytes wide, from buf
- * into a freshly allocated chain of array chunks and return the number
- * of the first chunk (CHAIN_END when num_integers is not positive).
- *
- * Each integer is written most-significant byte first, and integers are
- * packed back to back, so one integer may straddle two chunks.  Chunks
- * are taken with zap_leaf_chunk_alloc(), which only asserts that a free
- * chunk exists.
- */
-static uint16_t
-zap_leaf_array_create(zap_leaf_t *l, const char *buf,
- int integer_size, int num_integers)
-{
- uint16_t chunk_head;
- uint16_t *chunkp = &chunk_head;
- int byten = 0;
- uint64_t value;
- /* right-shift that brings the integer's top byte into the low 8 bits */
- int shift = (integer_size-1)*8;
- int len = num_integers;
-
- ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
-
- while (len > 0) {
- uint16_t chunk = zap_leaf_chunk_alloc(l);
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int i;
-
- la->la_type = ZAP_CHUNK_ARRAY;
- for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
- /* byten == 0: start of a new integer, fetch it from buf */
- if (byten == 0)
- value = ldv(integer_size, buf);
- /* emit the current top byte, then move the next one up */
- la->la_array[i] = value >> shift;
- value <<= 8;
- if (++byten == integer_size) {
- byten = 0;
- buf += integer_size;
- if (--len == 0)
- break;
- }
- }
-
- /* link this chunk in; chunkp then tracks its la_next field */
- *chunkp = chunk;
- chunkp = &la->la_next;
- }
- *chunkp = CHAIN_END;
-
- return (chunk_head);
-}
-
-/*
- * Free every chunk of the array chain that starts at *chunkp, and set
- * *chunkp to CHAIN_END.
- */
-static void
-zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
-{
-	uint16_t cur;
-	int next;
-
-	cur = *chunkp;
-	*chunkp = CHAIN_END;
-
-	for (; cur != CHAIN_END; cur = next) {
-		/* Pick up the link first; freeing rewrites the chunk. */
-		next = ZAP_LEAF_CHUNK(l, cur).l_array.la_next;
-		ASSERT3U(ZAP_LEAF_CHUNK(l, cur).l_array.la_type, ==,
-		    ZAP_CHUNK_ARRAY);
-		zap_leaf_chunk_free(l, cur);
-	}
-}
-
-/*
- * Read the array chain starting at `chunk' into buf.
- *
- * The stored array holds array_len integers of array_int_len bytes each
- * (most-significant byte first); buf has room for buf_len integers of
- * buf_int_len bytes each.  MIN(array_len, buf_len) integers are copied,
- * each widened from array_int_len to buf_int_len bytes.
- *
- * array_len and buf_len are in integers, not bytes
- */
-static void
-zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
- int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
- char *buf)
-{
- int len = MIN(array_len, buf_len);
- int byten = 0;
- uint64_t value = 0;
-
- ASSERT3U(array_int_len, <=, buf_int_len);
-
- /* Fast path for one 8-byte integer */
- if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- uint8_t *ip = la->la_array;
- uint64_t *buf64 = (uint64_t *)buf;
-
- /* assemble the eight stored bytes, most significant first */
- *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
- (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
- (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
- (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
- return;
- }
-
- /* Fast path for an array of 1-byte integers (eg. the entry name) */
- /*
- * Whole chunks are copied until the chain ends, so this path is only
- * taken when buf has more than one chunk's worth of slack beyond
- * array_len.
- */
- if (array_int_len == 1 && buf_int_len == 1 &&
- buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
- while (chunk != CHAIN_END) {
- struct zap_leaf_array *la =
- &ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
- buf += ZAP_LEAF_ARRAY_BYTES;
- chunk = la->la_next;
- }
- return;
- }
-
- /*
- * General path: accumulate bytes into `value' until a whole stored
- * integer has been seen, then store it at the output width.  `value'
- * and `byten' carry over from one chunk to the next.
- */
- while (len > 0) {
- struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int i;
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
- value = (value << 8) | la->la_array[i];
- byten++;
- if (byten == array_int_len) {
- stv(buf_int_len, buf, value);
- byten = 0;
- len--;
- if (len == 0)
- return;
- buf += buf_int_len;
- }
- }
- chunk = la->la_next;
- }
-}
-
-/*
- * Compare the first array_len bytes of the array chain starting at
- * `chunk' with the first array_len bytes of buf; return nonzero when
- * they are identical.
- *
- * Only to be used on 8-bit arrays.
- * array_len is actual len in bytes (not encoded le_value_length).
- * buf is null-terminated.
- */
-static int
-zap_leaf_array_equal(zap_leaf_t *l, int chunk,
-    int array_len, const char *buf)
-{
-	int matched = 0;
-
-	while (matched < array_len) {
-		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
-		int span = MIN(array_len - matched, ZAP_LEAF_ARRAY_BYTES);
-
-		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-		/* Any difference within this chunk settles the answer. */
-		if (bcmp(la->la_array, buf + matched, span))
-			return (0);
-		matched += span;
-		chunk = la->la_next;
-	}
-	return (matched == array_len);
-}
-
-/*
- * Routines which manipulate leaf entries.
- */
-
-/*
- * Look up the entry called `name', whose hash is h, in leaf l.
- *
- * On success fills in *zeh and returns 0; zeh_chunkp is left pointing at
- * the link (hash slot or previous entry's le_next) that refers to the
- * entry.  Returns ENOENT when no entry on the hash chain matches.
- */
-int
-zap_leaf_lookup(zap_leaf_t *l,
-    const char *name, uint64_t h, zap_entry_handle_t *zeh)
-{
-	uint16_t *linkp;
-
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
-	linkp = LEAF_HASH_ENTPTR(l, h);
-	while (*linkp != CHAIN_END) {
-		uint16_t chunk = *linkp;
-		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk);
-
-		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
-		/* Compare hashes first; names only when the hashes agree. */
-		if (le->le_hash == h &&
-		    zap_leaf_array_equal(l, le->le_name_chunk,
-		    le->le_name_length, name)) {
-			zeh->zeh_leaf = l;
-			zeh->zeh_chunkp = linkp;
-			zeh->zeh_hash = le->le_hash;
-			zeh->zeh_cd = le->le_cd;
-			zeh->zeh_integer_size = le->le_int_size;
-			zeh->zeh_num_integers = le->le_value_length;
-			return (0);
-		}
-		linkp = &le->le_next;
-	}
-
-	return (ENOENT);
-}
-
-/*
- * Return (h1,cd1 >= h2,cd2), i.e. TRUE when the pair (h1, cd1) sorts at
- * or after (h2, cd2), comparing the hash first and the collision
- * differentiator second.
- *
- * Every argument is parenthesized so that arbitrary expressions can be
- * passed; h1 and h2 may still be evaluated twice.
- */
-#define HCD_GTEQ(h1, cd1, h2, cd2) \
-	(((h1) > (h2)) ? TRUE : \
-	(((h1) == (h2) && (cd1) >= (cd2)) ? TRUE : FALSE))
-
-/*
- * Find the entry in leaf l with the smallest (hash, cd) pair that is
- * greater than or equal to (h, cd), and describe it in *zeh.
- *
- * Returns 0 when such an entry was found, ENOENT when the search ends
- * with bestcd still at its initial value ZAP_MAXCD.
- *
- * zeh_chunkp is pointed at zeh_fakechunk, a copy of the chunk number
- * kept inside the handle, rather than at the real link in the leaf.
- */
-int
-zap_leaf_lookup_closest(zap_leaf_t *l,
- uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
-{
- uint16_t chunk;
- /* best candidate so far; starts at the largest possible pair */
- uint64_t besth = -1ULL;
- uint32_t bestcd = ZAP_MAXCD;
- /* last hash slot worth scanning; pulled in as candidates are found */
- uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
- uint16_t lh;
- struct zap_leaf_entry *le;
-
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
-
- for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
- for (chunk = l->l_phys->l_hash[lh];
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
-
- ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
- /* candidate: at or after the target, and no worse than best */
- if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
- HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
- ASSERT3U(bestlh, >=, lh);
- bestlh = lh;
- besth = le->le_hash;
- bestcd = le->le_cd;
-
- zeh->zeh_num_integers = le->le_value_length;
- zeh->zeh_integer_size = le->le_int_size;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_fakechunk = chunk;
- zeh->zeh_chunkp = &zeh->zeh_fakechunk;
- zeh->zeh_leaf = l;
- }
- }
- }
-
- return (bestcd == ZAP_MAXCD ? ENOENT : 0);
-}
-
-/*
- * Copy the value of the entry behind zeh into buf, which has room for
- * num_integers integers of integer_size bytes each.
- *
- * Returns EINVAL, copying nothing, when the stored integers are wider
- * than integer_size.  Returns EOVERFLOW when the entry holds more than
- * num_integers integers; in that case the part that fits has still been
- * copied.  Returns 0 otherwise.
- */
-int
-zap_entry_read(const zap_entry_handle_t *zeh,
-    uint8_t integer_size, uint64_t num_integers, void *buf)
-{
-	struct zap_leaf_entry *le =
-	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
-
-	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
-	if (integer_size < le->le_int_size)
-		return (EINVAL);
-
-	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size,
-	    le->le_value_length, integer_size, num_integers, buf);
-
-	return (zeh->zeh_num_integers > num_integers ? EOVERFLOW : 0);
-}
-
-/*
- * Copy the name of the entry behind zeh into buf (buflen bytes).
- *
- * Returns EOVERFLOW when the stored name length exceeds buflen (the
- * part that fits has still been copied), 0 otherwise.
- */
-int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
-{
-	struct zap_leaf_entry *le =
-	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
-
-	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
-	/* The name is stored as an array of 1-byte integers. */
-	zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
-	    le->le_name_length, 1, buflen, buf);
-
-	return (le->le_name_length > buflen ? EOVERFLOW : 0);
-}
-
-/*
- * Replace the value of the entry behind zeh with num_integers integers
- * of integer_size bytes each, taken from buf.
- *
- * Returns EAGAIN, leaving the entry untouched, when the leaf does not
- * have enough free chunks for the net growth; returns 0 otherwise.
- */
-int
-zap_entry_update(zap_entry_handle_t *zeh,
-    uint8_t integer_size, uint64_t num_integers, const void *buf)
-{
-	int extra;
-	zap_leaf_t *l = zeh->zeh_leaf;
-	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
-
-	/* Array chunks the new value needs beyond what the old one holds. */
-	extra = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
-	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size);
-
-	if (extra > (int)l->l_phys->l_hdr.lh_nfree)
-		return (EAGAIN);
-
-	/*
-	 * We should search other chained leaves (via
-	 * zap_entry_remove,create?) otherwise returning EAGAIN will
-	 * just send us into an infinite loop if we have to chain
-	 * another leaf block, rather than being able to split this
-	 * block.
-	 */
-
-	/* Drop the old value chain, then build the new one. */
-	zap_leaf_array_free(l, &le->le_value_chunk);
-	le->le_value_chunk =
-	    zap_leaf_array_create(l, buf, integer_size, num_integers);
-	le->le_int_size = integer_size;
-	le->le_value_length = num_integers;
-	return (0);
-}
-
-/*
- * Remove the entry behind zeh from its leaf: free its name and value
- * chains, unlink it from the hash chain, and free the entry chunk.
- *
- * The handle must refer to a real link in the leaf, not to the handle's
- * own zeh_fakechunk (asserted).
- */
-void
-zap_entry_remove(zap_entry_handle_t *zeh)
-{
-	zap_leaf_t *l = zeh->zeh_leaf;
-	struct zap_leaf_entry *le;
-	uint16_t victim;
-
-	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
-
-	victim = *zeh->zeh_chunkp;
-	le = ZAP_LEAF_ENTRY(l, victim);
-	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
-
-	zap_leaf_array_free(l, &le->le_name_chunk);
-	zap_leaf_array_free(l, &le->le_value_chunk);
-
-	/* Splice the entry out of its chain before releasing the chunk. */
-	*zeh->zeh_chunkp = le->le_next;
-	zap_leaf_chunk_free(l, victim);
-
-	l->l_phys->l_hdr.lh_nentries--;
-}
-
-/*
- * Add an entry called `name' (hash h) to leaf l, with a value of
- * num_integers integers of integer_size bytes each taken from buf, and
- * describe the new entry in *zeh.
- *
- * If cd is ZAP_MAXCD a collision differentiator is picked here: the
- * lowest value not already used by an entry with the same hash.
- * Otherwise the caller's cd is used as given.
- *
- * Returns:
- *   0      on success
- *   E2BIG  the entry needs more chunks than the whole leaf has
- *   ENOSPC every cd below ZAP_MAXCD is already taken for this hash
- *   EAGAIN the leaf currently has too few free chunks
- */
-int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh)
-{
- uint16_t chunk;
- uint16_t *chunkp;
- struct zap_leaf_entry *le;
- uint64_t namelen, valuelen;
- int numchunks;
-
- valuelen = integer_size * num_integers;
- /* the stored name includes its terminating NUL */
- namelen = strlen(name) + 1;
- ASSERT(namelen >= 2);
-
- /* one chunk for the entry itself, plus the name and value chains */
- numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
- ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
- if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
- return (E2BIG);
-
- if (cd == ZAP_MAXCD) {
- for (cd = 0; cd < ZAP_MAXCD; cd++) {
- /* walk the hash chain looking for an entry using (h, cd) */
- for (chunk = *LEAF_HASH_ENTPTR(l, h);
- chunk != CHAIN_END; chunk = le->le_next) {
- le = ZAP_LEAF_ENTRY(l, chunk);
- if (le->le_hash == h &&
- le->le_cd == cd) {
- break;
- }
- }
- /* If this cd is not in use, we are good. */
- if (chunk == CHAIN_END)
- break;
- }
- /* If we tried all the cd's, we lose. */
- if (cd == ZAP_MAXCD)
- return (ENOSPC);
- }
-
- if (l->l_phys->l_hdr.lh_nfree < numchunks)
- return (EAGAIN);
-
- /* make the entry */
- chunk = zap_leaf_chunk_alloc(l);
- le = ZAP_LEAF_ENTRY(l, chunk);
- le->le_type = ZAP_CHUNK_ENTRY;
- le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
- le->le_name_length = namelen;
- le->le_value_chunk =
- zap_leaf_array_create(l, buf, integer_size, num_integers);
- le->le_value_length = num_integers;
- le->le_int_size = integer_size;
- le->le_hash = h;
- le->le_cd = cd;
-
- /* link it into the hash chain */
- /* (at the head, so the hash slot itself is the link to this entry) */
- chunkp = LEAF_HASH_ENTPTR(l, h);
- le->le_next = *chunkp;
- *chunkp = chunk;
-
- l->l_phys->l_hdr.lh_nentries++;
-
- zeh->zeh_leaf = l;
- zeh->zeh_num_integers = num_integers;
- zeh->zeh_integer_size = le->le_int_size;
- zeh->zeh_cd = le->le_cd;
- zeh->zeh_hash = le->le_hash;
- zeh->zeh_chunkp = chunkp;
-
- return (0);
-}
-
-/*
- * Routines for transferring entries between leafs.
- */
-
-/*
- * Push entry chunk `entry' onto the front of the hash chain selected by
- * its own le_hash.
- */
-static void
-zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
-{
-	struct zap_leaf_entry *ent = ZAP_LEAF_ENTRY(l, entry);
-	uint16_t *slot = LEAF_HASH_ENTPTR(l, ent->le_hash);
-
-	/* The old chain head becomes this entry's successor. */
-	ent->le_next = *slot;
-	*slot = entry;
-}
-
-/*
- * Move the array chain that starts at `chunk' in leaf l into leaf nl.
- *
- * Each source chunk is copied into a chunk newly allocated from nl and
- * then freed in l.  Returns the number of the first chunk of the new
- * chain in nl, or CHAIN_END if the source chain was empty.
- */
-static uint16_t
-zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
-{
-	uint16_t new_chunk;
-	uint16_t *nchunkp = &new_chunk;
-
-	while (chunk != CHAIN_END) {
-		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
-		struct zap_leaf_array *nla =
-		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
-		struct zap_leaf_array *la =
-		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
-		/* Save the link now; freeing the chunk below rewrites it. */
-		int nextchunk = la->la_next;
-
-		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
-		/* nchunk was allocated from nl, so check it against nl. */
-		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(nl));
-
-		*nla = *la; /* structure assignment */
-
-		zap_leaf_chunk_free(l, chunk);
-		chunk = nextchunk;
-
-		/* Append to the new chain; la_next is fixed up next round. */
-		*nchunkp = nchunk;
-		nchunkp = &nla->la_next;
-	}
-	*nchunkp = CHAIN_END;
-	return (new_chunk);
-}
-
-/*
- * Move entry chunk `entry', together with its name and value chains,
- * from leaf l to leaf nl, and hash it into nl.
- */
-static void
-zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
-{
-	struct zap_leaf_entry *src, *dst;
-	uint16_t dst_chunk;
-
-	src = ZAP_LEAF_ENTRY(l, entry);
-	ASSERT3U(src->le_type, ==, ZAP_CHUNK_ENTRY);
-
-	dst_chunk = zap_leaf_chunk_alloc(nl);
-	dst = ZAP_LEAF_ENTRY(nl, dst_chunk);
-	*dst = *src; /* structure assignment */
-
-	/* Overwrites the le_next that was just copied from the old leaf. */
-	zap_leaf_rehash_entry(nl, dst_chunk);
-
-	dst->le_name_chunk = zap_leaf_transfer_array(l, src->le_name_chunk, nl);
-	dst->le_value_chunk =
-	    zap_leaf_transfer_array(l, src->le_value_chunk, nl);
-
-	zap_leaf_chunk_free(l, entry);
-
-	nl->l_phys->l_hdr.lh_nentries++;
-	l->l_phys->l_hdr.lh_nentries--;
-}
-
-/*
- * Split leaf l in two.  Both leaves get a prefix one bit longer than
- * l's old one: l keeps the old prefix with a 0 appended, nl gets it
- * with a 1 appended.  Entries whose hash has the new prefix bit set are
- * transferred to nl; the rest are rehashed within l.
- */
-void
-zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl)
-{
-	int c;
-	/* Position of the first hash bit after l's current prefix. */
-	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
-
-	/* set new prefix and prefix_len */
-	l->l_phys->l_hdr.lh_prefix <<= 1;
-	l->l_phys->l_hdr.lh_prefix_len++;
-	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
-	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
-
-	/* break existing hash chains */
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
-
-	/*
-	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
-	 * the remaining entries
-	 *
-	 * NB: We could find entries via the hashtable instead. That
-	 * would be O(hashents+numents) rather than O(numblks+numents),
-	 * but this accesses memory more sequentially, and when we're
-	 * called, the block is usually pretty full.
-	 */
-	c = 0;
-	while (c < ZAP_LEAF_NUMCHUNKS(l)) {
-		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, c);
-
-		if (le->le_type == ZAP_CHUNK_ENTRY) {
-			if (le->le_hash & (1ULL << bit))
-				zap_leaf_transfer_entry(l, c, nl);
-			else
-				zap_leaf_rehash_entry(l, c);
-		}
-		c++;
-	}
-}
-
-/*
- * Add leaf l's contribution to the histograms in *zs.  Every histogram
- * index is clamped to ZAP_HISTOGRAM_SIZE-1.
- */
-void
-zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
-{
-	int slot, n;
-
-	/* Difference between the ptrtbl shift and this leaf's prefix length. */
-	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
-	    l->l_phys->l_hdr.lh_prefix_len;
-	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
-	zs->zs_leafs_with_2n_pointers[n]++;
-
-	/* Entry count, in buckets of five. */
-	n = l->l_phys->l_hdr.lh_nentries/5;
-	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
-	zs->zs_blocks_with_n5_entries[n]++;
-
-	/* Tenths of the block not accounted for by free chunks. */
-	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
-	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
-	    (1<<FZAP_BLOCK_SHIFT(zap));
-	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
-	zs->zs_blocks_n_tenths_full[n]++;
-
-	for (slot = 0; slot < ZAP_LEAF_HASH_NUMENTRIES(l); slot++) {
-		int chain_len = 0;
-		int chunk;
-
-		for (chunk = l->l_phys->l_hash[slot]; chunk != CHAIN_END;
-		    chain_len++) {
-			struct zap_leaf_entry *le =
-			    ZAP_LEAF_ENTRY(l, chunk);
-
-			/* Entry chunk plus its name and value chains. */
-			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) +
-			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length *
-			    le->le_int_size);
-			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
-			zs->zs_entries_using_n_chunks[n]++;
-
-			chunk = le->le_next;
-		}
-
-		n = chain_len;
-		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
-		zs->zs_buckets_with_n_entries[n]++;
-	}
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
deleted file mode 100644
index 9a882a5..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ /dev/null
@@ -1,857 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zfs_context.h>
-#include <sys/zap.h>
-#include <sys/refcount.h>
-#include <sys/zap_impl.h>
-#include <sys/zap_leaf.h>
-#include <sys/avl.h>
-
-
-static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
-
-
-/*
- * Byteswap a micro-ZAP block of `size' bytes in place: the block type,
- * the salt, and the value and cd of each of its size/MZAP_ENT_LEN - 1
- * chunks.
- */
-static void
-mzap_byteswap(mzap_phys_t *buf, size_t size)
-{
-	int c, nchunks;
-
-	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
-	buf->mz_salt = BSWAP_64(buf->mz_salt);
-
-	nchunks = (size / MZAP_ENT_LEN) - 1;
-	for (c = 0; c < nchunks; c++) {
-		mzap_ent_phys_t *ent = &buf->mz_chunk[c];
-
-		ent->mze_value = BSWAP_64(ent->mze_value);
-		ent->mze_cd = BSWAP_32(ent->mze_cd);
-	}
-}
-
-/*
- * Byteswap a ZAP block in place.  The first 64-bit word says whether it
- * is a micro-ZAP block (ZBT_MICRO in either byte order); anything else
- * is handed to fzap_byteswap().
- */
-void
-zap_byteswap(void *buf, size_t size)
-{
-	uint64_t block_type = *(uint64_t *)buf;
-
-	if (block_type != ZBT_MICRO && block_type != BSWAP_64(ZBT_MICRO)) {
-		fzap_byteswap(buf, size);
-		return;
-	}
-
-	/* ASSERT(magic == ZAP_LEAF_MAGIC); */
-	mzap_byteswap(buf, size);
-}
-
-/*
- * AVL comparator for mzap_ent_t: order by mze_hash, then by mze_cd.
- * Returns -1, 0 or +1.
- */
-static int
-mze_compare(const void *arg1, const void *arg2)
-{
-	const mzap_ent_t *a = arg1;
-	const mzap_ent_t *b = arg2;
-
-	if (a->mze_hash != b->mze_hash)
-		return (a->mze_hash > b->mze_hash ? +1 : -1);
-	if (a->mze_phys.mze_cd != b->mze_phys.mze_cd)
-		return (a->mze_phys.mze_cd > b->mze_phys.mze_cd ? +1 : -1);
-	return (0);
-}
-
-/*
- * Allocate an in-core mzap_ent_t for micro-ZAP chunk `chunkid', holding
- * a copy of *mzep and its hash, and add it to the zap's AVL tree.
- * The zap's rwlock must be held for writing (asserted).
- */
-static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
-{
-	mzap_ent_t *ent;
-
-	ASSERT(zap->zap_ismicro);
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT(mzep->mze_cd < ZAP_MAXCD);
-	ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
-
-	ent = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
-	ent->mze_phys = *mzep;
-	ent->mze_hash = hash;
-	ent->mze_chunkid = chunkid;
-	avl_add(&zap->zap_m.zap_avl, ent);
-}
-
-/*
- * Find the in-core micro-ZAP entry called `name', whose hash is `hash'.
- * Returns NULL when there is none, or when the name is too long to fit
- * in mze_name.
- */
-static mzap_ent_t *
-mze_find(zap_t *zap, const char *name, uint64_t hash)
-{
-	mzap_ent_t key;
-	mzap_ent_t *cur;
-	avl_index_t where;
-	avl_tree_t *tree = &zap->zap_m.zap_avl;
-
-	ASSERT(zap->zap_ismicro);
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-	ASSERT3U(zap_hash(zap, name), ==, hash);
-
-	if (strlen(name) >= sizeof (key.mze_phys.mze_name))
-		return (NULL);
-
-	/* Search key: the given hash with cd 0. */
-	key.mze_hash = hash;
-	key.mze_phys.mze_cd = 0;
-
-	cur = avl_find(tree, &key, &where);
-	if (cur == NULL)
-		cur = avl_nearest(tree, where, AVL_AFTER);
-
-	/* Walk the run of entries sharing this hash, comparing names. */
-	while (cur != NULL && cur->mze_hash == hash) {
-		if (strcmp(name, cur->mze_phys.mze_name) == 0)
-			return (cur);
-		cur = AVL_NEXT(tree, cur);
-	}
-	return (NULL);
-}
-
-/*
- * Return the lowest collision differentiator not used by any in-core
- * entry with the given hash.
- */
-static uint32_t
-mze_find_unused_cd(zap_t *zap, uint64_t hash)
-{
-	mzap_ent_t key;
-	mzap_ent_t *cur;
-	avl_index_t where;
-	avl_tree_t *tree = &zap->zap_m.zap_avl;
-	uint32_t cd = 0;
-
-	ASSERT(zap->zap_ismicro);
-	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
-
-	key.mze_hash = hash;
-	key.mze_phys.mze_cd = 0;
-
-	/*
-	 * Start at the entry with cd 0, if there is one, and count
-	 * upward for as long as consecutive entries use consecutive cds.
-	 */
-	cur = avl_find(tree, &key, &where);
-	while (cur != NULL && cur->mze_hash == hash &&
-	    cur->mze_phys.mze_cd == cd) {
-		cd++;
-		cur = AVL_NEXT(tree, cur);
-	}
-
-	return (cd);
-}
-
-/*
- * Take mze out of the zap's AVL tree and free it.  The zap's rwlock
- * must be held for writing (asserted).
- */
-static void
-mze_remove(zap_t *zap, mzap_ent_t *mze)
-{
-	avl_tree_t *tree = &zap->zap_m.zap_avl;
-
-	ASSERT(zap->zap_ismicro);
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	avl_remove(tree, mze);
-	kmem_free(mze, sizeof (mzap_ent_t));
-}
-
-/*
- * Free every in-core micro-ZAP entry and destroy the AVL tree itself.
- */
-static void
-mze_destroy(zap_t *zap)
-{
-	avl_tree_t *tree = &zap->zap_m.zap_avl;
-	void *cookie = NULL;
-	mzap_ent_t *ent;
-
-	while ((ent = avl_destroy_nodes(tree, &cookie)) != NULL)
-		kmem_free(ent, sizeof (mzap_ent_t));
-	avl_destroy(tree);
-}
-
-/*
- * Build the in-core zap_t for the ZAP whose first block is held in db
- * and register it on db with dmu_buf_set_user().
- *
- * If dmu_buf_set_user() hands back a non-NULL "winner", the zap built
- * here is torn down and the winner is returned instead.  Otherwise the
- * new zap is finished off (for a micro-ZAP, by loading every non-empty
- * chunk into the AVL tree) and returned.
- *
- * The zap's rwlock is held as writer while it is being set up and is
- * released again before returning.
- */
-static zap_t *
-mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
-{
- zap_t *winner;
- zap_t *zap;
- int i;
-
- ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
-
- zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
- rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
- rw_enter(&zap->zap_rwlock, RW_WRITER);
- zap->zap_objset = os;
- zap->zap_object = obj;
- zap->zap_dbuf = db;
-
- /* the first 64-bit word of the block is its block type */
- if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
- mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
- MUTEX_DEFAULT, 0);
- zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
- } else {
- zap->zap_ismicro = TRUE;
- }
-
- /*
- * Make sure that zap_ismicro is set before we let others see
- * it, because zap_lockdir() checks zap_ismicro without the lock
- * held.
- */
- winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
-
- if (winner != NULL) {
- /* undo everything that was set up above */
- rw_exit(&zap->zap_rwlock);
- rw_destroy(&zap->zap_rwlock);
- if (!zap->zap_ismicro)
- mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
- kmem_free(zap, sizeof (zap_t));
- return (winner);
- }
-
- if (zap->zap_ismicro) {
- zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
- /* one MZAP_ENT_LEN-sized slot of the block is not a chunk */
- zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
- avl_create(&zap->zap_m.zap_avl, mze_compare,
- sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
-
- for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze =
- &zap->zap_m.zap_phys->mz_chunk[i];
- /* a chunk whose name is empty is unused */
- if (mze->mze_name[0]) {
- zap->zap_m.zap_num_entries++;
- mze_insert(zap, i,
- zap_hash(zap, mze->mze_name), mze);
- }
- }
- } else {
- zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
-
- ASSERT3U(sizeof (struct zap_leaf_header), ==,
- 2*ZAP_LEAF_CHUNKSIZE);
-
- /*
- * The embedded pointer table should not overlap the
- * other members.
- */
- ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
- &zap->zap_f.zap_phys->zap_salt);
-
- /*
- * The embedded pointer table should end at the end of
- * the block
- */
- ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
- 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
- (uintptr_t)zap->zap_f.zap_phys, ==,
- zap->zap_dbuf->db_size);
- }
- rw_exit(&zap->zap_rwlock);
- return (zap);
-}
-
-/*
- * Hold block 0 of ZAP object `obj', find (or create) its in-core zap_t,
- * and take the zap's rwlock.
- *
- * lti is the lock type wanted; if `fatreader' is set and the ZAP is not
- * a micro-ZAP, RW_READER is taken instead.  When the lock ends up held
- * as writer the dbuf is marked dirty in tx.
- *
- * If tx is given and a micro-ZAP has no free chunk left, the object is
- * either grown by SPA_MINBLOCKSIZE or, once that would exceed
- * MZAP_MAX_BLKSZ, upgraded with mzap_upgrade().
- *
- * Returns 0 with *zapp set; the dbuf hold and the rwlock stay held and
- * are released by zap_unlockdir().  Returns the dmu_buf_hold() error,
- * with *zapp NULL, when the block cannot be held.
- */
-int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
-    krw_t lti, int fatreader, zap_t **zapp)
-{
-	zap_t *zap;
-	dmu_buf_t *db;
-	krw_t lt;
-	int err;
-
-	*zapp = NULL;
-
-	err = dmu_buf_hold(os, obj, 0, NULL, &db);
-	if (err)
-		return (err);
-
-#ifdef ZFS_DEBUG
-	{
-		dmu_object_info_t doi;
-		dmu_object_info_from_db(db, &doi);
-		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
-	}
-#endif
-
-	zap = dmu_buf_get_user(db);
-	if (zap == NULL)
-		zap = mzap_open(os, obj, db);
-
-	/*
-	 * We're checking zap_ismicro without the lock held, in order to
-	 * tell what type of lock we want.  Once we have some sort of
-	 * lock, see if it really is the right type.  In practice this
-	 * can only be different if it was upgraded from micro to fat,
-	 * and micro wanted WRITER but fat only needs READER.
-	 */
-	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
-	rw_enter(&zap->zap_rwlock, lt);
-	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
-		/* it was upgraded, now we only need reader */
-		ASSERT(lt == RW_WRITER);
-		/*
-		 * The conditional must be parenthesized: `==' binds
-		 * tighter than `?:', so without the parentheses the
-		 * comparison would be made against the condition alone.
-		 */
-		ASSERT(RW_READER ==
-		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
-		rw_downgrade(&zap->zap_rwlock);
-		lt = RW_READER;
-	}
-
-	zap->zap_objset = os;
-
-	if (lt == RW_WRITER)
-		dmu_buf_will_dirty(db, tx);
-
-	ASSERT3P(zap->zap_dbuf, ==, db);
-
-	ASSERT(!zap->zap_ismicro ||
-	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
-	if (zap->zap_ismicro && tx &&
-	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
-		/* The micro-ZAP is full: grow the block or go fat. */
-		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
-		if (newsz > MZAP_MAX_BLKSZ) {
-			dprintf("upgrading obj %llu: num_entries=%u\n",
-			    obj, zap->zap_m.zap_num_entries);
-			mzap_upgrade(zap, tx);
-			*zapp = zap;
-			return (0);
-		}
-		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
-		ASSERT3U(err, ==, 0);
-		zap->zap_m.zap_num_chunks =
-		    db->db_size / MZAP_ENT_LEN - 1;
-	}
-
-	*zapp = zap;
-	return (0);
-}
-
-/*
- * Drop the zap's rwlock and then release the hold on its dbuf (tag
- * NULL).
- */
-void
-zap_unlockdir(zap_t *zap)
-{
-	dmu_buf_t *db = zap->zap_dbuf;
-
-	rw_exit(&zap->zap_rwlock);
-	dmu_buf_rele(db, NULL);
-}
-
-/*
- * Convert a micro-ZAP into a fat ZAP: save a copy of the micro-ZAP
- * block, change the object's block size to the fat-ZAP default, drop
- * the in-core entries, let fzap_upgrade() reformat the object, and then
- * re-add every non-empty saved chunk with its original cd.
- *
- * The zap's rwlock must be held for writing (asserted).
- */
-static void
-mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
-{
-	mzap_phys_t *saved;
-	int c, sz, nchunks, err;
-
-	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-	/* Copy the old contents aside before the block is reformatted. */
-	sz = zap->zap_dbuf->db_size;
-	saved = kmem_alloc(sz, KM_SLEEP);
-	bcopy(zap->zap_dbuf->db_data, saved, sz);
-	nchunks = zap->zap_m.zap_num_chunks;
-
-	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
-	    1ULL << fzap_default_block_shift, 0, tx);
-	ASSERT(err == 0);
-
-	dprintf("upgrading obj=%llu with %u chunks\n",
-	    zap->zap_object, nchunks);
-	mze_destroy(zap);
-
-	fzap_upgrade(zap, tx);
-
-	for (c = 0; c < nchunks; c++) {
-		mzap_ent_phys_t *mze = &saved->mz_chunk[c];
-		int add_err;
-
-		if (mze->mze_name[0] != 0) {
-			dprintf("adding %s=%llu\n",
-			    mze->mze_name, mze->mze_value);
-			/* A micro-ZAP value is a single 8-byte integer. */
-			add_err = fzap_add_cd(zap,
-			    mze->mze_name, 8, 1, &mze->mze_value,
-			    mze->mze_cd, tx);
-			ASSERT3U(add_err, ==, 0);
-		}
-	}
-	kmem_free(saved, sz);
-}
-
-/*
- * Hash `name' for this zap: a table-driven CRC-64 over the bytes of the
- * name (not including the terminating NUL), seeded with the zap's salt,
- * with everything but the top ZAP_HASHBITS bits cleared.
- */
-uint64_t
-zap_hash(zap_t *zap, const char *name)
-{
-	const uint8_t *cp = (const uint8_t *)name;
-	uint64_t crc = zap->zap_salt;
-
-	ASSERT(crc != 0);
-	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-	while (*cp != '\0') {
-		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *cp) & 0xFF];
-		cp++;
-	}
-
-	/*
-	 * Only use 28 bits, since we need 4 bits in the cookie for the
-	 * collision differentiator.  We MUST use the high bits, since
-	 * those are the ones that we first pay attention to when
-	 * choosing the bucket.
-	 */
-	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
-
-	return (crc);
-}
-
-
-/*
- * Initialize block 0 of object `obj' as an empty micro-ZAP: mark the
- * dbuf dirty in tx, then set the block type and a nonzero salt.
- */
-static void
-mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
-{
-	dmu_buf_t *db;
-	mzap_phys_t *mzp;
-
-	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
-
-#ifdef ZFS_DEBUG
-	{
-		dmu_object_info_t doi;
-		dmu_object_info_from_db(db, &doi);
-		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
-	}
-#endif
-
-	dmu_buf_will_dirty(db, tx);
-	mzp = db->db_data;
-	mzp->mz_block_type = ZBT_MICRO;
-	/*
-	 * The salt mixes two pointer values with the object number;
-	 * OR-ing in 1 guarantees that it is never zero.
-	 */
-	mzp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
-	ASSERT(mzp->mz_salt != 0);
-	dmu_buf_rele(db, FTAG);
-}
-
-/*
- * Claim object number `obj' with dmu_object_claim() and, if that works,
- * initialize it as an empty micro-ZAP.  Returns the dmu_object_claim()
- * result.
- */
-int
-zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
-
-	if (err == 0)
-		mzap_create_impl(os, obj, tx);
-	return (err);
-}
-
-/*
- * Allocate a new object with dmu_object_alloc(), initialize it as an
- * empty micro-ZAP, and return its object number.
- */
-uint64_t
-zap_create(objset_t *os, dmu_object_type_t ot,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
-{
-	uint64_t obj;
-
-	obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
-	mzap_create_impl(os, obj, tx);
-
-	return (obj);
-}
-
-/*
- * Destroy ZAP object `zapobj'.  Returns the dmu_object_free() result.
- */
-int
-zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
-{
-	int err;
-
-	/*
-	 * dmu_object_free will free the object number and free the
-	 * data.  Freeing the data will cause our pageout function to be
-	 * called, which will destroy our data (zap_leaf_t's and zap_t).
-	 */
-	err = dmu_object_free(os, zapobj, tx);
-
-	return (err);
-}
-
-/*
- * Tear down the in-core zap_t passed as vzap: destroy its rwlock, then
- * its micro-ZAP entries or its fat-ZAP mutex, and free the structure.
- * The db argument is not used.
- */
-_NOTE(ARGSUSED(0))
-void
-zap_evict(dmu_buf_t *db, void *vzap)
-{
-	zap_t *zap = vzap;
-
-	rw_destroy(&zap->zap_rwlock);
-
-	if (!zap->zap_ismicro)
-		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
-	else
-		mze_destroy(zap);
-
-	kmem_free(zap, sizeof (zap_t));
-}
-
-/*
- * Store the number of entries of ZAP object `zapobj' in *count.
- * Returns 0, or the error from zap_lockdir() or fzap_count().
- */
-int
-zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
-{
-	zap_t *zap;
-	int err;
-
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
-	if (err)
-		return (err);
-
-	if (zap->zap_ismicro)
-		*count = zap->zap_m.zap_num_entries;
-	else
-		err = fzap_count(zap, count);
-
-	zap_unlockdir(zap);
-	return (err);
-}
-
-/*
- * Routines for manipulating attributes.
- */
-
-/*
- * Look up attribute `name' in ZAP object `zapobj' and copy its value
- * into buf, which has room for num_integers integers of integer_size
- * bytes each.
- *
- * A fat ZAP is handled by fzap_lookup().  A micro-ZAP value is a single
- * 8-byte integer, so there the lookup returns ENOENT if the name is
- * absent, EOVERFLOW if num_integers is below 1, and EINVAL if
- * integer_size is not 8.  An error from zap_lockdir() is returned as is.
- */
-int
-zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf)
-{
-	zap_t *zap;
-	int err;
-	mzap_ent_t *mze;
-
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
-	if (err)
-		return (err);
-
-	if (zap->zap_ismicro) {
-		mze = mze_find(zap, name, zap_hash(zap, name));
-		if (mze == NULL)
-			err = ENOENT;
-		else if (num_integers < 1)
-			err = EOVERFLOW;
-		else if (integer_size != 8)
-			err = EINVAL;
-		else
-			*(uint64_t *)buf = mze->mze_phys.mze_value;
-	} else {
-		err = fzap_lookup(zap, name,
-		    integer_size, num_integers, buf);
-	}
-
-	zap_unlockdir(zap);
-	return (err);
-}
-
-int
-zap_length(objset_t *os, uint64_t zapobj, const char *name,
- uint64_t *integer_size, uint64_t *num_integers)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
-
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_length(zap, name, integer_size, num_integers);
- } else {
- mze = mze_find(zap, name, zap_hash(zap, name));
- if (mze == NULL) {
- err = ENOENT;
- } else {
- if (integer_size)
- *integer_size = 8;
- if (num_integers)
- *num_integers = 1;
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-static void
-mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
-{
- int i;
- int start = zap->zap_m.zap_alloc_next;
- uint32_t cd;
-
- dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
-#ifdef ZFS_DEBUG
- for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- ASSERT(strcmp(name, mze->mze_name) != 0);
- }
-#endif
-
- cd = mze_find_unused_cd(zap, hash);
- /* given the limited size of the microzap, this can't happen */
- ASSERT(cd != ZAP_MAXCD);
-
-again:
- for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- if (mze->mze_name[0] == 0) {
- mze->mze_value = value;
- mze->mze_cd = cd;
- (void) strcpy(mze->mze_name, name);
- zap->zap_m.zap_num_entries++;
- zap->zap_m.zap_alloc_next = i+1;
- if (zap->zap_m.zap_alloc_next ==
- zap->zap_m.zap_num_chunks)
- zap->zap_m.zap_alloc_next = 0;
- mze_insert(zap, i, hash, mze);
- return;
- }
- }
- if (start != 0) {
- start = 0;
- goto again;
- }
- ASSERT(!"out of entries!");
-}
-
-int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
- const uint64_t *intval = val;
- uint64_t hash;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_add(zap, name, integer_size, num_integers, val, tx);
- } else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- mzap_upgrade(zap, tx);
- err = fzap_add(zap, name, integer_size, num_integers, val, tx);
- } else {
- hash = zap_hash(zap, name);
- mze = mze_find(zap, name, hash);
- if (mze != NULL) {
- err = EEXIST;
- } else {
- mzap_addent(zap, name, hash, *intval);
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-int
-zap_update(objset_t *os, uint64_t zapobj, const char *name,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
-{
- zap_t *zap;
- mzap_ent_t *mze;
- const uint64_t *intval = val;
- uint64_t hash;
- int err;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
- if (err)
- return (err);
- ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- if (!zap->zap_ismicro) {
- err = fzap_update(zap, name,
- integer_size, num_integers, val, tx);
- } else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- mzap_upgrade(zap, tx);
- err = fzap_update(zap, name,
- integer_size, num_integers, val, tx);
- } else {
- hash = zap_hash(zap, name);
- mze = mze_find(zap, name, hash);
- if (mze != NULL) {
- mze->mze_phys.mze_value = *intval;
- zap->zap_m.zap_phys->mz_chunk
- [mze->mze_chunkid].mze_value = *intval;
- } else {
- mzap_addent(zap, name, hash, *intval);
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-int
-zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
-{
- zap_t *zap;
- int err;
- mzap_ent_t *mze;
-
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
- if (err)
- return (err);
- if (!zap->zap_ismicro) {
- err = fzap_remove(zap, name, tx);
- } else {
- mze = mze_find(zap, name, zap_hash(zap, name));
- if (mze == NULL) {
- dprintf("fail: %s\n", name);
- err = ENOENT;
- } else {
- dprintf("success: %s\n", name);
- zap->zap_m.zap_num_entries--;
- bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
- sizeof (mzap_ent_phys_t));
- mze_remove(zap, mze);
- }
- }
- zap_unlockdir(zap);
- return (err);
-}
-
-
-/*
- * Routines for iterating over the attributes.
- */
-
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this. So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
- */
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
- uint64_t serialized)
-{
- zc->zc_objset = os;
- zc->zc_zap = NULL;
- zc->zc_leaf = NULL;
- zc->zc_zapobj = zapobj;
- if (serialized == -1ULL) {
- zc->zc_hash = -1ULL;
- zc->zc_cd = 0;
- } else {
- zc->zc_hash = serialized << (64-ZAP_HASHBITS);
- zc->zc_cd = serialized >> ZAP_HASHBITS;
- if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
- zc->zc_cd = 0;
- }
-}
-
-void
-zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
- zap_cursor_init_serialized(zc, os, zapobj, 0);
-}
-
-void
-zap_cursor_fini(zap_cursor_t *zc)
-{
- if (zc->zc_zap) {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- zap_unlockdir(zc->zc_zap);
- zc->zc_zap = NULL;
- }
- if (zc->zc_leaf) {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
- }
- zc->zc_objset = NULL;
-}
-
-uint64_t
-zap_cursor_serialize(zap_cursor_t *zc)
-{
- if (zc->zc_hash == -1ULL)
- return (-1ULL);
- ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
- ASSERT(zc->zc_cd < ZAP_MAXCD);
- return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
- ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
-}
-
-int
-zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
-{
- int err;
- avl_index_t idx;
- mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
-
- if (zc->zc_hash == -1ULL)
- return (ENOENT);
-
- if (zc->zc_zap == NULL) {
- err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
- RW_READER, TRUE, &zc->zc_zap);
- if (err)
- return (err);
- } else {
- rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- }
- if (!zc->zc_zap->zap_ismicro) {
- err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
- } else {
- err = ENOENT;
-
- mze_tofind.mze_hash = zc->zc_hash;
- mze_tofind.mze_phys.mze_cd = zc->zc_cd;
-
- mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
- ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
- &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
- sizeof (mze->mze_phys)));
- if (mze == NULL) {
- mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
- idx, AVL_AFTER);
- }
- if (mze) {
- za->za_integer_length = 8;
- za->za_num_integers = 1;
- za->za_first_integer = mze->mze_phys.mze_value;
- (void) strcpy(za->za_name, mze->mze_phys.mze_name);
- zc->zc_hash = mze->mze_hash;
- zc->zc_cd = mze->mze_phys.mze_cd;
- err = 0;
- } else {
- zc->zc_hash = -1ULL;
- }
- }
- rw_exit(&zc->zc_zap->zap_rwlock);
- return (err);
-}
-
-void
-zap_cursor_advance(zap_cursor_t *zc)
-{
- if (zc->zc_hash == -1ULL)
- return;
- zc->zc_cd++;
- if (zc->zc_cd >= ZAP_MAXCD) {
- zc->zc_cd = 0;
- zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
- if (zc->zc_hash == 0) /* EOF */
- zc->zc_hash = -1ULL;
- }
-}
-
-int
-zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
-{
- int err;
- zap_t *zap;
-
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
- if (err)
- return (err);
-
- bzero(zs, sizeof (zap_stats_t));
-
- if (zap->zap_ismicro) {
- zs->zs_blocksize = zap->zap_dbuf->db_size;
- zs->zs_num_entries = zap->zap_m.zap_num_entries;
- zs->zs_num_blocks = 1;
- } else {
- fzap_get_stats(zap, zs);
- }
- zap_unlockdir(zap);
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
deleted file mode 100644
index 0988190..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs.conf
+++ /dev/null
@@ -1,28 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-# ident "%Z%%M% %I% %E% SMI"
-#
-name="zfs" parent="pseudo";
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
deleted file mode 100644
index dd94618..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ /dev/null
@@ -1,1608 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/sdt.h>
-#include <sys/fs/zfs.h>
-#include <sys/policy.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <acl/acl_common.h>
-
-#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
-#define DENY ACE_ACCESS_DENIED_ACE_TYPE
-
-#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
-#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
- ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
-#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
-#define WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
- ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
- ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
-
-#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
- ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
-
-#define SECURE_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
-
-#define OGE_PAD 6 /* traditional owner/group/everyone ACES */
-
-static int zfs_ace_can_use(znode_t *zp, ace_t *);
-
-static zfs_acl_t *
-zfs_acl_alloc(int slots)
-{
- zfs_acl_t *aclp;
-
- aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
- if (slots != 0) {
- aclp->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
- aclp->z_acl_count = 0;
- aclp->z_state = ACL_DATA_ALLOCED;
- } else {
- aclp->z_state = 0;
- }
- aclp->z_slots = slots;
- return (aclp);
-}
-
-void
-zfs_acl_free(zfs_acl_t *aclp)
-{
- if (aclp->z_state == ACL_DATA_ALLOCED) {
- kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
- }
- kmem_free(aclp, sizeof (zfs_acl_t));
-}
-
-static uint32_t
-zfs_v4_to_unix(uint32_t access_mask)
-{
- uint32_t new_mask = 0;
-
- /*
- * This is used for mapping v4 permissions into permissions
- * that can be passed to secpolicy_vnode_access()
- */
- if (access_mask & (ACE_READ_DATA | ACE_LIST_DIRECTORY |
- ACE_READ_ATTRIBUTES | ACE_READ_ACL))
- new_mask |= S_IROTH;
- if (access_mask & (ACE_WRITE_DATA | ACE_APPEND_DATA |
- ACE_WRITE_ATTRIBUTES | ACE_ADD_FILE | ACE_WRITE_NAMED_ATTRS))
- new_mask |= S_IWOTH;
- if (access_mask & (ACE_EXECUTE | ACE_READ_NAMED_ATTRS))
- new_mask |= S_IXOTH;
-
- return (new_mask);
-}
-
-/*
- * Convert unix access mask to v4 access mask
- */
-static uint32_t
-zfs_unix_to_v4(uint32_t access_mask)
-{
- uint32_t new_mask = 0;
-
- if (access_mask & 01)
- new_mask |= (ACE_EXECUTE);
- if (access_mask & 02) {
- new_mask |= (ACE_WRITE_DATA);
- } if (access_mask & 04) {
- new_mask |= ACE_READ_DATA;
- }
- return (new_mask);
-}
-
-static void
-zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
- uid_t uid, int entry_type)
-{
- zacep->a_access_mask = access_mask;
- zacep->a_type = access_type;
- zacep->a_who = uid;
- zacep->a_flags = entry_type;
-}
-
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
-{
- int i;
- int entry_type;
- mode_t mode = (zp->z_phys->zp_mode &
- (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
- mode_t seen = 0;
- ace_t *acep;
-
- for (i = 0, acep = aclp->z_acl;
- i != aclp->z_acl_count; i++, acep++) {
- entry_type = (acep->a_flags & ACE_TYPE_FLAGS);
- if (entry_type == ACE_OWNER) {
- if ((acep->a_access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRUSR))) {
- seen |= S_IRUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWUSR))) {
- seen |= S_IWUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if ((acep->a_access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXUSR))) {
- seen |= S_IXUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- } else if (entry_type == OWNING_GROUP) {
- if ((acep->a_access_mask & ACE_READ_DATA) &&
- (!(seen & S_IRGRP))) {
- seen |= S_IRGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA) &&
- (!(seen & S_IWGRP))) {
- seen |= S_IWGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if ((acep->a_access_mask & ACE_EXECUTE) &&
- (!(seen & S_IXGRP))) {
- seen |= S_IXGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- } else if (entry_type == ACE_EVERYONE) {
- if ((acep->a_access_mask & ACE_READ_DATA)) {
- if (!(seen & S_IRUSR)) {
- seen |= S_IRUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IRUSR;
- }
- }
- if (!(seen & S_IRGRP)) {
- seen |= S_IRGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IRGRP;
- }
- }
- if (!(seen & S_IROTH)) {
- seen |= S_IROTH;
- if (acep->a_type == ALLOW) {
- mode |= S_IROTH;
- }
- }
- }
- if ((acep->a_access_mask & ACE_WRITE_DATA)) {
- if (!(seen & S_IWUSR)) {
- seen |= S_IWUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IWUSR;
- }
- }
- if (!(seen & S_IWGRP)) {
- seen |= S_IWGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IWGRP;
- }
- }
- if (!(seen & S_IWOTH)) {
- seen |= S_IWOTH;
- if (acep->a_type == ALLOW) {
- mode |= S_IWOTH;
- }
- }
- }
- if ((acep->a_access_mask & ACE_EXECUTE)) {
- if (!(seen & S_IXUSR)) {
- seen |= S_IXUSR;
- if (acep->a_type == ALLOW) {
- mode |= S_IXUSR;
- }
- }
- if (!(seen & S_IXGRP)) {
- seen |= S_IXGRP;
- if (acep->a_type == ALLOW) {
- mode |= S_IXGRP;
- }
- }
- if (!(seen & S_IXOTH)) {
- seen |= S_IXOTH;
- if (acep->a_type == ALLOW) {
- mode |= S_IXOTH;
- }
- }
- }
- }
- }
- return (mode);
-}
-
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp)
-{
- zfs_acl_t *aclp;
-
- aclp = zfs_acl_alloc(0);
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- aclp->z_acl = &zp->z_phys->zp_acl.z_ace_data[0];
-
- return (aclp);
-}
-
-/*
- * Read an external acl object.
- */
-static int
-zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp)
-{
- uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
- zfs_acl_t *aclp;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
- *aclpp = zfs_acl_node_read_internal(zp);
- return (0);
- }
-
- aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
-
- error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
- ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
- if (error != 0) {
- zfs_acl_free(aclp);
- return (error);
- }
-
- aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
-
- *aclpp = aclp;
- return (0);
-}
-
-static boolean_t
-zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
-{
- ace_t *acep;
- int i;
-
- *inherit = 0;
-
- if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
- return (B_FALSE);
- }
-
- for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
-
- /*
- * first check type of entry
- */
-
- switch (acep->a_flags & ACE_TYPE_FLAGS) {
- case ACE_OWNER:
- acep->a_who = -1;
- break;
- case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
- case ACE_IDENTIFIER_GROUP:
- if (acep->a_flags & ACE_GROUP) {
- acep->a_who = -1;
- }
- break;
- case ACE_EVERYONE:
- acep->a_who = -1;
- break;
- }
-
- /*
- * next check inheritance level flags
- */
-
- if (acep->a_type != ALLOW && acep->a_type != DENY)
- return (B_FALSE);
-
- /*
- * Only directories should have inheritance flags.
- */
- if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
- ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
- return (B_FALSE);
- }
-
- if (acep->a_flags &
- (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
- *inherit = 1;
-
- if (acep->a_flags &
- (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
- if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) == 0) {
- return (B_FALSE);
- }
- }
- }
-
- return (B_TRUE);
-}
-/*
- * common code for setting acl's.
- *
- * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
- * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
- * already checked the acl and knows whether to inherit.
- */
-int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
-{
- int inherit = 0;
- int error;
- znode_phys_t *zphys = zp->z_phys;
- zfs_znode_acl_t *zacl = &zphys->zp_acl;
- uint32_t acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t aoid = zphys->zp_acl.z_acl_extern_obj;
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
-
- if (ihp)
- inherit = *ihp; /* already determined by caller */
- else if (!zfs_acl_valid(zp, aclp->z_acl,
- aclp->z_acl_count, &inherit)) {
- return (EINVAL);
- }
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- /*
- * Will ACL fit internally?
- */
- if (aclp->z_acl_count > ACE_SLOT_CNT) {
- if (aoid == 0) {
- aoid = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
- } else {
- (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
- acl_phys_size, 0, tx);
- }
- zphys->zp_acl.z_acl_extern_obj = aoid;
- zphys->zp_acl.z_acl_count = aclp->z_acl_count;
- dmu_write(zfsvfs->z_os, aoid, 0,
- acl_phys_size, aclp->z_acl, tx);
- } else {
- /*
- * Migrating back embedded?
- */
- if (zphys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- if (error)
- return (error);
- zphys->zp_acl.z_acl_extern_obj = 0;
- }
- bcopy(aclp->z_acl, zacl->z_ace_data,
- aclp->z_acl_count * sizeof (ace_t));
- zacl->z_acl_count = aclp->z_acl_count;
- }
-
- zp->z_phys->zp_flags &= ~(ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE);
- if (inherit) {
- zp->z_phys->zp_flags |= ZFS_INHERIT_ACE;
- } else if (ace_trivial(zacl->z_ace_data, zacl->z_acl_count) == 0) {
- zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
- }
-
- zphys->zp_mode = zfs_mode_compute(zp, aclp);
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
-
- return (0);
-}
-
-/*
- * Create space for slots_needed ACEs to be append
- * to aclp.
- */
-static void
-zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
-{
- ace_t *newacep;
- ace_t *oldaclp;
- int slot_cnt;
- int slots_left = aclp->z_slots - aclp->z_acl_count;
-
- if (aclp->z_state == ACL_DATA_ALLOCED)
- ASSERT(aclp->z_slots >= aclp->z_acl_count);
- if (slots_left < slots_needed || aclp->z_state != ACL_DATA_ALLOCED) {
- slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
- newacep = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
- bcopy(aclp->z_acl, newacep,
- ZFS_ACL_SIZE(aclp->z_acl_count));
- oldaclp = aclp->z_acl;
- if (aclp->z_state == ACL_DATA_ALLOCED)
- kmem_free(oldaclp, ZFS_ACL_SIZE(aclp->z_slots));
- aclp->z_acl = newacep;
- aclp->z_slots = slot_cnt;
- aclp->z_state = ACL_DATA_ALLOCED;
- }
-}
-
-/*
- * Remove "slot" ACE from aclp
- */
-static void
-zfs_ace_remove(zfs_acl_t *aclp, int slot)
-{
- if (aclp->z_acl_count > 1) {
- (void) memmove(&aclp->z_acl[slot],
- &aclp->z_acl[slot +1], sizeof (ace_t) *
- (--aclp->z_acl_count - slot));
- } else
- aclp->z_acl_count--;
-}
-
-/*
- * Update access mask for prepended ACE
- *
- * This applies the "groupmask" value for aclmode property.
- */
-static void
-zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
-{
-
- int rmask, wmask, xmask;
- int user_ace;
-
- user_ace = (!(acep->a_flags &
- (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
-
- if (user_ace && (acep->a_who == owner)) {
- rmask = S_IRUSR;
- wmask = S_IWUSR;
- xmask = S_IXUSR;
- } else {
- rmask = S_IRGRP;
- wmask = S_IWGRP;
- xmask = S_IXGRP;
- }
-
- if (origacep->a_access_mask & ACE_READ_DATA) {
- if (mode & rmask)
- acep->a_access_mask &= ~ACE_READ_DATA;
- else
- acep->a_access_mask |= ACE_READ_DATA;
- }
-
- if (origacep->a_access_mask & ACE_WRITE_DATA) {
- if (mode & wmask)
- acep->a_access_mask &= ~ACE_WRITE_DATA;
- else
- acep->a_access_mask |= ACE_WRITE_DATA;
- }
-
- if (origacep->a_access_mask & ACE_APPEND_DATA) {
- if (mode & wmask)
- acep->a_access_mask &= ~ACE_APPEND_DATA;
- else
- acep->a_access_mask |= ACE_APPEND_DATA;
- }
-
- if (origacep->a_access_mask & ACE_EXECUTE) {
- if (mode & xmask)
- acep->a_access_mask &= ~ACE_EXECUTE;
- else
- acep->a_access_mask |= ACE_EXECUTE;
- }
-}
-
-/*
- * Apply mode to canonical six ACEs.
- */
-static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
- int cnt;
- ace_t *acep;
-
- cnt = aclp->z_acl_count -1;
- acep = aclp->z_acl;
-
- /*
- * Fixup final ACEs to match the mode
- */
-
- ASSERT(cnt >= 5);
- adjust_ace_pair(&acep[cnt - 1], mode); /* everyone@ */
- adjust_ace_pair(&acep[cnt - 3], (mode & 0070) >> 3); /* group@ */
- adjust_ace_pair(&acep[cnt - 5], (mode & 0700) >> 6); /* owner@ */
-}
-
-
-static int
-zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
-{
- return (acep->a_access_mask == mask && acep->a_type == allow_deny &&
- ((acep->a_flags & ACE_TYPE_FLAGS) == type));
-}
-
-/*
- * Can prepended ACE be reused?
- */
-static int
-zfs_reuse_deny(ace_t *acep, int i)
-{
- int okay_masks;
-
- if (i < 1)
- return (B_FALSE);
-
- if (acep[i-1].a_type != DENY)
- return (B_FALSE);
-
- if (acep[i-1].a_flags != (acep[i].a_flags & ACE_IDENTIFIER_GROUP))
- return (B_FALSE);
-
- okay_masks = (acep[i].a_access_mask & OKAY_MASK_BITS);
-
- if (acep[i-1].a_access_mask & ~okay_masks)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-/*
- * Create space to prepend an ACE
- */
-static void
-zfs_acl_prepend(zfs_acl_t *aclp, int i)
-{
- ace_t *oldaclp = NULL;
- ace_t *to, *from;
- int slots_left = aclp->z_slots - aclp->z_acl_count;
- int oldslots;
- int need_free = 0;
-
- if (aclp->z_state == ACL_DATA_ALLOCED)
- ASSERT(aclp->z_slots >= aclp->z_acl_count);
-
- if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
-
- to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
- OGE_PAD), KM_SLEEP);
- if (aclp->z_state == ACL_DATA_ALLOCED)
- need_free++;
- from = aclp->z_acl;
- oldaclp = aclp->z_acl;
- (void) memmove(to, from,
- sizeof (ace_t) * aclp->z_acl_count);
- aclp->z_state = ACL_DATA_ALLOCED;
- } else {
- from = aclp->z_acl;
- to = aclp->z_acl;
- }
-
-
- (void) memmove(&to[i + 1], &from[i],
- sizeof (ace_t) * (aclp->z_acl_count - i));
-
- if (oldaclp) {
- aclp->z_acl = to;
- oldslots = aclp->z_slots;
- aclp->z_slots = aclp->z_acl_count + OGE_PAD;
- if (need_free)
- kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
- }
-
-}
-
-/*
- * Prepend deny ACE
- */
-static void
-zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i,
- mode_t mode)
-{
- ace_t *acep;
-
- zfs_acl_prepend(aclp, i);
-
- acep = aclp->z_acl;
- zfs_set_ace(&acep[i], 0, DENY, acep[i + 1].a_who,
- (acep[i + 1].a_flags & ACE_TYPE_FLAGS));
- zfs_acl_prepend_fixup(&acep[i], &acep[i+1], mode, zp->z_phys->zp_uid);
- aclp->z_acl_count++;
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- */
-static void
-zfs_acl_split_ace(zfs_acl_t *aclp, int i)
-{
- ace_t *acep = aclp->z_acl;
-
- zfs_acl_prepend(aclp, i);
- acep = aclp->z_acl;
- acep[i] = acep[i + 1];
- acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
- acep[i + 1].a_flags &= ~ALL_INHERIT;
- aclp->z_acl_count++;
-}
-
-/*
- * Are ACES started at index i, the canonical six ACES?
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp, int i)
-{
- ace_t *acep = aclp->z_acl;
-
- if ((zfs_acl_ace_match(&acep[i],
- DENY, ACE_OWNER, 0) &&
- zfs_acl_ace_match(&acep[i + 1], ALLOW, ACE_OWNER,
- OWNER_ALLOW_MASK) && zfs_acl_ace_match(&acep[i + 2],
- DENY, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 3],
- ALLOW, OWNING_GROUP, 0) && zfs_acl_ace_match(&acep[i + 4],
- DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
- zfs_acl_ace_match(&acep[i + 5], ALLOW, ACE_EVERYONE,
- EVERYONE_ALLOW_MASK))) {
- return (1);
- } else {
- return (0);
- }
-}
-
-/*
- * Apply step 1g, to group entries
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner. If so then limit
- * group permissions, based on what extra permissions
- * group has.
- */
-static void
-zfs_fixup_group_entries(ace_t *acep, mode_t mode)
-{
- mode_t extramode = (mode >> 3) & 07;
- mode_t ownermode = (mode >> 6);
-
- if (acep[0].a_flags & ACE_IDENTIFIER_GROUP) {
-
- extramode &= ~ownermode;
-
- if (extramode) {
- if (extramode & 04) {
- acep[0].a_access_mask &= ~ACE_READ_DATA;
- acep[1].a_access_mask &= ~ACE_READ_DATA;
- }
- if (extramode & 02) {
- acep[0].a_access_mask &=
- ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- acep[1].a_access_mask &=
- ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- }
- if (extramode & 01) {
- acep[0].a_access_mask &= ~ACE_EXECUTE;
- acep[1].a_access_mask &= ~ACE_EXECUTE;
- }
- }
- }
-}
-
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- */
-static int
-zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
- dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ace_t *acep;
- int i;
- int error;
- int entry_type;
- int reuse_deny;
- int need_canonical_six = 1;
- int inherit = 0;
- int iflags;
-
- ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- ASSERT(MUTEX_HELD(&zp->z_lock));
-
- i = 0;
- while (i < aclp->z_acl_count) {
- acep = aclp->z_acl;
- entry_type = (acep[i].a_flags & ACE_TYPE_FLAGS);
- iflags = (acep[i].a_flags & ALL_INHERIT);
-
- if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
- (iflags & ACE_INHERIT_ONLY_ACE)) {
- i++;
- if (iflags)
- inherit = 1;
- continue;
- }
-
-
- if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) {
- zfs_ace_remove(aclp, i);
- continue;
- }
-
- /*
- * Need to split ace into two?
- */
- if ((iflags & (ACE_FILE_INHERIT_ACE|
- ACE_DIRECTORY_INHERIT_ACE)) &&
- (!(iflags & ACE_INHERIT_ONLY_ACE))) {
- zfs_acl_split_ace(aclp, i);
- i++;
- inherit = 1;
- continue;
- }
-
- if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
- (entry_type == OWNING_GROUP)) {
- acep[i].a_access_mask &= ~OGE_CLEAR;
- i++;
- continue;
-
- } else {
- if (acep[i].a_type == ALLOW) {
-
- /*
- * Check preceding ACE if any, to see
- * if we need to prepend a DENY ACE.
- * This is only applicable when the acl_mode
- * property == groupmask.
- */
- if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
- reuse_deny = zfs_reuse_deny(acep, i);
-
- if (reuse_deny == B_FALSE) {
- zfs_acl_prepend_deny(zp, aclp,
- i, mode);
- i++;
- acep = aclp->z_acl;
- } else {
- zfs_acl_prepend_fixup(
- &acep[i - 1],
- &acep[i], mode,
- zp->z_phys->zp_uid);
- }
- zfs_fixup_group_entries(&acep[i - 1],
- mode);
- }
- }
- i++;
- }
- }
-
- /*
- * Check out last six aces, if we have six.
- */
-
- if (aclp->z_acl_count >= 6) {
- i = aclp->z_acl_count - 6;
-
- if (zfs_have_canonical_six(aclp, i)) {
- need_canonical_six = 0;
- }
- }
-
- if (need_canonical_six) {
-
- zfs_acl_append(aclp, 6);
- i = aclp->z_acl_count;
- acep = aclp->z_acl;
- zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
- zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
- zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
- zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
- zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
- DENY, -1, ACE_EVERYONE);
- zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
- ALLOW, -1, ACE_EVERYONE);
- aclp->z_acl_count += 6;
- }
-
- zfs_acl_fixup_canonical_six(aclp, mode);
-
- zp->z_phys->zp_mode = mode;
- error = zfs_aclset_common(zp, aclp, tx, &inherit);
- return (error);
-}
-
-
-int
-zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
-{
- zfs_acl_t *aclp = NULL;
- int error;
-
- ASSERT(MUTEX_HELD(&zp->z_lock));
- mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, &aclp);
- if (error == 0)
- error = zfs_acl_chmod(zp, mode, aclp, tx);
- mutex_exit(&zp->z_acl_lock);
- if (aclp)
- zfs_acl_free(aclp);
- return (error);
-}
-
-/*
- * strip off write_owner and write_acl
- */
-static void
-zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
-{
- if ((zfsvfs->z_acl_inherit == ZFS_ACL_SECURE) &&
- (acep->a_type == ALLOW))
- acep->a_access_mask &= ~SECURE_CLEAR;
-}
-
-/*
- * inherit inheritable ACEs from parent
- */
-static zfs_acl_t *
-zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ace_t *pacep;
- ace_t *acep;
- int ace_cnt = 0;
- int pace_cnt;
- int i, j;
- zfs_acl_t *aclp = NULL;
-
- i = j = 0;
- pace_cnt = paclp->z_acl_count;
- pacep = paclp->z_acl;
- if (zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
- for (i = 0; i != pace_cnt; i++) {
-
- if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
- pacep[i].a_type == ALLOW)
- continue;
-
- if (zfs_ace_can_use(zp, &pacep[i])) {
- ace_cnt++;
- if (!(pacep[i].a_flags &
- ACE_NO_PROPAGATE_INHERIT_ACE))
- ace_cnt++;
- }
- }
- }
-
- aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
- if (ace_cnt && zfsvfs->z_acl_inherit != ZFS_ACL_DISCARD) {
- acep = aclp->z_acl;
- pacep = paclp->z_acl;
- for (i = 0; i != pace_cnt; i++) {
-
- if (zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW &&
- pacep[i].a_type == ALLOW)
- continue;
-
- if (zfs_ace_can_use(zp, &pacep[i])) {
-
- /*
- * Now create entry for inherited ace
- */
-
- acep[j] = pacep[i];
-
- /*
- * When AUDIT/ALARM a_types are supported
- * they should be inherited here.
- */
-
- if ((pacep[i].a_flags &
- ACE_NO_PROPAGATE_INHERIT_ACE) ||
- (ZTOV(zp)->v_type != VDIR)) {
- acep[j].a_flags &= ~ALL_INHERIT;
- zfs_securemode_update(zfsvfs, &acep[j]);
- j++;
- continue;
- }
-
- ASSERT(ZTOV(zp)->v_type == VDIR);
-
- /*
- * If we are inheriting an ACE targeted for
- * only files, then make sure inherit_only
- * is on for future propagation.
- */
- if ((pacep[i].a_flags & (ACE_FILE_INHERIT_ACE |
- ACE_DIRECTORY_INHERIT_ACE)) !=
- ACE_FILE_INHERIT_ACE) {
- j++;
- acep[j] = acep[j-1];
- acep[j-1].a_flags |=
- ACE_INHERIT_ONLY_ACE;
- acep[j].a_flags &= ~ALL_INHERIT;
- } else {
- acep[j].a_flags |= ACE_INHERIT_ONLY_ACE;
- }
- zfs_securemode_update(zfsvfs, &acep[j]);
- j++;
- }
- }
- }
- aclp->z_acl_count = j;
- ASSERT(aclp->z_slots >= aclp->z_acl_count);
-
- return (aclp);
-}
-
-/*
- * Create file system object initial permissions
- * including inheritable ACEs.
- */
-void
-zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
- vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
-{
- uint64_t mode;
- uid_t uid;
- gid_t gid;
- int error;
- int pull_down;
- zfs_acl_t *aclp, *paclp;
-
- mode = MAKEIMODE(vap->va_type, vap->va_mode);
-
- /*
- * Determine uid and gid.
- */
- if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
- ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
- uid = vap->va_uid;
- gid = vap->va_gid;
- } else {
- uid = crgetuid(cr);
- if ((vap->va_mask & AT_GID) &&
- ((vap->va_gid == parent->z_phys->zp_gid) ||
- groupmember(vap->va_gid, cr) ||
- secpolicy_vnode_create_gid(cr) == 0))
- gid = vap->va_gid;
- else
-#ifdef __FreeBSD__
- gid = parent->z_phys->zp_gid;
-#else
- gid = (parent->z_phys->zp_mode & S_ISGID) ?
- parent->z_phys->zp_gid : crgetgid(cr);
-#endif
- }
-
- /*
- * If we're creating a directory, and the parent directory has the
- * set-GID bit set, set in on the new directory.
- * Otherwise, if the user is neither privileged nor a member of the
- * file's new group, clear the file's set-GID bit.
- */
-
- if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
- mode |= S_ISGID;
- else {
- if ((mode & S_ISGID) &&
- secpolicy_vnode_setids_setgids(cr, gid) != 0)
- mode &= ~S_ISGID;
- }
-
- zp->z_phys->zp_uid = uid;
- zp->z_phys->zp_gid = gid;
- zp->z_phys->zp_mode = mode;
-
- mutex_enter(&parent->z_lock);
- pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
- if (pull_down) {
- mutex_enter(&parent->z_acl_lock);
- VERIFY(0 == zfs_acl_node_read(parent, &paclp));
- mutex_exit(&parent->z_acl_lock);
- aclp = zfs_acl_inherit(zp, paclp);
- zfs_acl_free(paclp);
- } else {
- aclp = zfs_acl_alloc(6);
- }
- mutex_exit(&parent->z_lock);
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_chmod(zp, mode, aclp, tx);
- mutex_exit(&zp->z_lock);
- mutex_exit(&zp->z_acl_lock);
- ASSERT3U(error, ==, 0);
- zfs_acl_free(aclp);
-}
-
-/*
- * Should ACE be inherited?
- */
-static int
-zfs_ace_can_use(znode_t *zp, ace_t *acep)
-{
- int vtype = ZTOV(zp)->v_type;
-
- int iflags = (acep->a_flags & 0xf);
-
- if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
- return (1);
- else if (iflags & ACE_FILE_INHERIT_ACE)
- return (!((vtype == VDIR) &&
- (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
- return (0);
-}
-
-#ifdef TODO
-/*
- * Retrieve a files ACL
- */
-int
-zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
-{
- zfs_acl_t *aclp;
- ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
- int error;
-
- if (error = zfs_zaccess(zp, ACE_READ_ACL, cr)) {
- /*
- * If owner of file then allow reading of the
- * ACL.
- */
- if (crgetuid(cr) != zp->z_phys->zp_uid)
- return (error);
- }
-
- if (mask == 0)
- return (ENOSYS);
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, &aclp);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
-
- if (mask & VSA_ACECNT) {
- vsecp->vsa_aclcnt = aclp->z_acl_count;
- }
-
- if (mask & VSA_ACE) {
- vsecp->vsa_aclentp = kmem_alloc(aclp->z_acl_count *
- sizeof (ace_t), KM_SLEEP);
- bcopy(aclp->z_acl, vsecp->vsa_aclentp,
- aclp->z_acl_count * sizeof (ace_t));
- }
-
- mutex_exit(&zp->z_acl_lock);
-
- zfs_acl_free(aclp);
-
- return (0);
-}
-#endif /* TODO */
-
-#ifdef TODO
-/*
- * Set a files ACL
- */
-int
-zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- ace_t *acep = vsecp->vsa_aclentp;
- int aclcnt = vsecp->vsa_aclcnt;
- ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
- dmu_tx_t *tx;
- int error;
- int inherit;
- zfs_acl_t *aclp;
-
- if (mask == 0)
- return (EINVAL);
-
- if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
- return (EINVAL);
-top:
- error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
- if (error == EACCES || error == ACCESS_UNDETERMINED) {
- if ((error = secpolicy_vnode_setdac(cr,
- zp->z_phys->zp_uid)) != 0) {
- return (error);
- }
- } else if (error) {
- return (error == EROFS ? error : EPERM);
- }
-
- mutex_enter(&zp->z_lock);
- mutex_enter(&zp->z_acl_lock);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
- 0, ZFS_ACL_SIZE(aclcnt));
- } else if (aclcnt > ACE_SLOT_CNT) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
- }
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- mutex_exit(&zp->z_acl_lock);
- mutex_exit(&zp->z_lock);
-
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- return (error);
- }
-
- aclp = zfs_acl_alloc(aclcnt);
- bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
- aclp->z_acl_count = aclcnt;
- error = zfs_aclset_common(zp, aclp, tx, &inherit);
- ASSERT(error == 0);
-
- zfs_acl_free(aclp);
- zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
- dmu_tx_commit(tx);
-done:
- mutex_exit(&zp->z_acl_lock);
- mutex_exit(&zp->z_lock);
-
- return (error);
-}
-#endif /* TODO */
-
-static int
-zfs_ace_access(ace_t *zacep, int *working_mode)
-{
- if (*working_mode == 0) {
- return (0);
- }
-
- if (zacep->a_access_mask & *working_mode) {
- if (zacep->a_type == ALLOW) {
- *working_mode &=
- ~(*working_mode & zacep->a_access_mask);
- if (*working_mode == 0)
- return (0);
- } else if (zacep->a_type == DENY) {
- return (EACCES);
- }
- }
-
- /*
- * haven't been specifcally denied at this point
- * so return UNDETERMINED.
- */
-
- return (ACCESS_UNDETERMINED);
-}
-
-
-static int
-zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
-{
- zfs_acl_t *aclp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ace_t *zacep;
- gid_t gid;
- int cnt;
- int i;
- int error;
- int access_deny = ACCESS_UNDETERMINED;
- uint_t entry_type;
- uid_t uid = crgetuid(cr);
-
- if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
- *working_mode = 0;
- return (0);
- }
-
- *working_mode = v4_mode;
-
- if ((v4_mode & WRITE_MASK) &&
- (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
- (!IS_DEVVP(ZTOV(zp)))) {
- return (EROFS);
- }
-
- mutex_enter(&zp->z_acl_lock);
-
- error = zfs_acl_node_read(zp, &aclp);
- if (error != 0) {
- mutex_exit(&zp->z_acl_lock);
- return (error);
- }
-
-
- zacep = aclp->z_acl;
- cnt = aclp->z_acl_count;
-
- for (i = 0; i != cnt; i++) {
-
- DTRACE_PROBE2(zfs__access__common,
- ace_t *, &zacep[i], int, *working_mode);
-
- if (zacep[i].a_flags & ACE_INHERIT_ONLY_ACE)
- continue;
-
- entry_type = (zacep[i].a_flags & ACE_TYPE_FLAGS);
- switch (entry_type) {
- case ACE_OWNER:
- if (uid == zp->z_phys->zp_uid) {
- access_deny = zfs_ace_access(&zacep[i],
- working_mode);
- }
- break;
- case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
- case ACE_IDENTIFIER_GROUP:
- /*
- * Owning group gid is in znode not ACL
- */
- if (entry_type == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
- gid = zp->z_phys->zp_gid;
- else
- gid = zacep[i].a_who;
-
- if (groupmember(gid, cr)) {
- access_deny = zfs_ace_access(&zacep[i],
- working_mode);
- }
- break;
- case ACE_EVERYONE:
- access_deny = zfs_ace_access(&zacep[i], working_mode);
- break;
-
- /* USER Entry */
- default:
- if (entry_type == 0) {
- if (uid == zacep[i].a_who) {
- access_deny = zfs_ace_access(&zacep[i],
- working_mode);
- }
- break;
- }
- zfs_acl_free(aclp);
- mutex_exit(&zp->z_acl_lock);
- return (EIO);
- }
-
- if (access_deny != ACCESS_UNDETERMINED)
- break;
- }
-
- mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
-
- return (access_deny);
-}
-
-
-/*
- * Determine whether Access should be granted/denied, invoking least
- * priv subsytem when a deny is determined.
- */
-int
-zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
-{
- int working_mode;
- int error;
- int is_attr;
- znode_t *xzp;
- znode_t *check_zp = zp;
-
- is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
- (ZTOV(zp)->v_type == VDIR));
-
- /*
- * If attribute then validate against base file
- */
- if (is_attr) {
- if ((error = zfs_zget(zp->z_zfsvfs,
- zp->z_phys->zp_parent, &xzp)) != 0) {
- return (error);
- }
- check_zp = xzp;
- /*
- * fixup mode to map to xattr perms
- */
-
- if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
- mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
- mode |= ACE_WRITE_NAMED_ATTRS;
- }
-
- if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
- mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
- mode |= ACE_READ_NAMED_ATTRS;
- }
- }
-
- error = zfs_zaccess_common(check_zp, mode, &working_mode, cr);
-
- if (error == EROFS) {
- if (is_attr)
- VN_RELE(ZTOV(xzp));
- return (error);
- }
-
- if (error || working_mode) {
- working_mode = (zfs_v4_to_unix(working_mode) << 6);
- error = secpolicy_vnode_access(cr, ZTOV(check_zp),
- check_zp->z_phys->zp_uid, working_mode);
- }
-
- if (is_attr)
- VN_RELE(ZTOV(xzp));
-
- return (error);
-}
-
-/*
- * Special zaccess function to check for special nfsv4 perm.
- * doesn't call secpolicy_vnode_access() for failure, since that
- * would probably be the wrong policy function to call.
- * instead its up to the caller to handle that situation.
- */
-
-int
-zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
-{
- int working_mode = 0;
- return (zfs_zaccess_common(zp, mode, &working_mode, cr));
-}
-
-/*
- * Translate tradition unix VREAD/VWRITE/VEXEC mode into
- * native ACL format and call zfs_zaccess()
- */
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
-{
- int v4_mode = zfs_unix_to_v4(mode >> 6);
-
- return (zfs_zaccess(zp, v4_mode, cr));
-}
-
-static int
-zfs_delete_final_check(znode_t *zp, znode_t *dzp, cred_t *cr)
-{
- int error;
-
- error = secpolicy_vnode_access(cr, ZTOV(zp),
- dzp->z_phys->zp_uid, S_IWRITE|S_IEXEC);
-
- if (error == 0)
- error = zfs_sticky_remove_access(dzp, zp, cr);
-
- return (error);
-}
-
-/*
- * Determine whether Access should be granted/deny, without
- * consulting least priv subsystem.
- *
- *
- * The following chart is the recommended NFSv4 enforcement for
- * ability to delete an object.
- *
- * -------------------------------------------------------
- * | Parent Dir | Target Object Permissions |
- * | permissions | |
- * -------------------------------------------------------
- * | | ACL Allows | ACL Denies| Delete |
- * | | Delete | Delete | unspecified|
- * -------------------------------------------------------
- * | ACL Allows | Permit | Permit | Permit |
- * | DELETE_CHILD | |
- * -------------------------------------------------------
- * | ACL Denies | Permit | Deny | Deny |
- * | DELETE_CHILD | | | |
- * -------------------------------------------------------
- * | ACL specifies | | | |
- * | only allow | Permit | Permit | Permit |
- * | write and | | | |
- * | execute | | | |
- * -------------------------------------------------------
- * | ACL denies | | | |
- * | write and | Permit | Deny | Deny |
- * | execute | | | |
- * -------------------------------------------------------
- * ^
- * |
- * No search privilege, can't even look up file?
- *
- */
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
- int dzp_working_mode = 0;
- int zp_working_mode = 0;
- int dzp_error, zp_error;
-
- /*
- * Arghh, this check is going to require a couple of questions
- * to be asked. We want specific DELETE permissions to
- * take precedence over WRITE/EXECUTE. We don't
- * want an ACL such as this to mess us up.
- * user:joe:write_data:deny,user:joe:delete:allow
- *
- * However, deny permissions may ultimately be overridden
- * by secpolicy_vnode_access().
- */
-
- dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
- &dzp_working_mode, cr);
- zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
-
- if (dzp_error == EROFS || zp_error == EROFS)
- return (dzp_error);
-
- /*
- * First check the first row.
- * We only need to see if parent Allows delete_child
- */
- if ((dzp_working_mode & ACE_DELETE_CHILD) == 0)
- return (0);
-
- /*
- * Second row
- * we already have the necessary information in
- * zp_working_mode, zp_error and dzp_error.
- */
-
- if ((zp_working_mode & ACE_DELETE) == 0)
- return (0);
-
- /*
- * Now zp_error should either be EACCES which indicates
- * a "deny" delete entry or ACCESS_UNDETERMINED if the "delete"
- * entry exists on the target.
- *
- * dzp_error should be either EACCES which indicates a "deny"
- * entry for delete_child or ACCESS_UNDETERMINED if no delete_child
- * entry exists. If value is EACCES then we are done
- * and zfs_delete_final_check() will make the final decision
- * regarding to allow the delete.
- */
-
- ASSERT(zp_error != 0 && dzp_error != 0);
- if (dzp_error == EACCES)
- return (zfs_delete_final_check(zp, dzp, cr));
-
- /*
- * Third Row
- * Only need to check for write/execute on parent
- */
-
- dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
- &dzp_working_mode, cr);
-
- if (dzp_error == EROFS)
- return (dzp_error);
-
- if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0)
- return (zfs_sticky_remove_access(dzp, zp, cr));
-
- /*
- * Fourth Row
- */
-
- if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) != 0) &&
- ((zp_working_mode & ACE_DELETE) == 0))
- return (zfs_sticky_remove_access(dzp, zp, cr));
-
- return (zfs_delete_final_check(zp, dzp, cr));
-}
-
-int
-zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
- znode_t *tzp, cred_t *cr)
-{
- int add_perm;
- int error;
-
- add_perm = (ZTOV(szp)->v_type == VDIR) ?
- ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
-
- /*
- * Rename permissions are combination of delete permission +
- * add file/subdir permission.
- */
-
- /*
- * first make sure we do the delete portion.
- *
- * If that succeeds then check for add_file/add_subdir permissions
- */
-
- if (error = zfs_zaccess_delete(sdzp, szp, cr))
- return (error);
-
- /*
- * If we have a tzp, see if we can delete it?
- */
- if (tzp) {
- if (error = zfs_zaccess_delete(tdzp, tzp, cr))
- return (error);
- }
-
- /*
- * Now check for add permissions
- */
- error = zfs_zaccess(tdzp, add_perm, cr);
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
deleted file mode 100644
index c8450d4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_byteswap.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_acl.h>
-
-void
-zfs_ace_byteswap(ace_t *ace, int ace_cnt)
-{
- int i;
-
- for (i = 0; i != ace_cnt; i++, ace++) {
- ace->a_who = BSWAP_32(ace->a_who);
- ace->a_access_mask = BSWAP_32(ace->a_access_mask);
- ace->a_flags = BSWAP_16(ace->a_flags);
- ace->a_type = BSWAP_16(ace->a_type);
- }
-}
-
-/* ARGSUSED */
-void
-zfs_acl_byteswap(void *buf, size_t size)
-{
- int cnt;
-
- /*
- * Arggh, since we don't know how many ACEs are in
- * the array, we have to swap the entire block
- */
-
- cnt = size / sizeof (ace_t);
-
- zfs_ace_byteswap((ace_t *)buf, cnt);
-}
-
-void
-zfs_znode_byteswap(void *buf, size_t size)
-{
- znode_phys_t *zp = buf;
-
- ASSERT(size >= sizeof (znode_phys_t));
-
- zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
- zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
- zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
- zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
- zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
- zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
- zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
- zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
- zp->zp_gen = BSWAP_64(zp->zp_gen);
- zp->zp_mode = BSWAP_64(zp->zp_mode);
- zp->zp_size = BSWAP_64(zp->zp_size);
- zp->zp_parent = BSWAP_64(zp->zp_parent);
- zp->zp_links = BSWAP_64(zp->zp_links);
- zp->zp_xattr = BSWAP_64(zp->zp_xattr);
- zp->zp_rdev = BSWAP_64(zp->zp_rdev);
- zp->zp_flags = BSWAP_64(zp->zp_flags);
- zp->zp_uid = BSWAP_64(zp->zp_uid);
- zp->zp_gid = BSWAP_64(zp->zp_gid);
- zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
- zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
- zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
- zp->zp_pad[3] = BSWAP_64(zp->zp_pad[3]);
-
- zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
- zp->zp_acl.z_acl_count = BSWAP_32(zp->zp_acl.z_acl_count);
- zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
- zp->zp_acl.z_acl_pad = BSWAP_16(zp->zp_acl.z_acl_pad);
- zfs_ace_byteswap(&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
deleted file mode 100644
index 0c2fb02..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ /dev/null
@@ -1,1119 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZFS control directory (a.k.a. ".zfs")
- *
- * This directory provides a common location for all ZFS meta-objects.
- * Currently, this is only the 'snapshot' directory, but this may expand in the
- * future. The elements are built using the GFS primitives, as the hierarchy
- * does not actually exist on disk.
- *
- * For 'snapshot', we don't want to have all snapshots always mounted, because
- * this would take up a huge amount of space in /etc/mnttab. We have three
- * types of objects:
- *
- * ctldir ------> snapshotdir -------> snapshot
- * |
- * |
- * V
- * mounted fs
- *
- * The 'snapshot' node contains just enough information to lookup '..' and act
- * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
- * perform an automount of the underlying filesystem and return the
- * corresponding vnode.
- *
- * All mounts are handled automatically by the kernel, but unmounts are
- * (currently) handled from user land. The main reason is that there is no
- * reliable way to auto-unmount the filesystem when it's "no longer in use".
- * When the user unmounts a filesystem, we call zfsctl_unmount(), which
- * unmounts any snapshots within the snapshot directory.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/namei.h>
-#include <sys/gfs.h>
-#include <sys/stat.h>
-#include <sys/dmu.h>
-#include <sys/mount.h>
-
-typedef struct {
- char *se_name;
- vnode_t *se_root;
- avl_node_t se_node;
-} zfs_snapentry_t;
-
-static int
-snapentry_compare(const void *a, const void *b)
-{
- const zfs_snapentry_t *sa = a;
- const zfs_snapentry_t *sb = b;
- int ret = strcmp(sa->se_name, sb->se_name);
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
-}
-
-static struct vop_vector zfsctl_ops_root;
-static struct vop_vector zfsctl_ops_snapdir;
-static struct vop_vector zfsctl_ops_snapshot;
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-
-typedef struct zfsctl_node {
- gfs_dir_t zc_gfs_private;
- uint64_t zc_id;
- timestruc_t zc_cmtime; /* ctime and mtime, always the same */
-} zfsctl_node_t;
-
-typedef struct zfsctl_snapdir {
- zfsctl_node_t sd_node;
- kmutex_t sd_lock;
- avl_tree_t sd_snaps;
-} zfsctl_snapdir_t;
-
-/*
- * Root directory elements. We have only a single static entry, 'snapshot'.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
- { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
- { NULL }
-};
-
-/* include . and .. in the calculation */
-#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
- sizeof (gfs_dirent_t)) + 1)
-
-
-/*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories. This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
- */
-void
-zfsctl_init(void)
-{
-}
-
-void
-zfsctl_fini(void)
-{
-}
-
-/*
- * Return the inode number associated with the 'snapshot' directory.
- */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
-{
- ASSERT(index == 0);
- return (ZFSCTL_INO_SNAPDIR);
-}
-
-/*
- * Create the '.zfs' directory. This directory is cached as part of the VFS
- * structure. This results in a hold on the vfs_t. The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1. This reference
- * is removed when the ctldir is destroyed in the unmount.
- */
-void
-zfsctl_create(zfsvfs_t *zfsvfs)
-{
- vnode_t *vp, *rvp;
- zfsctl_node_t *zcp;
-
- ASSERT(zfsvfs->z_ctldir == NULL);
-
- vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
- &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
- zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = ZFSCTL_INO_ROOT;
-
- VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp, curthread) == 0);
- ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
- VN_URELE(rvp);
-
- /*
- * We're only faking the fact that we have a root of a filesystem for
- * the sake of the GFS interfaces. Undo the flag manipulation it did
- * for us.
- */
- vp->v_vflag &= ~VV_ROOT;
-
- zfsvfs->z_ctldir = vp;
-}
-
-/*
- * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
- */
-void
-zfsctl_destroy(zfsvfs_t *zfsvfs)
-{
- VN_RELE(zfsvfs->z_ctldir);
- zfsvfs->z_ctldir = NULL;
-}
-
-/*
- * Given a root znode, retrieve the associated .zfs directory.
- * Add a hold to the vnode and return it.
- */
-vnode_t *
-zfsctl_root(znode_t *zp)
-{
- ASSERT(zfs_has_ctldir(zp));
- VN_HOLD(zp->z_zfsvfs->z_ctldir);
- return (zp->z_zfsvfs->z_ctldir);
-}
-
-/*
- * Common open routine. Disallow any write access.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_open(struct vop_open_args *ap)
-{
- int flags = ap->a_mode;
-
- if (flags & FWRITE)
- return (EACCES);
-
- return (0);
-}
-
-/*
- * Common close routine. Nothing to do here.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_close(struct vop_close_args *ap)
-{
- return (0);
-}
-
-/*
- * Common access routine. Disallow writes.
- */
-/* ARGSUSED */
-static int
-zfsctl_common_access(ap)
- struct vop_access_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- int mode = ap->a_mode;
-
- if (mode & VWRITE)
- return (EACCES);
-
- return (0);
-}
-
-/*
- * Common getattr function. Fill in basic information.
- */
-static void
-zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
-{
- zfsctl_node_t *zcp = vp->v_data;
- timestruc_t now;
-
- vap->va_uid = 0;
- vap->va_gid = 0;
- vap->va_rdev = 0;
- /*
- * We are a purly virtual object, so we have no
- * blocksize or allocated blocks.
- */
- vap->va_blksize = 0;
- vap->va_nblocks = 0;
- vap->va_seq = 0;
- vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
- S_IROTH | S_IXOTH;
- vap->va_type = VDIR;
- /*
- * We live in the now (for atime).
- */
- gethrestime(&now);
- vap->va_atime = now;
- vap->va_mtime = vap->va_ctime = vap->va_birthtime = zcp->zc_cmtime;
- /* FreeBSD: Reset chflags(2) flags. */
- vap->va_flags = 0;
-}
-
-static int
-zfsctl_common_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- fid_t *fidp = (void *)ap->a_fid;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_node_t *zcp = vp->v_data;
- uint64_t object = zcp->zc_id;
- zfid_short_t *zfid;
- int i;
-
- ZFS_ENTER(zfsvfs);
-
- fidp->fid_len = SHORT_FID_LEN;
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = SHORT_FID_LEN;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* .zfs znodes always have a generation number of 0 */
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = 0;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static int
-zfsctl_common_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- /*
- * Destroy the vm object and flush associated pages.
- */
- vnode_destroy_vobject(vp);
- VI_LOCK(vp);
- vp->v_data = NULL;
- VI_UNLOCK(vp);
- return (0);
-}
-
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem. We use the following scheme:
- *
- * ENTRY ZFSCTL_INODE
- * .zfs 1
- * .zfs/snapshot 2
- * .zfs/snapshot/<snap> objectid(snap)
- */
-
-#define ZFSCTL_INO_SNAP(id) (id)
-
-/*
- * Get root directory attributes.
- */
-/* ARGSUSED */
-static int
-zfsctl_root_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-
- ZFS_ENTER(zfsvfs);
- vap->va_nodeid = ZFSCTL_INO_ROOT;
- vap->va_nlink = vap->va_size = NROOT_ENTRIES;
-
- zfsctl_common_getattr(vp, vap);
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/*
- * Special case the handling of "..".
- */
-/* ARGSUSED */
-int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(nm, "..") == 0) {
- err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp, curthread);
- if (err == 0)
- VOP_UNLOCK(*vpp, 0);
- } else {
- err = gfs_dir_lookup(dvp, nm, vpp);
- }
-
- ZFS_EXIT(zfsvfs);
-
- return (err);
-}
-
-/*
- * Special case the handling of "..".
- */
-/* ARGSUSED */
-int
-zfsctl_root_lookup_vop(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- cred_t *cr = ap->a_cnp->cn_cred;
- int flags = ap->a_cnp->cn_flags;
- int nameiop = ap->a_cnp->cn_nameiop;
- char nm[NAME_MAX + 1];
- int err;
-
- if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
- return (EOPNOTSUPP);
-
- ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
-
- err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr);
- if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-
- return (err);
-}
-
-static struct vop_vector zfsctl_ops_root = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_root_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = gfs_vop_readdir,
- .vop_lookup = zfsctl_root_lookup_vop,
- .vop_inactive = gfs_vop_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_common_fid,
-};
-
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
- objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
- dmu_objset_name(os, zname);
- if (strlen(zname) + 1 + strlen(name) >= len)
- return (ENAMETOOLONG);
- (void) strcat(zname, "@");
- (void) strcat(zname, name);
- return (0);
-}
-
-static int
-zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
-{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- zfs_snapentry_t search, *sep;
- struct vop_inactive_args ap;
- avl_index_t where;
- int err;
-
- ASSERT(MUTEX_HELD(&sdp->sd_lock));
-
- search.se_name = (char *)name;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
- return (ENOENT);
-
- ASSERT(vn_ismntpt(sep->se_root));
-
- /* this will be dropped by dounmount() */
- if ((err = vn_vfswlock(sep->se_root)) != 0)
- return (err);
-
- err = dounmount(vn_mountedvfs(sep->se_root), force, curthread);
- if (err)
- return (err);
- ASSERT(sep->se_root->v_count == 1);
- ap.a_vp = sep->se_root;
- gfs_vop_inactive(&ap);
-
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
-
- return (0);
-}
-
-#if 0
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
-{
- avl_index_t where;
- vfs_t *vfsp;
- refstr_t *pathref;
- char newpath[MAXNAMELEN];
- char *tail;
-
- ASSERT(MUTEX_HELD(&sdp->sd_lock));
- ASSERT(sep != NULL);
-
- vfsp = vn_mountedvfs(sep->se_root);
- ASSERT(vfsp != NULL);
-
- vfs_lock_wait(vfsp);
-
- /*
- * Change the name in the AVL tree.
- */
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
- avl_insert(&sdp->sd_snaps, sep, where);
-
- /*
- * Change the current mountpoint info:
- * - update the tail of the mntpoint path
- * - update the tail of the resource path
- */
- pathref = vfs_getmntpoint(vfsp);
- (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
- VERIFY((tail = strrchr(newpath, '/')) != NULL);
- *(tail+1) = '\0';
- ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
- (void) strcat(newpath, nm);
- refstr_rele(pathref);
- vfs_setmntpoint(vfsp, newpath);
-
- pathref = vfs_getresource(vfsp);
- (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
- VERIFY((tail = strrchr(newpath, '@')) != NULL);
- *(tail+1) = '\0';
- ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
- (void) strcat(newpath, nm);
- refstr_rele(pathref);
- vfs_setresource(vfsp, newpath);
-
- vfs_unlock(vfsp);
-}
-#endif
-
-#if 0
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
- cred_t *cr)
-{
- zfsctl_snapdir_t *sdp = sdvp->v_data;
- zfs_snapentry_t search, *sep;
- avl_index_t where;
- char from[MAXNAMELEN], to[MAXNAMELEN];
- int err;
-
- err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
- if (err)
- return (err);
- err = zfs_secpolicy_write(from, cr);
- if (err)
- return (err);
-
- /*
- * Cannot move snapshots out of the snapdir.
- */
- if (sdvp != tdvp)
- return (EINVAL);
-
- if (strcmp(snm, tnm) == 0)
- return (0);
-
- err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
- if (err)
- return (err);
-
- mutex_enter(&sdp->sd_lock);
-
- search.se_name = (char *)snm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
- mutex_exit(&sdp->sd_lock);
- return (ENOENT);
- }
-
- err = dmu_objset_rename(from, to, B_FALSE);
- if (err == 0)
- zfsctl_rename_snap(sdp, sep, tnm);
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
-}
-#endif
-
-#if 0
-/* ARGSUSED */
-static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
-{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- char snapname[MAXNAMELEN];
- int err;
-
- err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
- if (err)
- return (err);
- err = zfs_secpolicy_write(snapname, cr);
- if (err)
- return (err);
-
- mutex_enter(&sdp->sd_lock);
-
- err = zfsctl_unmount_snap(dvp, name, 0, cr);
- if (err) {
- mutex_exit(&sdp->sd_lock);
- return (err);
- }
-
- err = dmu_objset_destroy(snapname);
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
-}
-#endif
-
-/*
- * Lookup entry point for the 'snapshot' directory. Try to open the
- * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
- * Perform a mount of the associated dataset on top of the vnode.
- */
-/* ARGSUSED */
-int
-zfsctl_snapdir_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- char nm[NAME_MAX + 1];
- zfsctl_snapdir_t *sdp = dvp->v_data;
- objset_t *snap;
- char snapname[MAXNAMELEN];
- char *mountpoint;
- zfs_snapentry_t *sep, search;
- size_t mountpoint_len;
- avl_index_t where;
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
-
- ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
-
- ASSERT(dvp->v_type == VDIR);
-
- if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
- return (0);
-
- *vpp = NULL;
-
- /*
- * If we get a recursive call, that means we got called
- * from the domount() code while it was trying to look up the
- * spec (which looks like a local path for zfs). We need to
- * add some flag to domount() to tell it not to do this lookup.
- */
- if (MUTEX_HELD(&sdp->sd_lock))
- return (ENOENT);
-
- ZFS_ENTER(zfsvfs);
-
- mutex_enter(&sdp->sd_lock);
- search.se_name = (char *)nm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
- *vpp = sep->se_root;
- VN_HOLD(*vpp);
- if ((*vpp)->v_mountedhere == NULL) {
- /*
- * The snapshot was unmounted behind our backs,
- * try to remount it.
- */
- goto domount;
- }
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * The requested snapshot is not currently mounted, look it up.
- */
- err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
- if (err) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- if (dmu_objset_open(snapname, DMU_OST_ZFS,
- DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (ENOENT);
- }
-
- sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
- VN_HOLD(*vpp);
- avl_insert(&sdp->sd_snaps, sep, where);
-
- dmu_objset_close(snap);
-domount:
- mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
- strlen("/.zfs/snapshot/") + strlen(nm) + 1;
- mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
- (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
- dvp->v_vfsp->mnt_stat.f_mntonname, nm);
- err = domount(curthread, *vpp, "zfs", mountpoint, snapname, 0);
- kmem_free(mountpoint, mountpoint_len);
- /* FreeBSD: This line was moved from below to avoid a lock recursion. */
- if (err == 0)
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- mutex_exit(&sdp->sd_lock);
-
- /*
- * If we had an error, drop our hold on the vnode and
- * zfsctl_snapshot_inactive() will clean up.
- */
- if (err) {
- VN_RELE(*vpp);
- *vpp = NULL;
- }
- return (err);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
- offset_t *offp, offset_t *nextp, void *data)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- char snapname[MAXNAMELEN];
- uint64_t id, cookie;
-
- ZFS_ENTER(zfsvfs);
-
- cookie = *offp;
- if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
- &cookie) == ENOENT) {
- *eofp = 1;
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- (void) strcpy(dp->d_name, snapname);
- dp->d_ino = ZFSCTL_INO_SNAP(id);
- *nextp = cookie;
-
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
-{
- vnode_t *vp;
- zfsctl_snapdir_t *sdp;
-
- vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
- zfsctl_snapdir_readdir_cb, NULL);
- sdp = vp->v_data;
- sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
- sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
- mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&sdp->sd_snaps, snapentry_compare,
- sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
- return (vp);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- struct vnode *vp = ap->a_vp;
- struct vattr *vap = ap->a_vap;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_snapdir_t *sdp = vp->v_data;
-
- ZFS_ENTER(zfsvfs);
- zfsctl_common_getattr(vp, vap);
- vap->va_nodeid = gfs_file_inode(vp);
- vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- zfsctl_snapdir_t *sdp = vp->v_data;
- void *private;
-
- private = gfs_dir_inactive(vp);
- if (private != NULL) {
- ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
- mutex_destroy(&sdp->sd_lock);
- avl_destroy(&sdp->sd_snaps);
- kmem_free(private, sizeof (zfsctl_snapdir_t));
- }
- return (0);
-}
-
-static struct vop_vector zfsctl_ops_snapdir = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_snapdir_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = gfs_vop_readdir,
- .vop_lookup = zfsctl_snapdir_lookup,
- .vop_inactive = zfsctl_snapdir_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_common_fid,
-};
-
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
- vnode_t *vp;
- zfsctl_node_t *zcp;
-
- vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = objset;
-
- return (vp);
-}
-
-static int
-zfsctl_snapshot_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- struct vop_inactive_args iap;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- int locked;
- vnode_t *dvp;
-
- VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
- sdp = dvp->v_data;
- VOP_UNLOCK(dvp, 0);
-
- if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
- mutex_enter(&sdp->sd_lock);
-
- if (vp->v_count > 1) {
- if (!locked)
- mutex_exit(&sdp->sd_lock);
- return (0);
- }
- ASSERT(!vn_ismntpt(vp));
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- if (sep->se_root == vp) {
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
- break;
- }
- sep = next;
- }
- ASSERT(sep != NULL);
-
- if (!locked)
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
- /*
- * Dispose of the vnode for the snapshot mount point.
- * This is safe to do because once this entry has been removed
- * from the AVL tree, it can't be found again, so cannot become
- * "active". If we lookup the same name again we will end up
- * creating a new vnode.
- */
- iap.a_vp = vp;
- return (gfs_vop_inactive(&iap));
-}
-
-static int
-zfsctl_traverse_begin(vnode_t **vpp, int lktype, kthread_t *td)
-{
-
- VN_HOLD(*vpp);
- /* Snapshot should be already mounted, but just in case. */
- if (vn_mountedvfs(*vpp) == NULL)
- return (ENOENT);
- return (traverse(vpp, lktype));
-}
-
-static void
-zfsctl_traverse_end(vnode_t *vp, int err)
-{
-
- if (err == 0)
- vput(vp);
- else
- VN_RELE(vp);
-}
-
-static int
-zfsctl_snapshot_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- int err;
-
- err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, ap->a_td);
- if (err == 0)
- err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td);
- zfsctl_traverse_end(vp, err);
- return (err);
-}
-
-static int
-zfsctl_snapshot_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- int err;
-
- err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY, curthread);
- if (err == 0)
- err = VOP_VPTOFH(vp, (void *)ap->a_fid);
- zfsctl_traverse_end(vp, err);
- return (err);
-}
-
-/*
- * These VP's should never see the light of day. They should always
- * be covered.
- */
-static struct vop_vector zfsctl_ops_snapshot = {
- .vop_default = &default_vnodeops,
- .vop_inactive = zfsctl_snapshot_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_getattr = zfsctl_snapshot_getattr,
- .vop_fid = zfsctl_snapshot_fid,
-};
-
-int
-zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp, *vp;
- zfsctl_snapdir_t *sdp;
- zfsctl_node_t *zcp;
- zfs_snapentry_t *sep;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, kcred);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- vp = sep->se_root;
- zcp = vp->v_data;
- if (zcp->zc_id == objsetid)
- break;
-
- sep = AVL_NEXT(&sdp->sd_snaps, sep);
- }
-
- if (sep != NULL) {
- VN_HOLD(vp);
- error = traverse(&vp, LK_SHARED | LK_RETRY);
- if (error == 0) {
- if (vp == sep->se_root)
- error = EINVAL;
- else
- *zfsvfsp = VTOZ(vp)->z_zfsvfs;
- }
- mutex_exit(&sdp->sd_lock);
- if (error == 0)
- VN_URELE(vp);
- else
- VN_RELE(vp);
- } else {
- error = EINVAL;
- mutex_exit(&sdp->sd_lock);
- }
-
- VN_RELE(dvp);
-
- return (error);
-}
-
-/*
- * Unmount any snapshots for the given filesystem. This is called from
- * zfs_umount() - if we have a ctldir, then go through and unmount all the
- * snapshots.
- */
-int
-zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
-{
- struct vop_inactive_args ap;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp, *svp;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, cr);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- svp = sep->se_root;
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- /*
- * If this snapshot is not mounted, then it must
- * have just been unmounted by somebody else, and
- * will be cleaned up by zfsctl_snapdir_inactive().
- */
- if (vn_ismntpt(svp)) {
- if ((error = vn_vfswlock(svp)) != 0)
- goto out;
-
- /*
- * Increase usecount, so dounmount() won't vrele() it
- * to 0 and call zfsctl_snapdir_inactive().
- */
- VN_HOLD(svp);
- vfsp = vn_mountedvfs(svp);
- mtx_lock(&Giant);
- error = dounmount(vfsp, fflags, curthread);
- mtx_unlock(&Giant);
- if (error != 0) {
- VN_RELE(svp);
- goto out;
- }
-
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
-
- /*
- * We can't use VN_RELE(), as that will try to
- * invoke zfsctl_snapdir_inactive(), and that
- * would lead to an attempt to re-grab the sd_lock.
- */
- ASSERT3U(svp->v_count, ==, 1);
- ap.a_vp = svp;
- gfs_vop_inactive(&ap);
- }
- sep = next;
- }
-out:
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
- return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
deleted file mode 100644
index f233b8f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
+++ /dev/null
@@ -1,797 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/uio.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/unistd.h>
-#include <sys/random.h>
-#include <sys/policy.h>
-#include <sys/kcondvar.h>
-#include <sys/callb.h>
-#include <sys/smp.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/fs/zfs.h>
-#include <sys/zap.h>
-#include <sys/dmu.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/dnlc.h>
-
-/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
- *
- * Input arguments:
- * dzp - znode for directory
- * name - name of entry to lock
- * flag - ZNEW: if the entry already exists, fail with EEXIST.
- * ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZSHARED: allow concurrent access with other ZSHARED callers.
- * ZXATTR: we want dzp's xattr directory
- *
- * Output arguments:
- * zpp - pointer to the znode for the entry (NULL if there isn't one)
- * dlpp - pointer to the dirlock for this entry (NULL on error)
- *
- * Return value: 0 on success or errno on failure.
- *
- * NOTE: Always checks for, and rejects, '.' and '..'.
- */
-int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag)
-{
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t *dl;
- uint64_t zoid;
- int error;
- vnode_t *vp;
-
- *zpp = NULL;
- *dlpp = NULL;
-
- /*
- * Verify that we are not trying to lock '.', '..', or '.zfs'
- */
- if (name[0] == '.' &&
- (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
- zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
- return (EEXIST);
-
- /*
- * Wait until there are no locks on this name.
- */
- rw_enter(&dzp->z_name_lock, RW_READER);
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked) {
- mutex_exit(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- return (ENOENT);
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
- if (strcmp(name, dl->dl_name) == 0)
- break;
- if (dl == NULL) {
- /*
- * Allocate a new dirlock and add it to the list.
- */
- dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = name;
- dl->dl_sharecnt = 0;
- dl->dl_namesize = 0;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
- /*
- * We're the second shared reference to dl. Make a copy of
- * dl_name in case the first thread goes away before we do.
- * Note that we initialize the new name before storing its
- * pointer into dl_name, because the first thread may load
- * dl->dl_name at any time. He'll either see the old value,
- * which is his, or the new shared copy; either is OK.
- */
- dl->dl_namesize = strlen(dl->dl_name) + 1;
- name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
- bcopy(dl->dl_name, name, dl->dl_namesize);
- dl->dl_name = name;
- }
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
- if (flag & ZXATTR) {
- zoid = dzp->z_phys->zp_xattr;
- error = (zoid == 0 ? ENOENT : 0);
- } else {
- vp = dnlc_lookup(ZTOV(dzp), name);
- if (vp == DNLC_NO_VNODE) {
- VN_RELE(vp);
- error = ENOENT;
- } else if (vp) {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- return (EEXIST);
- }
- *dlpp = dl;
- *zpp = VTOZ(vp);
- return (0);
- } else {
- error = zap_lookup(zfsvfs->z_os, dzp->z_id, name,
- 8, 1, &zoid);
- zoid = ZFS_DIRENT_OBJ(zoid);
- if (error == ENOENT)
- dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
- }
- }
- if (error) {
- if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- } else {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- return (EEXIST);
- }
- error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
- return (error);
- }
- if (!(flag & ZXATTR))
- dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
- }
-
- *dlpp = dl;
-
- return (0);
-}
-
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
-{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
-
- mutex_enter(&dzp->z_lock);
- rw_exit(&dzp->z_name_lock);
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
-
- if (dl->dl_namesize != 0)
- kmem_free(dl->dl_name, dl->dl_namesize);
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl));
-}
-
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
-int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
-{
- zfs_dirlock_t *dl;
- znode_t *zp;
- int error = 0;
-
- if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- *vpp = ZTOV(dzp);
- VN_HOLD(*vpp);
- } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if (dzp->z_phys->zp_parent == dzp->z_id &&
- zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", vpp, NULL, 0, NULL, kcred);
- return (error);
- }
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *vpp = zfsctl_root(dzp);
- } else {
- error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
- if (error == 0) {
- *vpp = ZTOV(zp);
- zfs_dirent_unlock(dl);
- dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
- }
- }
-
- return (error);
-}
-
-static char *
-zfs_unlinked_hexname(char namebuf[17], uint64_t x)
-{
- char *name = &namebuf[16];
- const char digits[16] = "0123456789abcdef";
-
- *name = '\0';
- do {
- *--name = digits[x & 0xf];
- x >>= 4;
- } while (x != 0);
-
- return (name);
-}
-
-/*
- * unlinked Set (formerly known as the "delete queue") Error Handling
- *
- * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
- * don't specify the name of the entry that we will be manipulating. We
- * also fib and say that we won't be adding any new entries to the
- * unlinked set, even though we might (this is to lower the minimum file
- * size that can be deleted in a full filesystem). So on the small
- * chance that the nlink list is using a fat zap (ie. has more than
- * 2000 entries), we *may* not pre-read a block that's needed.
- * Therefore it is remotely possible for some of the assertions
- * regarding the unlinked set below to fail due to i/o error. On a
- * nondebug system, this will result in the space being leaked.
- */
-void
-zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- char obj_name[17];
- int error;
-
- ASSERT(zp->z_unlinked);
- ASSERT3U(zp->z_phys->zp_links, ==, 0);
-
- error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
- ASSERT3U(error, ==, 0);
-}
-
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- dmu_object_info_t doi;
- znode_t *zp;
- int error;
-
- /*
- * Interate over the contents of the unlinked set.
- */
- for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
- zap_cursor_retrieve(&zc, &zap) == 0;
- zap_cursor_advance(&zc)) {
-
- /*
- * See what kind of object we have in list
- */
-
- error = dmu_object_info(zfsvfs->z_os,
- zap.za_first_integer, &doi);
- if (error != 0)
- continue;
-
- ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
- (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
- /*
- * We need to re-mark these list entries for deletion,
- * so we pull them back into core and set zp->z_unlinked.
- */
- error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
-
- /*
- * We may pick up znodes that are already marked for deletion.
- * This could happen during the purge of an extended attribute
- * directory. All we need to do is skip over them, since they
- * are already in the system marked z_unlinked.
- */
- if (error != 0)
- continue;
-
- zp->z_unlinked = B_TRUE;
- VN_RELE(ZTOV(zp));
- }
- zap_cursor_fini(&zc);
-}
-
-/*
- * Delete the entire contents of a directory. Return a count
- * of the number of entries that could not be deleted.
- *
- * NOTE: this function assumes that the directory is inactive,
- * so there is no need to lock its entries before deletion.
- * Also, it assumes the directory contents is *only* regular
- * files.
- */
-static int
-zfs_purgedir(znode_t *dzp)
-{
- zap_cursor_t zc;
- zap_attribute_t zap;
- znode_t *xzp;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t dl;
- int skipped = 0;
- int error;
-
- for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
- (error = zap_cursor_retrieve(&zc, &zap)) == 0;
- zap_cursor_advance(&zc)) {
- error = zfs_zget(zfsvfs,
- ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
- ASSERT3U(error, ==, 0);
-
- ASSERT((ZTOV(xzp)->v_type == VREG) ||
- (ZTOV(xzp)->v_type == VLNK));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
- dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- VN_RELE(ZTOV(xzp));
- skipped += 1;
- continue;
- }
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
-
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
- ASSERT3U(error, ==, 0);
- dmu_tx_commit(tx);
-
- VN_RELE(ZTOV(xzp));
- }
- zap_cursor_fini(&zc);
- ASSERT(error == ENOENT);
- return (skipped);
-}
-
-void
-zfs_rmnode(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- znode_t *xzp = NULL;
- char obj_name[17];
- dmu_tx_t *tx;
- uint64_t acl_obj;
- int error;
- int vfslocked;
-
- vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
-
- ASSERT(zp->z_phys->zp_links == 0);
-
- /*
- * If this is an attribute directory, purge its contents.
- */
- if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
- (zp->z_phys->zp_flags & ZFS_XATTR)) {
- if (zfs_purgedir(zp) != 0) {
- /*
- * Not enough space to delete some xattrs.
- * Leave it on the unlinked set.
- */
- VFS_UNLOCK_GIANT(vfslocked);
- return;
- }
- }
-
- /*
- * If the file has extended attributes, we're going to unlink
- * the xattr dir.
- */
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT(error == 0);
- }
-
- acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
-
- /*
- * Set up the transaction.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- if (xzp) {
- dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
- }
- if (acl_obj)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- /*
- * Not enough space to delete the file. Leave it in the
- * unlinked set, leaking it until the fs is remounted (at
- * which point we'll call zfs_unlinked_drain() to process it).
- */
- dmu_tx_abort(tx);
- VFS_UNLOCK_GIANT(vfslocked);
- return;
- }
-
- if (xzp) {
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
- xzp->z_phys->zp_links = 0; /* no more links to it */
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- }
-
- /* Remove this znode from the unlinked set */
- error = zap_remove(os, zfsvfs->z_unlinkedobj,
- zfs_unlinked_hexname(obj_name, zp->z_id), tx);
- ASSERT3U(error, ==, 0);
-
- zfs_znode_delete(zp, tx);
-
- dmu_tx_commit(tx);
-
- if (xzp)
- VN_RELE(ZTOV(xzp));
- VFS_UNLOCK_GIANT(vfslocked);
-}
-
-/*
- * Link zp into dl. Can only fail if zp has been unlinked.
- */
-int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
-{
- znode_t *dzp = dl->dl_dzp;
- vnode_t *vp = ZTOV(zp);
- uint64_t value;
- int zp_is_dir = (vp->v_type == VDIR);
- int error;
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- mutex_enter(&zp->z_lock);
-
- if (!(flag & ZRENAMING)) {
- if (zp->z_unlinked) { /* no new links to unlinked zp */
- ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
- return (ENOENT);
- }
- zp->z_phys->zp_links++;
- }
- zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */
-
- if (!(flag & ZNEW))
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- mutex_exit(&zp->z_lock);
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size++; /* one dirent added */
- dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- /*
- * MacOS X will fill in the 4-bit object type here.
- */
- value = ZFS_DIRENT_MAKE(IFTODT(zp->z_phys->zp_mode), zp->z_id);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
- 8, 1, &value, tx);
- ASSERT(error == 0);
-
- dnlc_update(ZTOV(dzp), dl->dl_name, vp);
-
- return (0);
-}
-
-/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
- * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
- * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
- * If it's non-NULL, we use it to indicate whether the znode needs deletion,
- * and it's the caller's job to do it.
- */
-int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
-{
- znode_t *dzp = dl->dl_dzp;
- vnode_t *vp = ZTOV(zp);
- int zp_is_dir = (vp->v_type == VDIR);
- boolean_t unlinked = B_FALSE;
- int error;
-
- dnlc_remove(ZTOV(dzp), dl->dl_name);
-
- if (!(flag & ZRENAMING)) {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- if (vn_vfswlock(vp)) /* prevent new mounts on zp */
- return (EBUSY);
-
- if (vn_ismntpt(vp)) { /* don't remove mount point */
- vn_vfsunlock(vp);
- return (EBUSY);
- }
-
- mutex_enter(&zp->z_lock);
- if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
- return (ENOTEMPTY);
- }
- if (zp->z_phys->zp_links <= zp_is_dir) {
- zfs_panic_recover("zfs: link count on vnode %p is %u, "
- "should be at least %u", zp->z_vnode,
- (int)zp->z_phys->zp_links,
- zp_is_dir + 1);
- zp->z_phys->zp_links = zp_is_dir + 1;
- }
- if (--zp->z_phys->zp_links == zp_is_dir) {
- zp->z_unlinked = B_TRUE;
- zp->z_phys->zp_links = 0;
- unlinked = B_TRUE;
- } else {
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
- }
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
- }
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_phys->zp_size--; /* one dirent removed */
- dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */
- zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
- mutex_exit(&dzp->z_lock);
-
- error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
- ASSERT(error == 0);
-
- if (unlinkedp != NULL)
- *unlinkedp = unlinked;
- else if (unlinked)
- zfs_unlinked_add(zp, tx);
-
- return (0);
-}
-
-/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be consider a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
- */
-boolean_t
-zfs_dirempty(znode_t *dzp)
-{
- return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
-}
-
-int
-zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- dmu_tx_t *tx;
- uint64_t xoid;
- int error;
-
- *xvpp = NULL;
-
- if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
- return (error);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- return (error);
- }
- zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
- ASSERT(xzp->z_id == xoid);
- ASSERT(xzp->z_phys->zp_parent == zp->z_id);
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- zp->z_phys->zp_xattr = xoid;
-
- (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
- dmu_tx_commit(tx);
-
- *xvpp = ZTOV(xzp);
-
- return (0);
-}
-
-/*
- * Return a znode for the extended attribute directory for zp.
- * ** If the directory does not already exist, it is created **
- *
- * IN: zp - znode to obtain attribute directory from
- * cr - credentials of caller
- * flags - flags from the VOP_LOOKUP call
- *
- * OUT: xzpp - pointer to extended attribute znode
- *
- * RETURN: 0 on success
- * error number on failure
- */
-int
-zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_t *xzp;
- zfs_dirlock_t *dl;
- vattr_t va;
- int error;
-top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
- if (error)
- return (error);
-
- if (xzp != NULL) {
- *xvpp = ZTOV(xzp);
- zfs_dirent_unlock(dl);
- return (0);
- }
-
- ASSERT(zp->z_phys->zp_xattr == 0);
-
-#ifdef TODO
- if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
- return (ENOENT);
- }
-#endif
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- zfs_dirent_unlock(dl);
- return (EROFS);
- }
-
- /*
- * The ability to 'create' files in an attribute
- * directory comes from the write_xattr permission on the base file.
- *
- * The ability to 'search' an attribute directory requires
- * read_xattr permission on the base file.
- *
- * Once in a directory the ability to read/write attributes
- * is controlled by the permissions on the attribute file.
- */
- va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
- va.va_type = VDIR;
- va.va_mode = S_IFDIR | S_ISVTX | 0777;
- va.va_uid = (uid_t)zp->z_phys->zp_uid;
- va.va_gid = (gid_t)zp->z_phys->zp_gid;
-
- error = zfs_make_xattrdir(zp, &va, xvpp, cr);
- zfs_dirent_unlock(dl);
-
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- /* NB: we already did dmu_tx_wait() if necessary */
- goto top;
- }
-
- return (error);
-}
-
-/*
- * Decide whether it is okay to remove within a sticky directory.
- *
- * In sticky directories, write access is not sufficient;
- * you can remove entries from a directory only if:
- *
- * you own the directory,
- * you own the entry,
- * the entry is a plain file and you have write access,
- * or you are privileged (checked in secpolicy...).
- *
- * The function returns 0 if remove access is granted.
- */
-int
-zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
-{
- uid_t uid;
-
- if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */
- return (0);
-
- if ((zdp->z_phys->zp_mode & S_ISVTX) == 0 ||
- (uid = crgetuid(cr)) == zdp->z_phys->zp_uid ||
- uid == zp->z_phys->zp_uid ||
- (ZTOV(zp)->v_type == VREG &&
- zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0))
- return (0);
- else
- return (secpolicy_vnode_remove(cr));
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
deleted file mode 100644
index e2385a0..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-
-#include <sys/fm/fs/zfs.h>
-#include <sys/fm/protocol.h>
-#include <sys/fm/util.h>
-
-#ifdef _KERNEL
-/* Including sys/bus.h is just too hard, so I declare what I need here. */
-extern void devctl_notify(const char *__system, const char *__subsystem,
- const char *__type, const char *__data);
-#endif
-
-/*
- * This general routine is responsible for generating all the different ZFS
- * ereports. The payload is dependent on the class, and which arguments are
- * supplied to the function:
- *
- * EREPORT POOL VDEV IO
- * block X X X
- * data X X
- * device X X
- * pool X
- *
- * If we are in a loading state, all errors are chained together by the same
- * SPA-wide ENA.
- *
- * For isolated I/O requests, we get the ENA from the zio_t. The propagation
- * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
- * to chain together all ereports associated with a logical piece of data. For
- * read I/Os, there are basically three 'types' of I/O, which form a roughly
- * layered diagram:
- *
- * +---------------+
- * | Aggregate I/O | No associated logical data or device
- * +---------------+
- * |
- * V
- * +---------------+ Reads associated with a piece of logical data.
- * | Read I/O | This includes reads on behalf of RAID-Z,
- * +---------------+ mirrors, gang blocks, retries, etc.
- * |
- * V
- * +---------------+ Reads associated with a particular device, but
- * | Physical I/O | no logical data. Issued as part of vdev caching
- * +---------------+ and I/O aggregation.
- *
- * Note that 'physical I/O' here is not the same terminology as used in the rest
- * of ZIO. Typically, 'physical I/O' simply means that there is no attached
- * blockpointer. But I/O with no associated block pointer can still be related
- * to a logical piece of data (i.e. RAID-Z requests).
- *
- * Purely physical I/O always have unique ENAs. They are not related to a
- * particular piece of logical data, and therefore cannot be chained together.
- * We still generate an ereport, but the DE doesn't correlate it with any
- * logical piece of data. When such an I/O fails, the delegated I/O requests
- * will issue a retry, which will trigger the 'real' ereport with the correct
- * ENA.
- *
- * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
- * When a new logical I/O is issued, we set this to point to itself. Child I/Os
- * then inherit this pointer, so that when it is first set subsequent failures
- * will use the same ENA. If a physical I/O is issued (by passing the
- * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
- * unique ENA will be generated. For an aggregate I/O, this pointer is set to
- * NULL, and no ereport will be generated (since it doesn't actually correspond
- * to any particular device or piece of data).
- */
-void
-zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
- uint64_t stateoroffset, uint64_t size)
-{
-#ifdef _KERNEL
- char buf[1024];
- struct sbuf sb;
- struct timespec ts;
-
- /*
- * If we are doing a spa_tryimport(), ignore errors.
- */
- if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
- return;
-
- /*
- * If we are in the middle of opening a pool, and the previous attempt
- * failed, don't bother logging any new ereports - we're just going to
- * get the same diagnosis anyway.
- */
- if (spa->spa_load_state != SPA_LOAD_NONE &&
- spa->spa_last_open_failed)
- return;
-
- /*
- * Ignore any errors from I/Os that we are going to retry anyway - we
- * only generate errors from the final failure.
- */
- if (zio && zio_should_retry(zio))
- return;
-
- /*
- * If this is not a read or write zio, ignore the error. This can occur
- * if the DKIOCFLUSHWRITECACHE ioctl fails.
- */
- if (zio && zio->io_type != ZIO_TYPE_READ &&
- zio->io_type != ZIO_TYPE_WRITE)
- return;
-
- nanotime(&ts);
-
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
-
- /*
- * Serialize ereport generation
- */
- mutex_enter(&spa->spa_errlist_lock);
-
-#if 0
- /*
- * Determine the ENA to use for this event. If we are in a loading
- * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
- * a root zio-wide ENA. Otherwise, simply use a unique ENA.
- */
- if (spa->spa_load_state != SPA_LOAD_NONE) {
-#if 0
- if (spa->spa_ena == 0)
- spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
-#endif
- ena = spa->spa_ena;
- } else if (zio != NULL && zio->io_logical != NULL) {
-#if 0
- if (zio->io_logical->io_ena == 0)
- zio->io_logical->io_ena =
- fm_ena_generate(0, FM_ENA_FMT1);
-#endif
- ena = zio->io_logical->io_ena;
- } else {
-#if 0
- ena = fm_ena_generate(0, FM_ENA_FMT1);
-#else
- ena = 0;
-#endif
- }
-#endif
-
- /*
- * Construct the full class, detector, and other standard FMA fields.
- */
- sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION);
- sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass);
-
- sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION);
-
- /*
- * Construct the per-ereport payload, depending on which parameters are
- * passed in.
- */
-
- /*
- * Generic payload members common to all ereports.
- *
- * The direct reference to spa_name is used rather than spa_name()
- * because of the asynchronous nature of the zio pipeline. spa_name()
- * asserts that the config lock is held in some form. This is always
- * the case in I/O context, but because the check for RW_WRITER compares
- * against 'curthread', we may be in an asynchronous context and blow
- * this assert. Rather than loosen this assert, we acknowledge that all
- * contexts in which this function is called (pool open, I/O) are safe,
- * and dereference the name directly.
- */
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name);
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
- spa->spa_load_state);
-
- if (vd != NULL) {
- vdev_t *pvd = vd->vdev_parent;
-
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
- vd->vdev_ops->vdev_op_type);
- if (vd->vdev_path)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
- if (vd->vdev_devid)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
-
- if (pvd != NULL) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
- pvd->vdev_ops->vdev_op_type);
- if (pvd->vdev_path)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
- pvd->vdev_path);
- if (pvd->vdev_devid)
- sbuf_printf(&sb, " %s=%s",
- FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
- pvd->vdev_devid);
- }
- }
-
- if (zio != NULL) {
- /*
- * Payload common to all I/Os.
- */
- sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
- zio->io_error);
-
- /*
- * If the 'size' parameter is non-zero, it indicates this is a
- * RAID-Z or other I/O where the physical offset and length are
- * provided for us, instead of within the zio_t.
- */
- if (vd != NULL) {
- if (size) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- stateoroffset);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
- } else {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
- zio->io_offset);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
- zio->io_size);
- }
- }
-
- /*
- * Payload for I/Os with corresponding logical information.
- */
- if (zio->io_logical != NULL) {
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
- zio->io_logical->io_bookmark.zb_object);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
- zio->io_logical->io_bookmark.zb_level);
- sbuf_printf(&sb, " %s=%ju",
- FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
- zio->io_logical->io_bookmark.zb_blkid);
- }
- } else if (vd != NULL) {
- /*
- * If we have a vdev but no zio, this is a device fault, and the
- * 'stateoroffset' parameter indicates the previous state of the
- * vdev.
- */
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
- stateoroffset);
- }
- mutex_exit(&spa->spa_errlist_lock);
-
- sbuf_finish(&sb);
- ZFS_LOG(1, "%s", sbuf_data(&sb));
- devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
- if (sbuf_overflowed(&sb))
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
-#endif
-}
-
-/*
- * The 'resource.fs.zfs.ok' event is an internal signal that the associated
- * resource (pool or disk) has been identified by ZFS as healthy. This will
- * then trigger the DE to close the associated case, if any.
- */
-void
-zfs_post_ok(spa_t *spa, vdev_t *vd)
-{
-#ifdef _KERNEL
- char buf[1024];
- char class[64];
- struct sbuf sb;
- struct timespec ts;
-
- nanotime(&ts);
-
- sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
- sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
-
- snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
- ZFS_ERROR_CLASS, FM_RESOURCE_OK);
- sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION);
- sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
- spa_guid(spa));
- if (vd)
- sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
- vd->vdev_guid);
- sbuf_finish(&sb);
- devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
- if (sbuf_overflowed(&sb))
- printf("ZFS WARNING: sbuf overflowed\n");
- sbuf_delete(&sb);
-#endif
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
deleted file mode 100644
index c9424be..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ /dev/null
@@ -1,1826 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/conf.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/buf.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
-#include <sys/dmu.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/sunddi.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/nvpair.h>
-#include <sys/mount.h>
-#include <sys/taskqueue.h>
-#include <sys/sdt.h>
-#include <sys/varargs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/zvol.h>
-
-#include "zfs_namecheck.h"
-#include "zfs_prop.h"
-
-CTASSERT(sizeof(zfs_cmd_t) <= PAGE_SIZE);
-
-static struct cdev *zfsdev;
-
-extern void zfs_init(void);
-extern void zfs_fini(void);
-
-typedef int zfs_ioc_func_t(zfs_cmd_t *);
-typedef int zfs_secpolicy_func_t(const char *, cred_t *);
-
-typedef struct zfs_ioc_vec {
- zfs_ioc_func_t *zvec_func;
- zfs_secpolicy_func_t *zvec_secpolicy;
- enum {
- no_name,
- pool_name,
- dataset_name
- } zvec_namecheck;
-} zfs_ioc_vec_t;
-
-/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
- const char *newfile;
- char buf[256];
- va_list adx;
-
- /*
- * Get rid of annoying "../common/" prefix to filename.
- */
- newfile = strrchr(file, '/');
- if (newfile != NULL) {
- newfile = newfile + 1; /* Get rid of leading / */
- } else {
- newfile = file;
- }
-
- va_start(adx, fmt);
- (void) vsnprintf(buf, sizeof (buf), fmt, adx);
- va_end(adx);
-
- /*
- * To get this data, use the zfs-dprintf probe as so:
- * dtrace -q -n 'zfs-dprintf \
- * /stringof(arg0) == "dbuf.c"/ \
- * {printf("%s: %s", stringof(arg1), stringof(arg3))}'
- * arg0 = file name
- * arg1 = function name
- * arg2 = line number
- * arg3 = message
- */
- DTRACE_PROBE4(zfs__dprintf,
- char *, newfile, char *, func, int, line, char *, buf);
-}
-
-/*
- * Policy for top-level read operations (list pools). Requires no privileges,
- * and can be used in the local zone, as there is no associated dataset.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_none(const char *unused1, cred_t *cr)
-{
- return (0);
-}
-
-/*
- * Policy for dataset read operations (list children, get statistics). Requires
- * no privileges, but must be visible in the local zone.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_read(const char *dataset, cred_t *cr)
-{
- if (INGLOBALZONE(curproc) ||
- zone_dataset_visible(dataset, NULL))
- return (0);
-
- return (ENOENT);
-}
-
-static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
-{
- uint64_t zoned;
- int writable = 1;
-
- /*
- * The dataset must be visible by this zone -- check this first
- * so they don't see EPERM on something they shouldn't know about.
- */
- if (!INGLOBALZONE(curproc) &&
- !zone_dataset_visible(dataset, &writable))
- return (ENOENT);
-
- if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
- return (ENOENT);
-
- if (INGLOBALZONE(curproc)) {
- /*
- * If the fs is zoned, only root can access it from the
- * global zone.
- */
- if (secpolicy_zfs(cr) && zoned)
- return (EPERM);
- } else {
- /*
- * If we are in a local zone, the 'zoned' property must be set.
- */
- if (!zoned)
- return (EPERM);
-
- /* must be writable by this zone */
- if (!writable)
- return (EPERM);
- }
- return (0);
-}
-
-/*
- * Policy for dataset write operations (create children, set properties, etc).
- * Requires SYS_MOUNT privilege, and must be writable in the local zone.
- */
-int
-zfs_secpolicy_write(const char *dataset, cred_t *cr)
-{
- int error;
-
- if (error = zfs_dozonecheck(dataset, cr))
- return (error);
-
- return (secpolicy_zfs(cr));
-}
-
-/*
- * Policy for operations that want to write a dataset's parent:
- * create, destroy, snapshot, clone, restore.
- */
-static int
-zfs_secpolicy_parent(const char *dataset, cred_t *cr)
-{
- char parentname[MAXNAMELEN];
- char *cp;
-
- /*
- * Remove the @bla or /bla from the end of the name to get the parent.
- */
- (void) strncpy(parentname, dataset, sizeof (parentname));
- cp = strrchr(parentname, '@');
- if (cp != NULL) {
- cp[0] = '\0';
- } else {
- cp = strrchr(parentname, '/');
- if (cp == NULL)
- return (ENOENT);
- cp[0] = '\0';
-
- }
-
- return (zfs_secpolicy_write(parentname, cr));
-}
-
-/*
- * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
- * SYS_CONFIG privilege, which is not available in a local zone.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_config(const char *unused, cred_t *cr)
-{
- if (secpolicy_sys_config(cr, B_FALSE) != 0)
- return (EPERM);
-
- return (0);
-}
-
-/*
- * Policy for fault injection. Requires all privileges.
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_inject(const char *unused, cred_t *cr)
-{
- return (secpolicy_zinject(cr));
-}
-
-/*
- * Policy for dataset backup operations (sendbackup).
- * Requires SYS_MOUNT privilege, and must be writable in the local zone.
- */
-static int
-zfs_secpolicy_operator(const char *dataset, cred_t *cr)
-{
- int writable = 1;
-
- if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable))
- return (ENOENT);
- if (secpolicy_zfs(cr) != 0 && !groupmember(GID_OPERATOR, cr))
- return (EPERM);
- return (0);
-}
-
-/*
- * Returns the nvlist as specified by the user in the zfs_cmd_t.
- */
-static int
-get_nvlist(zfs_cmd_t *zc, nvlist_t **nvp)
-{
- char *packed;
- size_t size;
- int error;
- nvlist_t *config = NULL;
-
- /*
- * Read in and unpack the user-supplied nvlist.
- */
- if ((size = zc->zc_nvlist_src_size) == 0)
- return (EINVAL);
-
- packed = kmem_alloc(size, KM_SLEEP);
-
- if ((error = xcopyin((void *)(uintptr_t)zc->zc_nvlist_src, packed,
- size)) != 0) {
- kmem_free(packed, size);
- return (error);
- }
-
- if ((error = nvlist_unpack(packed, size, &config, 0)) != 0) {
- kmem_free(packed, size);
- return (error);
- }
-
- kmem_free(packed, size);
-
- *nvp = config;
- return (0);
-}
-
-static int
-put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
-{
- char *packed = NULL;
- size_t size;
- int error;
-
- VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
-
- if (size > zc->zc_nvlist_dst_size) {
- /*
- * Solaris returns ENOMEM here, because even if an error is
- * returned from an ioctl(2), new zc_nvlist_dst_size will be
- * passed to the userland. This is not the case for FreeBSD.
- * We need to return 0, so the kernel will copy the
- * zc_nvlist_dst_size back and the userland can discover that a
- * bigger buffer is needed.
- */
- error = 0;
- } else {
- VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
- KM_SLEEP) == 0);
- error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
- size);
- kmem_free(packed, size);
- }
-
- zc->zc_nvlist_dst_size = size;
- return (error);
-}
-
-static int
-zfs_ioc_pool_create(zfs_cmd_t *zc)
-{
- int error;
- nvlist_t *config;
-
- if ((error = get_nvlist(zc, &config)) != 0)
- return (error);
-
- error = spa_create(zc->zc_name, config, zc->zc_value[0] == '\0' ?
- NULL : zc->zc_value);
-
- nvlist_free(config);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_destroy(zfs_cmd_t *zc)
-{
- return (spa_destroy(zc->zc_name));
-}
-
-static int
-zfs_ioc_pool_import(zfs_cmd_t *zc)
-{
- int error;
- nvlist_t *config;
- uint64_t guid;
-
- if ((error = get_nvlist(zc, &config)) != 0)
- return (error);
-
- if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
- guid != zc->zc_guid)
- error = EINVAL;
- else
- error = spa_import(zc->zc_name, config,
- zc->zc_value[0] == '\0' ? NULL : zc->zc_value);
-
- nvlist_free(config);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_export(zfs_cmd_t *zc)
-{
- return (spa_export(zc->zc_name, NULL));
-}
-
-static int
-zfs_ioc_pool_configs(zfs_cmd_t *zc)
-{
- nvlist_t *configs;
- int error;
-
- if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
- return (EEXIST);
-
- error = put_nvlist(zc, configs);
-
- nvlist_free(configs);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_stats(zfs_cmd_t *zc)
-{
- nvlist_t *config;
- int error;
- int ret = 0;
-
- error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
- sizeof (zc->zc_value));
-
- if (config != NULL) {
- ret = put_nvlist(zc, config);
- nvlist_free(config);
-
- /*
- * The config may be present even if 'error' is non-zero.
- * In this case we return success, and preserve the real errno
- * in 'zc_cookie'.
- */
- zc->zc_cookie = error;
- } else {
- ret = error;
- }
-
- return (ret);
-}
-
-/*
- * Try to import the given pool, returning pool stats as appropriate so that
- * user land knows which devices are available and overall pool health.
- */
-static int
-zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
-{
- nvlist_t *tryconfig, *config;
- int error;
-
- if ((error = get_nvlist(zc, &tryconfig)) != 0)
- return (error);
-
- config = spa_tryimport(tryconfig);
-
- nvlist_free(tryconfig);
-
- if (config == NULL)
- return (EINVAL);
-
- error = put_nvlist(zc, config);
- nvlist_free(config);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_freeze(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error == 0) {
- spa_freeze(spa);
- spa_close(spa, FTAG);
- }
- return (error);
-}
-
-static int
-zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- spa_upgrade(spa);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_get_history(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *hist_buf;
- uint64_t size;
- int error;
-
- if ((size = zc->zc_history_len) == 0)
- return (EINVAL);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- hist_buf = kmem_alloc(size, KM_SLEEP);
- if ((error = spa_history_get(spa, &zc->zc_history_offset,
- &zc->zc_history_len, hist_buf)) == 0) {
- error = xcopyout(hist_buf, (char *)(uintptr_t)zc->zc_history,
- zc->zc_history_len);
- }
-
- spa_close(spa, FTAG);
- kmem_free(hist_buf, size);
- return (error);
-}
-
-static int
-zfs_ioc_pool_log_history(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *history_str = NULL;
- size_t size;
- int error;
-
- size = zc->zc_history_len;
- if (size == 0 || size > HIS_MAX_RECORD_LEN)
- return (EINVAL);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if (spa_version(spa) < ZFS_VERSION_ZPOOL_HISTORY) {
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- /* add one for the NULL delimiter */
- size++;
- history_str = kmem_alloc(size, KM_SLEEP);
- if ((error = xcopyin((void *)(uintptr_t)zc->zc_history, history_str,
- size)) != 0) {
- spa_close(spa, FTAG);
- kmem_free(history_str, size);
- return (error);
- }
- history_str[size - 1] = '\0';
-
- error = spa_history_log(spa, history_str, zc->zc_history_offset);
-
- spa_close(spa, FTAG);
- kmem_free(history_str, size);
-
- return (error);
-}
-
-static int
-zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
-{
- int error;
-
- if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value))
- return (error);
-
- return (0);
-}
-
-static int
-zfs_ioc_obj_to_path(zfs_cmd_t *zc)
-{
- objset_t *osp;
- int error;
-
- if ((error = dmu_objset_open(zc->zc_name, DMU_OST_ZFS,
- DS_MODE_NONE | DS_MODE_READONLY, &osp)) != 0)
- return (error);
-
- error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value,
- sizeof (zc->zc_value));
- dmu_objset_close(osp);
-
- return (error);
-}
-
-static int
-zfs_ioc_vdev_add(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- nvlist_t *config;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- /*
- * A root pool with concatenated devices is not supported.
- * Thus, can not add a device to a root pool with one device.
- */
- if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) {
- spa_close(spa, FTAG);
- return (EDOM);
- }
-
- if ((error = get_nvlist(zc, &config)) == 0) {
- error = spa_vdev_add(spa, config);
- nvlist_free(config);
- }
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_remove(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
- error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_online(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
- error = vdev_online(spa, zc->zc_guid);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_offline(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int istmp = zc->zc_cookie;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
- error = vdev_offline(spa, zc->zc_guid, istmp);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_attach(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int replacing = zc->zc_cookie;
- nvlist_t *config;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- if ((error = get_nvlist(zc, &config)) == 0) {
- error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
- nvlist_free(config);
- }
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_detach(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
-
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
-{
- spa_t *spa;
- char *path = zc->zc_value;
- uint64_t guid = zc->zc_guid;
- int error;
-
- error = spa_open(zc->zc_name, &spa, FTAG);
- if (error != 0)
- return (error);
-
- error = spa_vdev_setpath(spa, guid, path);
- spa_close(spa, FTAG);
- return (error);
-}
-
-static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
-{
- objset_t *os = NULL;
- int error;
- nvlist_t *nv;
-
-retry:
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os);
- if (error != 0) {
- /*
- * This is ugly: dmu_objset_open() can return EBUSY if
- * the objset is held exclusively. Fortunately this hold is
- * only for a short while, so we retry here.
- * This avoids user code having to handle EBUSY,
- * for example for a "zfs list".
- */
- if (error == EBUSY) {
- delay(1);
- goto retry;
- }
- return (error);
- }
-
- dmu_objset_fast_stat(os, &zc->zc_objset_stats);
-
- if (zc->zc_nvlist_dst != 0 &&
- (error = dsl_prop_get_all(os, &nv)) == 0) {
- dmu_objset_stats(os, nv);
- /*
- * NB: zvol_get_stats() will read the objset contents,
- * which we aren't supposed to do with a
- * DS_MODE_STANDARD open, because it could be
- * inconsistent. So this is a bit of a workaround...
- */
- if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL)
- VERIFY(zvol_get_stats(os, nv) == 0);
- error = put_nvlist(zc, nv);
- nvlist_free(nv);
- }
-
- spa_altroot(dmu_objset_spa(os), zc->zc_value, sizeof (zc->zc_value));
-
- dmu_objset_close(os);
- if (error == ENOMEM)
- error = 0;
- return (error);
-}
-
-static int
-zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
- char *p;
-
-retry:
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os);
- if (error != 0) {
- /*
- * This is ugly: dmu_objset_open() can return EBUSY if
- * the objset is held exclusively. Fortunately this hold is
- * only for a short while, so we retry here.
- * This avoids user code having to handle EBUSY,
- * for example for a "zfs list".
- */
- if (error == EBUSY) {
- delay(1);
- goto retry;
- }
- if (error == ENOENT)
- error = ESRCH;
- return (error);
- }
-
- p = strrchr(zc->zc_name, '/');
- if (p == NULL || p[1] != '\0')
- (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
- p = zc->zc_name + strlen(zc->zc_name);
-
- do {
- error = dmu_dir_list_next(os,
- sizeof (zc->zc_name) - (p - zc->zc_name), p,
- NULL, &zc->zc_cookie);
- if (error == ENOENT)
- error = ESRCH;
- } while (error == 0 && !INGLOBALZONE(curproc) &&
- !zone_dataset_visible(zc->zc_name, NULL));
-
- /*
- * If it's a hidden dataset (ie. with a '$' in its name), don't
- * try to get stats for it. Userland will skip over it.
- */
- if (error == 0 && strchr(zc->zc_name, '$') == NULL)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
- dmu_objset_close(os);
- return (error);
-}
-
-static int
-zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
-{
- objset_t *os;
- int error;
-
-retry:
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os);
- if (error != 0) {
- /*
- * This is ugly: dmu_objset_open() can return EBUSY if
- * the objset is held exclusively. Fortunately this hold is
- * only for a short while, so we retry here.
- * This avoids user code having to handle EBUSY,
- * for example for a "zfs list".
- */
- if (error == EBUSY) {
- delay(1);
- goto retry;
- }
- if (error == ENOENT)
- error = ESRCH;
- return (error);
- }
-
- /*
- * A dataset name of maximum length cannot have any snapshots,
- * so exit immediately.
- */
- if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
- dmu_objset_close(os);
- return (ESRCH);
- }
-
- error = dmu_snapshot_list_next(os,
- sizeof (zc->zc_name) - strlen(zc->zc_name),
- zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie);
- if (error == ENOENT)
- error = ESRCH;
-
- if (error == 0)
- error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
- dmu_objset_close(os);
- return (error);
-}
-
-static int
-zfs_set_prop_nvlist(const char *name, dev_t dev, cred_t *cr, nvlist_t *nvl)
-{
- nvpair_t *elem;
- int error;
- const char *propname;
- zfs_prop_t prop;
- uint64_t intval;
- char *strval;
- char buf[MAXNAMELEN];
- const char *p;
- spa_t *spa;
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
- propname = nvpair_name(elem);
-
- if ((prop = zfs_name_to_prop(propname)) ==
- ZFS_PROP_INVAL) {
- /*
- * If this is a user-defined property, it must be a
- * string, and there is no further validation to do.
- */
- if (!zfs_prop_user(propname) ||
- nvpair_type(elem) != DATA_TYPE_STRING)
- return (EINVAL);
-
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- error = dsl_prop_set(name, propname, 1,
- strlen(strval) + 1, strval);
- if (error == 0)
- continue;
- else
- return (error);
- }
-
- /*
- * Check permissions for special properties.
- */
- switch (prop) {
- case ZFS_PROP_ZONED:
- /*
- * Disallow setting of 'zoned' from within a local zone.
- */
- if (!INGLOBALZONE(curproc))
- return (EPERM);
- break;
-
- case ZFS_PROP_QUOTA:
- if (error = zfs_dozonecheck(name, cr))
- return (error);
-
- if (!INGLOBALZONE(curproc)) {
- uint64_t zoned;
- char setpoint[MAXNAMELEN];
- int dslen;
- /*
- * Unprivileged users are allowed to modify the
- * quota on things *under* (ie. contained by)
- * the thing they own.
- */
- if (dsl_prop_get_integer(name, "jailed", &zoned,
- setpoint))
- return (EPERM);
- if (!zoned) /* this shouldn't happen */
- return (EPERM);
- dslen = strlen(name);
- if (dslen <= strlen(setpoint))
- return (EPERM);
- }
- break;
-
- case ZFS_PROP_COMPRESSION:
- /*
- * If the user specified gzip compression, make sure
- * the SPA supports it. We ignore any errors here since
- * we'll catch them later.
- */
- if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(elem, &intval) == 0 &&
- intval >= ZIO_COMPRESS_GZIP_1 &&
- intval <= ZIO_COMPRESS_GZIP_9) {
- if ((p = strchr(name, '/')) == NULL) {
- p = name;
- } else {
- bcopy(name, buf, p - name);
- buf[p - name] = '\0';
- p = buf;
- }
-
- if (spa_open(p, &spa, FTAG) == 0) {
- if (spa_version(spa) <
- ZFS_VERSION_GZIP_COMPRESSION) {
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- spa_close(spa, FTAG);
- }
- }
- break;
- }
-
- switch (prop) {
- case ZFS_PROP_QUOTA:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_quota(name,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_RESERVATION:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = dsl_dir_set_reservation(name,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VOLSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volsize(name, dev,
- intval)) != 0)
- return (error);
- break;
-
- case ZFS_PROP_VOLBLOCKSIZE:
- if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
- (error = zvol_set_volblocksize(name,
- intval)) != 0)
- return (error);
- break;
-
- default:
- if (nvpair_type(elem) == DATA_TYPE_STRING) {
- if (zfs_prop_get_type(prop) !=
- prop_type_string)
- return (EINVAL);
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- if ((error = dsl_prop_set(name,
- nvpair_name(elem), 1, strlen(strval) + 1,
- strval)) != 0)
- return (error);
- } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
- const char *unused;
-
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
-
- switch (zfs_prop_get_type(prop)) {
- case prop_type_number:
- break;
- case prop_type_boolean:
- if (intval > 1)
- return (EINVAL);
- break;
- case prop_type_string:
- return (EINVAL);
- case prop_type_index:
- if (zfs_prop_index_to_string(prop,
- intval, &unused) != 0)
- return (EINVAL);
- break;
- default:
- cmn_err(CE_PANIC, "unknown property "
- "type");
- break;
- }
-
- if ((error = dsl_prop_set(name, propname,
- 8, 1, &intval)) != 0)
- return (error);
- } else {
- return (EINVAL);
- }
- break;
- }
- }
-
- return (0);
-}
-
-static int
-zfs_ioc_set_prop(zfs_cmd_t *zc)
-{
- nvlist_t *nvl;
- int error;
- zfs_prop_t prop;
-
- /*
- * If zc_value is set, then this is an attempt to inherit a value.
- * Otherwise, zc_nvlist refers to a list of properties to set.
- */
- if (zc->zc_value[0] != '\0') {
- if (!zfs_prop_user(zc->zc_value) &&
- ((prop = zfs_name_to_prop(zc->zc_value)) ==
- ZFS_PROP_INVAL ||
- !zfs_prop_inheritable(prop)))
- return (EINVAL);
-
- return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL));
- }
-
- if ((error = get_nvlist(zc, &nvl)) != 0)
- return (error);
-
- error = zfs_set_prop_nvlist(zc->zc_name, zc->zc_dev,
- (cred_t *)(uintptr_t)zc->zc_cred, nvl);
- nvlist_free(nvl);
- return (error);
-}
-
-static int
-zfs_ioc_pool_set_props(zfs_cmd_t *zc)
-{
- nvlist_t *nvl;
- int error, reset_bootfs = 0;
- uint64_t objnum;
- zpool_prop_t prop;
- nvpair_t *elem;
- char *propname, *strval;
- spa_t *spa;
- vdev_t *rvdev;
- char *vdev_type;
- objset_t *os;
-
- if ((error = get_nvlist(zc, &nvl)) != 0)
- return (error);
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
- nvlist_free(nvl);
- return (error);
- }
-
- if (spa_version(spa) < ZFS_VERSION_BOOTFS) {
- nvlist_free(nvl);
- spa_close(spa, FTAG);
- return (ENOTSUP);
- }
-
- elem = NULL;
- while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
-
- propname = nvpair_name(elem);
-
- if ((prop = zpool_name_to_prop(propname)) ==
- ZFS_PROP_INVAL) {
- nvlist_free(nvl);
- spa_close(spa, FTAG);
- return (EINVAL);
- }
-
- switch (prop) {
- case ZFS_PROP_BOOTFS:
- /*
- * A bootable filesystem can not be on a RAIDZ pool
- * nor a striped pool with more than 1 device.
- */
- rvdev = spa->spa_root_vdev;
- vdev_type =
- rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
- if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
- (strcmp(vdev_type, VDEV_TYPE_MIRROR) != 0 &&
- rvdev->vdev_children > 1)) {
- error = ENOTSUP;
- break;
- }
-
- reset_bootfs = 1;
-
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- if (strval == NULL || strval[0] == '\0') {
- objnum =
- zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
- break;
- }
-
- if (error = dmu_objset_open(strval, DMU_OST_ZFS,
- DS_MODE_STANDARD | DS_MODE_READONLY, &os))
- break;
- objnum = dmu_objset_id(os);
- dmu_objset_close(os);
- break;
-
- default:
- error = EINVAL;
- }
-
- if (error)
- break;
- }
- if (error == 0) {
- if (reset_bootfs) {
- VERIFY(nvlist_remove(nvl,
- zpool_prop_to_name(ZFS_PROP_BOOTFS),
- DATA_TYPE_STRING) == 0);
- VERIFY(nvlist_add_uint64(nvl,
- zpool_prop_to_name(ZFS_PROP_BOOTFS), objnum) == 0);
- }
- error = spa_set_props(spa, nvl);
- }
-
- nvlist_free(nvl);
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_pool_get_props(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- nvlist_t *nvp = NULL;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_get_props(spa, &nvp);
-
- if (error == 0 && zc->zc_nvlist_dst != 0)
- error = put_nvlist(zc, nvp);
- else
- error = EFAULT;
-
- spa_close(spa, FTAG);
-
- if (nvp)
- nvlist_free(nvp);
- return (error);
-}
-
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
- return (zvol_create_minor(zc->zc_name, zc->zc_dev));
-}
-
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
- return (zvol_remove_minor(zc->zc_name));
-}
-
-/*
- * Search the vfs list for a specified resource. Returns a pointer to it
- * or NULL if no suitable entry is found. The caller of this routine
- * is responsible for releasing the returned vfs pointer.
- */
-static vfs_t *
-zfs_get_vfs(const char *resource)
-{
- vfs_t *vfsp;
-
- mtx_lock(&mountlist_mtx);
- TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
- if (strcmp(vfsp->mnt_stat.f_mntfromname, resource) == 0) {
- VFS_HOLD(vfsp);
- break;
- }
- }
- mtx_unlock(&mountlist_mtx);
- return (vfsp);
-}
-
-static void
-zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
-{
- zfs_create_data_t *zc = arg;
-
- zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
-}
-
-static int
-zfs_ioc_create(zfs_cmd_t *zc)
-{
- objset_t *clone;
- int error = 0;
- zfs_create_data_t cbdata = { 0 };
- void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
- dmu_objset_type_t type = zc->zc_objset_type;
-
- switch (type) {
-
- case DMU_OST_ZFS:
- cbfunc = zfs_create_cb;
- break;
-
- case DMU_OST_ZVOL:
- cbfunc = zvol_create_cb;
- break;
-
- default:
- cbfunc = NULL;
- }
- if (strchr(zc->zc_name, '@'))
- return (EINVAL);
-
- if (zc->zc_nvlist_src != 0 &&
- (error = get_nvlist(zc, &cbdata.zc_props)) != 0)
- return (error);
-
- cbdata.zc_cred = (cred_t *)(uintptr_t)zc->zc_cred;
- cbdata.zc_dev = (dev_t)zc->zc_dev;
-
- if (zc->zc_value[0] != '\0') {
- /*
- * We're creating a clone of an existing snapshot.
- */
- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- error = dmu_objset_open(zc->zc_value, type,
- DS_MODE_STANDARD | DS_MODE_READONLY, &clone);
- if (error) {
- nvlist_free(cbdata.zc_props);
- return (error);
- }
- error = dmu_objset_create(zc->zc_name, type, clone, NULL, NULL);
- dmu_objset_close(clone);
- } else {
- if (cbfunc == NULL) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- if (type == DMU_OST_ZVOL) {
- uint64_t volsize, volblocksize;
-
- if (cbdata.zc_props == NULL ||
- nvlist_lookup_uint64(cbdata.zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE),
- &volsize) != 0) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- if ((error = nvlist_lookup_uint64(cbdata.zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
- &volblocksize)) != 0 && error != ENOENT) {
- nvlist_free(cbdata.zc_props);
- return (EINVAL);
- }
-
- if (error != 0)
- volblocksize = zfs_prop_default_numeric(
- ZFS_PROP_VOLBLOCKSIZE);
-
- if ((error = zvol_check_volblocksize(
- volblocksize)) != 0 ||
- (error = zvol_check_volsize(volsize,
- volblocksize)) != 0) {
- nvlist_free(cbdata.zc_props);
- return (error);
- }
- }
-
- error = dmu_objset_create(zc->zc_name, type, NULL, cbfunc,
- &cbdata);
- }
-
- /*
- * It would be nice to do this atomically.
- */
- if (error == 0) {
- if ((error = zfs_set_prop_nvlist(zc->zc_name,
- zc->zc_dev, (cred_t *)(uintptr_t)zc->zc_cred,
- cbdata.zc_props)) != 0)
- (void) dmu_objset_destroy(zc->zc_name);
- }
-
- nvlist_free(cbdata.zc_props);
- return (error);
-}
-
-static int
-zfs_ioc_snapshot(zfs_cmd_t *zc)
-{
- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
- return (EINVAL);
- return (dmu_objset_snapshot(zc->zc_name,
- zc->zc_value, zc->zc_cookie));
-}
-
-int
-zfs_unmount_snap(char *name, void *arg)
-{
- char *snapname = arg;
- char *cp;
- vfs_t *vfsp = NULL;
-
- /*
- * Snapshots (which are under .zfs control) must be unmounted
- * before they can be destroyed.
- */
-
- if (snapname) {
- (void) strcat(name, "@");
- (void) strcat(name, snapname);
- vfsp = zfs_get_vfs(name);
- cp = strchr(name, '@');
- *cp = '\0';
- } else if (strchr(name, '@')) {
- vfsp = zfs_get_vfs(name);
- }
-
- if (vfsp) {
- /*
- * Always force the unmount for snapshots.
- */
- int flag = MS_FORCE;
- int err;
-
- if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
- VFS_RELE(vfsp);
- return (err);
- }
- VFS_RELE(vfsp);
- mtx_lock(&Giant); /* dounmount() */
- dounmount(vfsp, flag, curthread);
- mtx_unlock(&Giant); /* dounmount() */
- }
- return (0);
-}
-
-static int
-zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
-{
- int err;
-
- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
- return (EINVAL);
- err = dmu_objset_find(zc->zc_name,
- zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
- if (err)
- return (err);
- return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value));
-}
-
-static int
-zfs_ioc_destroy(zfs_cmd_t *zc)
-{
- if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
- int err = zfs_unmount_snap(zc->zc_name, NULL);
- if (err)
- return (err);
- }
-
- return (dmu_objset_destroy(zc->zc_name));
-}
-
-static int
-zfs_ioc_rollback(zfs_cmd_t *zc)
-{
- return (dmu_objset_rollback(zc->zc_name));
-}
-
-static int
-zfs_ioc_rename(zfs_cmd_t *zc)
-{
- int recursive = zc->zc_cookie & 1;
-
- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
- return (EINVAL);
-
- /*
- * Unmount snapshot unless we're doing a recursive rename,
- * in which case the dataset code figures out which snapshots
- * to unmount.
- */
- if (!recursive && strchr(zc->zc_name, '@') != NULL &&
- zc->zc_objset_type == DMU_OST_ZFS) {
- int err = zfs_unmount_snap(zc->zc_name, NULL);
- if (err)
- return (err);
- }
-
- return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
-}
-
-static int
-zfs_ioc_recvbackup(zfs_cmd_t *zc)
-{
- kthread_t *td = curthread;
- struct file *fp;
- int error;
- offset_t new_off;
-
- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
- strchr(zc->zc_value, '@') == NULL)
- return (EINVAL);
-
- error = fget_read(td, zc->zc_cookie, &fp);
- if (error)
- return (error);
-
- error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
- &zc->zc_cookie, (boolean_t)zc->zc_guid, fp,
- fp->f_offset);
-
- new_off = fp->f_offset + zc->zc_cookie;
- fp->f_offset = new_off;
-
- fdrop(fp, td);
- return (error);
-}
-
-static int
-zfs_ioc_sendbackup(zfs_cmd_t *zc)
-{
- kthread_t *td = curthread;
- struct file *fp;
- objset_t *fromsnap = NULL;
- objset_t *tosnap;
- int error, fd;
-
- error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
- if (error)
- return (error);
-
- if (zc->zc_value[0] != '\0') {
- char buf[MAXPATHLEN];
- char *cp;
-
- (void) strncpy(buf, zc->zc_name, sizeof (buf));
- cp = strchr(buf, '@');
- if (cp)
- *(cp+1) = 0;
- (void) strlcat(buf, zc->zc_value, sizeof (buf));
- error = dmu_objset_open(buf, DMU_OST_ANY,
- DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
- if (error) {
- dmu_objset_close(tosnap);
- return (error);
- }
- }
-
- fd = zc->zc_cookie;
- error = fget_write(td, fd, &fp);
- if (error) {
- dmu_objset_close(tosnap);
- if (fromsnap)
- dmu_objset_close(fromsnap);
- return (error);
- }
-
- error = dmu_sendbackup(tosnap, fromsnap, fp);
-
- fdrop(fp, td);
- if (fromsnap)
- dmu_objset_close(fromsnap);
- dmu_objset_close(tosnap);
- return (error);
-}
-
-static int
-zfs_ioc_inject_fault(zfs_cmd_t *zc)
-{
- int id, error;
-
- error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
- &zc->zc_inject_record);
-
- if (error == 0)
- zc->zc_guid = (uint64_t)id;
-
- return (error);
-}
-
-static int
-zfs_ioc_clear_fault(zfs_cmd_t *zc)
-{
- return (zio_clear_fault((int)zc->zc_guid));
-}
-
-static int
-zfs_ioc_inject_list_next(zfs_cmd_t *zc)
-{
- int id = (int)zc->zc_guid;
- int error;
-
- error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
- &zc->zc_inject_record);
-
- zc->zc_guid = id;
-
- return (error);
-}
-
-static int
-zfs_ioc_error_log(zfs_cmd_t *zc)
-{
- spa_t *spa;
- int error;
- size_t count = (size_t)zc->zc_nvlist_dst_size;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
- &count);
- if (error == 0)
- zc->zc_nvlist_dst_size = count;
- else
- zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
-
- spa_close(spa, FTAG);
-
- return (error);
-}
-
-static int
-zfs_ioc_clear(zfs_cmd_t *zc)
-{
- spa_t *spa;
- vdev_t *vd;
- int error;
-
- if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
- return (error);
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- if (zc->zc_guid == 0) {
- vd = NULL;
- } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
- spa_config_exit(spa, FTAG);
- spa_close(spa, FTAG);
- return (ENODEV);
- }
-
- vdev_clear(spa, vd);
-
- spa_config_exit(spa, FTAG);
-
- spa_close(spa, FTAG);
-
- return (0);
-}
-
-static int
-zfs_ioc_promote(zfs_cmd_t *zc)
-{
- char *cp;
-
- /*
- * We don't need to unmount *all* the origin fs's snapshots, but
- * it's easier.
- */
- cp = strchr(zc->zc_value, '@');
- if (cp)
- *cp = '\0';
- (void) dmu_objset_find(zc->zc_value,
- zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
- return (dsl_dataset_promote(zc->zc_name));
-}
-
-static int
-zfs_ioc_jail(zfs_cmd_t *zc)
-{
-
- return (zone_dataset_attach((cred_t *)(uintptr_t)zc->zc_cred,
- zc->zc_name, (int)zc->zc_jailid));
-}
-
-static int
-zfs_ioc_unjail(zfs_cmd_t *zc)
-{
-
- return (zone_dataset_detach((cred_t *)(uintptr_t)zc->zc_cred,
- zc->zc_name, (int)zc->zc_jailid));
-}
-
-static zfs_ioc_vec_t zfs_ioc_vec[] = {
- { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_import, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_export, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_configs, zfs_secpolicy_none, no_name },
- { zfs_ioc_pool_stats, zfs_secpolicy_read, pool_name },
- { zfs_ioc_pool_tryimport, zfs_secpolicy_config, no_name },
- { zfs_ioc_pool_scrub, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_freeze, zfs_secpolicy_config, no_name },
- { zfs_ioc_pool_upgrade, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_get_history, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_log_history, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_add, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_remove, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_online, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_offline, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_attach, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_detach, zfs_secpolicy_config, pool_name },
- { zfs_ioc_vdev_setpath, zfs_secpolicy_config, pool_name },
- { zfs_ioc_objset_stats, zfs_secpolicy_read, dataset_name },
- { zfs_ioc_dataset_list_next, zfs_secpolicy_read, dataset_name },
- { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, dataset_name },
- { zfs_ioc_set_prop, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_create_minor, zfs_secpolicy_config, dataset_name },
- { zfs_ioc_remove_minor, zfs_secpolicy_config, dataset_name },
- { zfs_ioc_create, zfs_secpolicy_parent, dataset_name },
- { zfs_ioc_destroy, zfs_secpolicy_parent, dataset_name },
- { zfs_ioc_rollback, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_sendbackup, zfs_secpolicy_operator, dataset_name },
- { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name },
- { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name },
- { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name },
- { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name },
- { zfs_ioc_clear, zfs_secpolicy_config, pool_name },
- { zfs_ioc_promote, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_destroy_snaps, zfs_secpolicy_write, dataset_name },
- { zfs_ioc_snapshot, zfs_secpolicy_operator, dataset_name },
- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, pool_name },
- { zfs_ioc_obj_to_path, zfs_secpolicy_config, no_name },
- { zfs_ioc_pool_set_props, zfs_secpolicy_config, pool_name },
- { zfs_ioc_pool_get_props, zfs_secpolicy_read, pool_name },
- { zfs_ioc_jail, zfs_secpolicy_config, dataset_name },
- { zfs_ioc_unjail, zfs_secpolicy_config, dataset_name }
-};
-
-static int
-zfsdev_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
- struct thread *td)
-{
- zfs_cmd_t *zc = (void *)addr;
- uint_t vec;
- int error;
-
- vec = ZFS_IOC(cmd);
-
- if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
- return (EINVAL);
-
- zc->zc_cred = (uintptr_t)td->td_ucred;
- zc->zc_dev = (uintptr_t)dev;
- error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name, td->td_ucred);
-
- /*
- * Ensure that all pool/dataset names are valid before we pass down to
- * the lower layers.
- */
- if (error == 0) {
- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
- switch (zfs_ioc_vec[vec].zvec_namecheck) {
- case pool_name:
- if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
- error = EINVAL;
- break;
-
- case dataset_name:
- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
- error = EINVAL;
- break;
-
- case no_name:
- break;
- }
- }
-
- if (error == 0)
- error = zfs_ioc_vec[vec].zvec_func(zc);
-
- return (error);
-}
-
-/*
- * OK, so this is a little weird.
- *
- * /dev/zfs is the control node, i.e. minor 0.
- * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
- *
- * /dev/zfs has basically nothing to do except serve up ioctls,
- * so most of the standard driver entry points are in zvol.c.
- */
-static struct cdevsw zfs_cdevsw = {
- .d_version = D_VERSION,
- .d_ioctl = zfsdev_ioctl,
- .d_name = ZFS_DEV_NAME
-};
-
-static void
-zfsdev_init(void)
-{
- zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0660,
- ZFS_DEV_NAME);
-}
-
-static void
-zfsdev_fini(void)
-{
- if (zfsdev != NULL)
- destroy_dev(zfsdev);
-}
-
-static struct task zfs_start_task;
-static struct root_hold_token *zfs_root_token;
-
-static void
-zfs_start(void *context __unused, int pending __unused)
-{
-
- zfsdev_init();
- spa_init(FREAD | FWRITE);
- zfs_init();
- zvol_init();
- printf("ZFS storage pool version " ZFS_VERSION_STRING "\n");
- root_mount_rel(zfs_root_token);
-}
-
-static int
-zfs_modevent(module_t mod, int type, void *unused __unused)
-{
- int error;
-
- error = EOPNOTSUPP;
- switch (type) {
- case MOD_LOAD:
- zfs_root_token = root_mount_hold("ZFS");
- printf("WARNING: ZFS is considered to be an experimental "
- "feature in FreeBSD.\n");
- TASK_INIT(&zfs_start_task, 0, zfs_start, NULL);
- taskqueue_enqueue(taskqueue_thread, &zfs_start_task);
- error = 0;
- break;
- case MOD_UNLOAD:
- if (spa_busy() || zfs_busy() || zvol_busy() ||
- zio_injection_enabled) {
- error = EBUSY;
- break;
- }
- zvol_fini();
- zfs_fini();
- spa_fini();
- zfsdev_fini();
- error = 0;
- break;
- }
- return (error);
-}
-
-static moduledata_t zfs_mod = {
- "zfsctrl",
- zfs_modevent,
- 0
-};
-DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
deleted file mode 100644
index dde9ec1..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/file.h>
-#include <sys/vfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/byteorder.h>
-#include <sys/policy.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-
-/*
- * All the functions in this file are used to construct the log entries
- * to record transactions. They allocate * a intent log transaction
- * structure (itx_t) and save within it all the information necessary to
- * possibly replay the transaction. The itx is then assigned a sequence
- * number and inserted in the in-memory list anchored in the zilog.
- */
-
-/*
- * zfs_log_create() is used to handle TX_CREATE, TX_MKDIR and TX_MKXATTR
- * transactions.
- */
-void
-zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name)
-{
- itx_t *itx;
- uint64_t seq;
- lr_create_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_create_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- lr->lr_rdev = zp->z_phys->zp_rdev;
- bcopy(name, (char *)(lr + 1), namesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
- */
-void
-zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, char *name)
-{
- itx_t *itx;
- uint64_t seq;
- lr_remove_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_remove_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_link() handles TX_LINK transactions.
- */
-void
-zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name)
-{
- itx_t *itx;
- uint64_t seq;
- lr_link_t *lr;
- size_t namesize = strlen(name) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
- lr = (lr_link_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_link_obj = zp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_symlink() handles TX_SYMLINK transactions.
- */
-void
-zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *dzp, znode_t *zp, char *name, char *link)
-{
- itx_t *itx;
- uint64_t seq;
- lr_create_t *lr;
- size_t namesize = strlen(name) + 1;
- size_t linksize = strlen(link) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
- lr = (lr_create_t *)&itx->itx_lr;
- lr->lr_doid = dzp->z_id;
- lr->lr_foid = zp->z_id;
- lr->lr_mode = zp->z_phys->zp_mode;
- lr->lr_uid = zp->z_phys->zp_uid;
- lr->lr_gid = zp->z_phys->zp_gid;
- lr->lr_gen = zp->z_phys->zp_gen;
- lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
- lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
- bcopy(name, (char *)(lr + 1), namesize);
- bcopy(link, (char *)(lr + 1) + namesize, linksize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- dzp->z_last_itx = seq;
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_rename() handles TX_RENAME transactions.
- */
-void
-zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
-{
- itx_t *itx;
- uint64_t seq;
- lr_rename_t *lr;
- size_t snamesize = strlen(sname) + 1;
- size_t dnamesize = strlen(dname) + 1;
-
- if (zilog == NULL)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
- lr = (lr_rename_t *)&itx->itx_lr;
- lr->lr_sdoid = sdzp->z_id;
- lr->lr_tdoid = tdzp->z_id;
- bcopy(sname, (char *)(lr + 1), snamesize);
- bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
-
- seq = zil_itx_assign(zilog, itx, tx);
- sdzp->z_last_itx = seq;
- tdzp->z_last_itx = seq;
- szp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_write() handles TX_WRITE transactions.
- */
-ssize_t zfs_immediate_write_sz = 32768;
-
-void
-zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t len, int ioflag)
-{
- itx_t *itx;
- uint64_t seq;
- lr_write_t *lr;
- itx_wr_state_t write_state;
- int err;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * If the write is greater than zfs_immediate_write_sz then
- * later *if* we need to log the write then dmu_sync() is used
- * to immediately write the block and it's block pointer is put
- * in the log record.
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FDSYNC (O_DSYNC)), the we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
- if (len > zfs_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (ioflag & FDSYNC)
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
-
- itx = zil_itx_create(txtype, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
- lr = (lr_write_t *)&itx->itx_lr;
- if (write_state == WR_COPIED) {
- err = dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1);
- if (err) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
- }
- }
-
- itx->itx_wr_state = write_state;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
- lr->lr_blkoff = 0;
- BP_ZERO(&lr->lr_blkptr);
-
- itx->itx_private = zp->z_zfsvfs;
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_truncate() handles TX_TRUNCATE transactions.
- */
-void
-zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len)
-{
- itx_t *itx;
- uint64_t seq;
- lr_truncate_t *lr;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_truncate_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_offset = off;
- lr->lr_length = len;
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_setattr() handles TX_SETATTR transactions.
- */
-void
-zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied)
-{
- itx_t *itx;
- uint64_t seq;
- lr_setattr_t *lr;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr));
- lr = (lr_setattr_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_mask = (uint64_t)mask_applied;
- lr->lr_mode = (uint64_t)vap->va_mode;
- lr->lr_uid = (uint64_t)vap->va_uid;
- lr->lr_gid = (uint64_t)vap->va_gid;
- lr->lr_size = (uint64_t)vap->va_size;
- ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
- ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
-
-/*
- * zfs_log_acl() handles TX_ACL transactions.
- */
-void
-zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, int aclcnt, ace_t *z_ace)
-{
- itx_t *itx;
- uint64_t seq;
- lr_acl_t *lr;
-
- if (zilog == NULL || zp->z_unlinked)
- return;
-
- itx = zil_itx_create(txtype, sizeof (*lr) + aclcnt * sizeof (ace_t));
- lr = (lr_acl_t *)&itx->itx_lr;
- lr->lr_foid = zp->z_id;
- lr->lr_aclcnt = (uint64_t)aclcnt;
- bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t));
-
- itx->itx_sync = (zp->z_sync_cnt != 0);
- seq = zil_itx_assign(zilog, itx, tx);
- zp->z_last_itx = seq;
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
deleted file mode 100644
index 2be3093..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/cmn_err.h>
-#include <sys/kmem.h>
-#include <sys/file.h>
-#include <sys/fcntl.h>
-#include <sys/vfs.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/spa.h>
-#include <sys/zil.h>
-#include <sys/byteorder.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-#include <sys/atomic.h>
-#include <sys/cred.h>
-#include <sys/namei.h>
-
-/*
- * Functions to replay ZFS intent log (ZIL) records
- * The functions are called through a function vector (zfs_replay_vector)
- * which is indexed by the transaction type.
- */
-
-static void
-zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
- uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
-{
- VATTR_NULL(vap);
- vap->va_mask = (uint_t)mask;
- vap->va_type = IFTOVT(mode);
- vap->va_mode = mode & MODEMASK;
- vap->va_uid = (uid_t)uid;
- vap->va_gid = (gid_t)gid;
- vap->va_rdev = zfs_cmpldev(rdev);
- vap->va_nodeid = nodeid;
-}
-
-/* ARGSUSED */
-static int
-zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
-{
- return (ENOTSUP);
-}
-
-static int
-zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
-{
- char *name = (char *)(lr + 1); /* name follows lr_create_t */
- char *link; /* symlink content follows name */
- znode_t *dzp;
- vnode_t *vp = NULL;
- vattr_t va;
- struct componentname cn;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- zfs_init_vattr(&va, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
-
- /*
- * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
- * eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic VOP_CREATE()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
- */
- ZFS_TIME_DECODE(&va.va_ctime, lr->lr_crtime);
- va.va_nblocks = lr->lr_gen;
-
- cn.cn_nameptr = name;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- cn.cn_flags = SAVENAME;
-
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- switch ((int)lr->lr_common.lrc_txtype) {
- case TX_CREATE:
- error = VOP_CREATE(ZTOV(dzp), &vp, &cn, &va);
- break;
- case TX_MKDIR:
- error = VOP_MKDIR(ZTOV(dzp), &vp, &cn, &va);
- break;
- case TX_MKXATTR:
- error = zfs_make_xattrdir(dzp, &va, &vp, kcred);
- break;
- case TX_SYMLINK:
- link = name + strlen(name) + 1;
- error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &va, link);
- break;
- default:
- error = ENOTSUP;
- }
- VOP_UNLOCK(ZTOV(dzp), 0);
-
- if (error == 0 && vp != NULL) {
- VOP_UNLOCK(vp, 0);
- VN_RELE(vp);
- }
-
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
-{
- char *name = (char *)(lr + 1); /* name follows lr_remove_t */
- znode_t *dzp;
- struct componentname cn;
- vnode_t *vp;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- bzero(&cn, sizeof(cn));
- cn.cn_nameptr = name;
- cn.cn_namelen = strlen(name);
- cn.cn_nameiop = DELETE;
- cn.cn_flags = ISLASTCN | SAVENAME;
- cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
- if (error != 0) {
- VOP_UNLOCK(ZTOV(dzp), 0);
- goto fail;
- }
-
- switch ((int)lr->lr_common.lrc_txtype) {
- case TX_REMOVE:
- error = VOP_REMOVE(ZTOV(dzp), vp, &cn);
- break;
- case TX_RMDIR:
- error = VOP_RMDIR(ZTOV(dzp), vp, &cn);
- break;
- default:
- error = ENOTSUP;
- }
- vput(vp);
- VOP_UNLOCK(ZTOV(dzp), 0);
-fail:
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
-{
- char *name = (char *)(lr + 1); /* name follows lr_link_t */
- znode_t *dzp, *zp;
- struct componentname cn;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
- return (error);
-
- if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
- VN_RELE(ZTOV(dzp));
- return (error);
- }
-
- cn.cn_nameptr = name;
- cn.cn_cred = kcred;
- cn.cn_thread = curthread;
- cn.cn_flags = SAVENAME;
-
- vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
- vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn);
- VOP_UNLOCK(ZTOV(zp), 0);
- VOP_UNLOCK(ZTOV(dzp), 0);
-
- VN_RELE(ZTOV(zp));
- VN_RELE(ZTOV(dzp));
-
- return (error);
-}
-
-static int
-zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
-{
- char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
- char *tname = sname + strlen(sname) + 1;
- znode_t *sdzp, *tdzp;
- struct componentname scn, tcn;
- vnode_t *svp, *tvp;
- kthread_t *td = curthread;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
- return (error);
-
- if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
- VN_RELE(ZTOV(sdzp));
- return (error);
- }
-
- svp = tvp = NULL;
-
- bzero(&scn, sizeof(scn));
- scn.cn_nameptr = sname;
- scn.cn_namelen = strlen(sname);
- scn.cn_nameiop = DELETE;
- scn.cn_flags = ISLASTCN | SAVENAME;
- scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- scn.cn_cred = kcred;
- scn.cn_thread = td;
- vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
- VOP_UNLOCK(ZTOV(sdzp), 0);
- if (error != 0)
- goto fail;
- VOP_UNLOCK(svp, 0);
-
- bzero(&tcn, sizeof(tcn));
- tcn.cn_nameptr = tname;
- tcn.cn_namelen = strlen(tname);
- tcn.cn_nameiop = RENAME;
- tcn.cn_flags = ISLASTCN | SAVENAME;
- tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
- tcn.cn_cred = kcred;
- tcn.cn_thread = td;
- vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
- error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
- if (error == EJUSTRETURN)
- tvp = NULL;
- else if (error != 0) {
- VOP_UNLOCK(ZTOV(tdzp), 0);
- goto fail;
- }
-
- error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn);
- return (error);
-fail:
- if (svp != NULL)
- vrele(svp);
- if (tvp != NULL)
- vrele(tvp);
- VN_RELE(ZTOV(tdzp));
- VN_RELE(ZTOV(sdzp));
-
- return (error);
-}
-
-static int
-zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
-{
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- znode_t *zp;
- int error;
- ssize_t resid;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log writes out of order, it's possible the
- * file has been removed. In this case just drop the write
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
- error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
- lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
-
- VN_RELE(ZTOV(zp));
-
- return (error);
-}
-
-static int
-zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
-{
-
- ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
- return (EOPNOTSUPP);
-}
-
-static int
-zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
-{
- znode_t *zp;
- vattr_t va;
- vnode_t *vp;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log setattrs out of order, it's possible the
- * file has been removed. In this case just drop the setattr
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
- zfs_init_vattr(&va, lr->lr_mask, lr->lr_mode,
- lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
-
- va.va_size = lr->lr_size;
- ZFS_TIME_DECODE(&va.va_atime, lr->lr_atime);
- ZFS_TIME_DECODE(&va.va_mtime, lr->lr_mtime);
-
- vp = ZTOV(zp);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- error = VOP_SETATTR(vp, &va, kcred, curthread);
- VOP_UNLOCK(vp, 0);
- VN_RELE(vp);
-
- return (error);
-}
-
-static int
-zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
-{
- ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
-#ifdef TODO
- vsecattr_t vsa;
-#endif
- znode_t *zp;
- int error;
-
- if (byteswap) {
- byteswap_uint64_array(lr, sizeof (*lr));
- zfs_ace_byteswap(ace, lr->lr_aclcnt);
- }
-
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
- return (error);
- }
-
-#ifdef TODO
- bzero(&vsa, sizeof (vsa));
- vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
- vsa.vsa_aclcnt = lr->lr_aclcnt;
- vsa.vsa_aclentp = ace;
-
- error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred);
-#else
- error = EOPNOTSUPP;
-#endif
-
- VN_RELE(ZTOV(zp));
-
- return (error);
-}
-
-/*
- * Callback vectors for replaying records
- */
-zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
- zfs_replay_error, /* 0 no such transaction type */
- zfs_replay_create, /* TX_CREATE */
- zfs_replay_create, /* TX_MKDIR */
- zfs_replay_create, /* TX_MKXATTR */
- zfs_replay_create, /* TX_SYMLINK */
- zfs_replay_remove, /* TX_REMOVE */
- zfs_replay_remove, /* TX_RMDIR */
- zfs_replay_link, /* TX_LINK */
- zfs_replay_rename, /* TX_RENAME */
- zfs_replay_write, /* TX_WRITE */
- zfs_replay_truncate, /* TX_TRUNCATE */
- zfs_replay_setattr, /* TX_SETATTR */
- zfs_replay_acl, /* TX_ACL */
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
deleted file mode 100644
index 07ec0f6..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file contains the code to implement file range locking in
- * ZFS, although there isn't much specific to ZFS (all that comes to mind
- * support for growing the blocksize).
- *
- * Interface
- * ---------
- * Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
- *
- * AVL tree
- * --------
- * An AVL tree is used to maintain the state of the existing ranges
- * that are locked for exclusive (writer) or shared (reader) use.
- * The starting range offset is used for searching and sorting the tree.
- *
- * Common case
- * -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree
- * searched that finds no overlap, and *this* rl_t is placed in the tree.
- *
- * Overlaps/Reference counting/Proxy locks
- * ---------------------------------------
- * The avl code only allows one node at a particular offset. Also it's very
- * inefficient to search through all previous entries looking for overlaps
- * (because the very 1st in the ordered list might be at offset 0 but
- * cover the whole file).
- * So this implementation uses reference counts and proxy range locks.
- * Firstly, only reader locks use reference counts and proxy locks,
- * because writer locks are exclusive.
- * When a reader lock overlaps with another then a proxy lock is created
- * for that range and replaces the original lock. If the overlap
- * is exact then the reference count of the proxy is simply incremented.
- * Otherwise, the proxy lock is split into smaller lock ranges and
- * new proxy locks created for non overlapping ranges.
- * The reference counts are adjusted accordingly.
- * Meanwhile, the orginal lock is kept around (this is the callers handle)
- * and its offset and length are used when releasing the lock.
- *
- * Thread coordination
- * -------------------
- * In order to make wakeups efficient and to ensure multiple continuous
- * readers on a range don't starve a writer for the same range lock,
- * two condition variables are allocated in each rl_t.
- * If a writer (or reader) can't get a range it initialises the writer
- * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
- * and waits on that cv. When a thread unlocks that range it wakes up all
- * writers then all readers before destroying the lock.
- *
- * Append mode writes
- * ------------------
- * Append mode writes need to lock a range at the end of a file.
- * The offset of the end of the file is determined under the
- * range locking mutex, and the lock type converted from RL_APPEND to
- * RL_WRITER and the range locked.
- *
- * Grow block handling
- * -------------------
- * ZFS supports multiple block sizes currently upto 128K. The smallest
- * block size is used for the file which is grown as needed. During this
- * growth all other writers and readers must be excluded.
- * So if the block size needs to be grown then the whole file is
- * exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using zfs_reduce_range.
- */
-
-#include <sys/zfs_rlock.h>
-
-/*
- * Check if a write lock can be grabbed, or wait and recheck until available.
- */
-static void
-zfs_range_lock_writer(znode_t *zp, rl_t *new)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl;
- avl_index_t where;
- uint64_t end_size;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- for (;;) {
- /*
- * Range locking is also used by zvol and uses a
- * dummied up znode. However, for zvol, we don't need to
- * append or grow blocksize, and besides we don't have
- * a z_phys or z_zfsvfs - so skip that processing.
- *
- * Yes, this is ugly, and would be solved by not handling
- * grow or append in range lock code. If that was done then
- * we could make the range locking code generically available
- * to other non-zfs consumers.
- */
- if (zp->z_vnode) { /* caller is ZPL */
- /*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
- */
- if (new->r_type == RL_APPEND)
- new->r_off = zp->z_phys->zp_size;
-
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
- }
- }
-
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(tree) == 0) {
- new->r_type = RL_WRITER; /* convert to writer */
- avl_add(tree, new);
- return;
- }
-
- /*
- * Look for any locks in the range.
- */
- rl = avl_find(tree, new, &where);
- if (rl)
- goto wait; /* already locked at same offset */
-
- rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- if (rl && (rl->r_off < new->r_off + new->r_len))
- goto wait;
-
- rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
- if (rl && rl->r_off + rl->r_len > new->r_off)
- goto wait;
-
- new->r_type = RL_WRITER; /* convert possible RL_APPEND */
- avl_insert(tree, new, where);
- return;
-wait:
- if (!rl->r_write_wanted) {
- cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
- rl->r_write_wanted = B_TRUE;
- }
- cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
-
- /* reset to original */
- new->r_off = off;
- new->r_len = len;
- }
-}
-
-/*
- * If this is an original (non-proxy) lock then replace it by
- * a proxy and return the proxy.
- */
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
-{
- rl_t *proxy;
-
- if (rl->r_proxy)
- return (rl); /* already a proxy */
-
- ASSERT3U(rl->r_cnt, ==, 1);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
- avl_remove(tree, rl);
- rl->r_cnt = 0;
-
- /* create a proxy range lock */
- proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- proxy->r_off = rl->r_off;
- proxy->r_len = rl->r_len;
- proxy->r_cnt = 1;
- proxy->r_type = RL_READER;
- proxy->r_proxy = B_TRUE;
- proxy->r_write_wanted = B_FALSE;
- proxy->r_read_wanted = B_FALSE;
- avl_add(tree, proxy);
-
- return (proxy);
-}
-
-/*
- * Split the range lock at the supplied offset
- * returning the *front* proxy.
- */
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
-{
- rl_t *front, *rear;
-
- ASSERT3U(rl->r_len, >, 1);
- ASSERT3U(off, >, rl->r_off);
- ASSERT3U(off, <, rl->r_off + rl->r_len);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
-
- /* create the rear proxy range lock */
- rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rear->r_off = off;
- rear->r_len = rl->r_off + rl->r_len - off;
- rear->r_cnt = rl->r_cnt;
- rear->r_type = RL_READER;
- rear->r_proxy = B_TRUE;
- rear->r_write_wanted = B_FALSE;
- rear->r_read_wanted = B_FALSE;
-
- front = zfs_range_proxify(tree, rl);
- front->r_len = off - rl->r_off;
-
- avl_insert_here(tree, rear, front, AVL_AFTER);
- return (front);
-}
-
-/*
- * Create and add a new proxy range lock for the supplied range.
- */
-static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
-{
- rl_t *rl;
-
- ASSERT(len);
- rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rl->r_off = off;
- rl->r_len = len;
- rl->r_cnt = 1;
- rl->r_type = RL_READER;
- rl->r_proxy = B_TRUE;
- rl->r_write_wanted = B_FALSE;
- rl->r_read_wanted = B_FALSE;
- avl_add(tree, rl);
-}
-
-static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
-{
- rl_t *next;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- /*
- * prev arrives either:
- * - pointing to an entry at the same offset
- * - pointing to the entry with the closest previous offset whose
- * range may overlap with the new range
- * - null, if there were no ranges starting before the new one
- */
- if (prev) {
- if (prev->r_off + prev->r_len <= off) {
- prev = NULL;
- } else if (prev->r_off != off) {
- /*
- * convert to proxy if needed then
- * split this entry and bump ref count
- */
- prev = zfs_range_split(tree, prev, off);
- prev = AVL_NEXT(tree, prev); /* move to rear range */
- }
- }
- ASSERT((prev == NULL) || (prev->r_off == off));
-
- if (prev)
- next = prev;
- else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-
- if (next == NULL || off + len <= next->r_off) {
- /* no overlaps, use the original new rl_t in the tree */
- avl_insert(tree, new, where);
- return;
- }
-
- if (off < next->r_off) {
- /* Add a proxy for initial range before the overlap */
- zfs_range_new_proxy(tree, off, next->r_off - off);
- }
-
- new->r_cnt = 0; /* will use proxies in tree */
- /*
- * We now search forward through the ranges, until we go past the end
- * of the new range. For each entry we make it a proxy if it
- * isn't already, then bump its reference count. If there's any
- * gaps between the ranges then we create a new proxy range.
- */
- for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
- break;
- if (prev && prev->r_off + prev->r_len < next->r_off) {
- /* there's a gap */
- ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- next->r_off - (prev->r_off + prev->r_len));
- }
- if (off + len == next->r_off + next->r_len) {
- /* exact overlap with end */
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
- return;
- }
- if (off + len < next->r_off + next->r_len) {
- /* new range ends in the middle of this block */
- next = zfs_range_split(tree, next, off + len);
- next->r_cnt++;
- return;
- }
- ASSERT3U(off + len, >, next->r_off + next->r_len);
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
- }
-
- /* Add the remaining end range. */
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- (off + len) - (prev->r_off + prev->r_len));
-}
-
-/*
- * Check if a reader lock can be grabbed, or wait and recheck until available.
- */
-static void
-zfs_range_lock_reader(znode_t *zp, rl_t *new)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *prev, *next;
- avl_index_t where;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
-
- /*
- * Look for any writer locks in the range.
- */
-retry:
- prev = avl_find(tree, new, &where);
- if (prev == NULL)
- prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
-
- /*
- * Check the previous range for a writer lock overlap.
- */
- if (prev && (off < prev->r_off + prev->r_len)) {
- if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
- if (!prev->r_read_wanted) {
- cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
- prev->r_read_wanted = B_TRUE;
- }
- cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
- goto retry;
- }
- if (off + len < prev->r_off + prev->r_len)
- goto got_lock;
- }
-
- /*
- * Search through the following ranges to see if there's
- * write lock any overlap.
- */
- if (prev)
- next = AVL_NEXT(tree, prev);
- else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- for (; next; next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
- goto got_lock;
- if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
- if (!next->r_read_wanted) {
- cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
- next->r_read_wanted = B_TRUE;
- }
- cv_wait(&next->r_rd_cv, &zp->z_range_lock);
- goto retry;
- }
- if (off + len <= next->r_off + next->r_len)
- goto got_lock;
- }
-
-got_lock:
- /*
- * Add the read lock, which may involve splitting existing
- * locks and bumping ref counts (r_cnt).
- */
- zfs_range_add_reader(tree, new, prev, where);
-}
-
-/*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER). Returns the range lock structure
- * for later unlocking or reduce range (if entire file
- * previously locked as RL_WRITER).
- */
-rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
-{
- rl_t *new;
-
- ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
-
- new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- new->r_zp = zp;
- new->r_off = off;
- new->r_len = len;
- new->r_cnt = 1; /* assume it's going to be in the tree */
- new->r_type = type;
- new->r_proxy = B_FALSE;
- new->r_write_wanted = B_FALSE;
- new->r_read_wanted = B_FALSE;
-
- mutex_enter(&zp->z_range_lock);
- if (type == RL_READER) {
- /*
- * First check for the usual case of no locks
- */
- if (avl_numnodes(&zp->z_range_avl) == 0)
- avl_add(&zp->z_range_avl, new);
- else
- zfs_range_lock_reader(zp, new);
- } else
- zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
- mutex_exit(&zp->z_range_lock);
- return (new);
-}
-
-/*
- * Unlock a reader lock
- */
-static void
-zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
-{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl, *next;
- uint64_t len;
-
- /*
- * The common case is when the remove entry is in the tree
- * (cnt == 1) meaning there's been no other reader locks overlapping
- * with this one. Otherwise the remove entry will have been
- * removed from the tree and replaced by proxies (one or
- * more ranges mapping to the entire range).
- */
- if (remove->r_cnt == 1) {
- avl_remove(tree, remove);
- if (remove->r_write_wanted)
- cv_broadcast(&remove->r_wr_cv);
- if (remove->r_read_wanted)
- cv_broadcast(&remove->r_rd_cv);
- } else {
- ASSERT3U(remove->r_cnt, ==, 0);
- ASSERT3U(remove->r_write_wanted, ==, 0);
- ASSERT3U(remove->r_read_wanted, ==, 0);
- /*
- * Find start proxy representing this reader lock,
- * then decrement ref count on all proxies
- * that make up this range, freeing them as needed.
- */
- rl = avl_find(tree, remove, NULL);
- ASSERT(rl);
- ASSERT(rl->r_cnt);
- ASSERT(rl->r_type == RL_READER);
- for (len = remove->r_len; len != 0; rl = next) {
- len -= rl->r_len;
- if (len) {
- next = AVL_NEXT(tree, rl);
- ASSERT(next);
- ASSERT(rl->r_off + rl->r_len == next->r_off);
- ASSERT(next->r_cnt);
- ASSERT(next->r_type == RL_READER);
- }
- rl->r_cnt--;
- if (rl->r_cnt == 0) {
- avl_remove(tree, rl);
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
- kmem_free(rl, sizeof (rl_t));
- }
- }
- }
- kmem_free(remove, sizeof (rl_t));
-}
-
-/*
- * Unlock range and destroy range lock structure.
- */
-void
-zfs_range_unlock(rl_t *rl)
-{
- znode_t *zp = rl->r_zp;
-
- ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
- ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
- ASSERT(!rl->r_proxy);
-
- mutex_enter(&zp->z_range_lock);
- if (rl->r_type == RL_WRITER) {
- /* writer locks can't be shared or split */
- avl_remove(&zp->z_range_avl, rl);
- mutex_exit(&zp->z_range_lock);
- if (rl->r_write_wanted) {
- cv_broadcast(&rl->r_wr_cv);
- cv_destroy(&rl->r_wr_cv);
- }
- if (rl->r_read_wanted) {
- cv_broadcast(&rl->r_rd_cv);
- cv_destroy(&rl->r_rd_cv);
- }
- kmem_free(rl, sizeof (rl_t));
- } else {
- /*
- * lock may be shared, let zfs_range_unlock_reader()
- * release the lock and free the rl_t
- */
- zfs_range_unlock_reader(zp, rl);
- mutex_exit(&zp->z_range_lock);
- }
-}
-
-/*
- * Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusivly locked and so there's only one
- * entry in the tree.
- */
-void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
-{
- znode_t *zp = rl->r_zp;
-
- /* Ensure there are no other locks */
- ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
- ASSERT(rl->r_off == 0);
- ASSERT(rl->r_type == RL_WRITER);
- ASSERT(!rl->r_proxy);
- ASSERT3U(rl->r_len, ==, UINT64_MAX);
- ASSERT3U(rl->r_cnt, ==, 1);
-
- mutex_enter(&zp->z_range_lock);
- rl->r_off = off;
- rl->r_len = len;
- mutex_exit(&zp->z_range_lock);
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
-}
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
- const rl_t *rl1 = arg1;
- const rl_t *rl2 = arg2;
-
- if (rl1->r_off > rl2->r_off)
- return (1);
- if (rl1->r_off < rl2->r_off)
- return (-1);
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
deleted file mode 100644
index 28f3293..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/sysmacros.h>
-#include <sys/kmem.h>
-#include <sys/acl.h>
-#include <sys/vnode.h>
-#include <sys/vfs.h>
-#include <sys/mntent.h>
-#include <sys/mount.h>
-#include <sys/cmn_err.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_dir.h>
-#include <sys/zil.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dataset.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/varargs.h>
-#include <sys/policy.h>
-#include <sys/atomic.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/sunddi.h>
-#include <sys/dnlc.h>
-
-/*
- * Debug support: a mutex set up through MTX_SYSINIT under the name
- * "zfs_debug", the vfs.zfs sysctl node ("ZFS file system"), and the
- * zfs_debug_level knob (initially 0), which is published both as the
- * "vfs.zfs.debug" tunable and as a CTLFLAG_RW sysctl described as
- * "Debug level".
- */
-struct mtx zfs_debug_mtx;
-MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
-SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
-int zfs_debug_level = 0;
-TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
- "Debug level");
-
-/*
- * Forward declarations: the VFS entry points placed in zfs_vfsops below,
- * plus the two teardown helpers used by zfs_umount().
- */
-static int zfs_mount(vfs_t *vfsp, kthread_t *td);
-static int zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td);
-static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td);
-static int zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td);
-static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
-static int zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td);
-static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
-static void zfs_objset_close(zfsvfs_t *zfsvfs);
-static void zfs_freevfs(vfs_t *vfsp);
-
-/* VFS operations vector; entries not listed here are left unset */
-static struct vfsops zfs_vfsops = {
- .vfs_mount = zfs_mount,
- .vfs_unmount = zfs_umount,
- .vfs_root = zfs_root,
- .vfs_statfs = zfs_statfs,
- .vfs_vget = zfs_vget,
- .vfs_sync = zfs_sync,
- .vfs_fhtovp = zfs_fhtovp,
-};
-
-/*
- * Register the vector under the file system name "zfs".
- * NOTE(review): VFCF_JAIL presumably marks the fs as usable from within
- * jails -- confirm against the VFS_SET() documentation.
- */
-VFS_SET(zfs_vfsops, zfs, VFCF_JAIL);
-
-/*
- * We need to keep a count of active fs's.
- * This is necessary to prevent our module
- * from being unloaded after a umount -f
- *
- * The counter is incremented by zfs_domount() on success, decremented
- * by zfs_freevfs(), and tested by zfs_busy().
- */
-static uint32_t zfs_active_fs_count = 0;
-
-/*
- * VFS sync entry point.
- *
- * vfsp    - file system to sync, or NULL to sync every pool
- * waitfor - passed through to vfs_stdsync()
- * td      - calling thread, passed through to vfs_stdsync()
- *
- * Returns 0, or the error reported by vfs_stdsync().
- */
-/*ARGSUSED*/
-static int
-zfs_sync(vfs_t *vfsp, int waitfor, kthread_t *td)
-{
-	zfsvfs_t *zfsvfs;
-	int error;
-
-	/*
-	 * A panicking kernel may be compromised; to protect the storage
-	 * pool nothing is ever synced in that state.
-	 */
-	if (panicstr)
-		return (0);
-
-	/*
-	 * No specific file system: this is the sync(1M) case.  Unlike
-	 * other file systems, ZFS honors it by waiting until every pool
-	 * has committed all of its dirty data.
-	 */
-	if (vfsp == NULL) {
-		spa_sync_allpools();
-		return (0);
-	}
-
-	/*
-	 * A single file system was named.
-	 */
-	zfsvfs = vfsp->vfs_data;
-
-	error = vfs_stdsync(vfsp, waitfor, td);
-	if (error != 0)
-		return (error);
-
-	ZFS_ENTER(zfsvfs);
-	if (zfsvfs->z_log == NULL)
-		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
-	else
-		zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
-	ZFS_EXIT(zfsvfs);
-
-	return (0);
-}
-
-/*
- * Property callback for "atime".
- *
- * arg is the zfsvfs_t of the mount.  Only a value equal to TRUE turns
- * access-time updates on; every other value turns them off.  The cached
- * z_atime flag, MNT_NOATIME in vfs_flag and the atime/noatime mount
- * options are all updated together.
- */
-static void
-atime_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-	vfs_t *vfsp = zfsvfs->z_vfs;
-
-	if (newval != TRUE) {
-		zfsvfs->z_atime = FALSE;
-		vfsp->vfs_flag |= MNT_NOATIME;
-		vfs_clearmntopt(vfsp, MNTOPT_ATIME);
-		vfs_setmntopt(vfsp, MNTOPT_NOATIME, NULL, 0);
-		return;
-	}
-
-	zfsvfs->z_atime = TRUE;
-	vfsp->vfs_flag &= ~MNT_NOATIME;
-	vfs_clearmntopt(vfsp, MNTOPT_NOATIME);
-	vfs_setmntopt(vfsp, MNTOPT_ATIME, NULL, 0);
-}
-
-/*
- * Property callback for "xattr".
- *
- * arg is the zfsvfs_t of the mount.  A value equal to TRUE selects the
- * xattr mount option, anything else selects noxattr.  The matching
- * VFS_XATTR flag update is only compiled in when TODO is defined.
- */
-static void
-xattr_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-	vfs_t *vfsp = zfsvfs->z_vfs;
-
-	if (newval != TRUE) {
-		/* XXX locking on vfs_flag? */
-#ifdef TODO
-		vfsp->vfs_flag &= ~VFS_XATTR;
-#endif
-		vfs_clearmntopt(vfsp, MNTOPT_XATTR);
-		vfs_setmntopt(vfsp, MNTOPT_NOXATTR, NULL, 0);
-		return;
-	}
-
-	/* XXX locking on vfs_flag? */
-#ifdef TODO
-	vfsp->vfs_flag |= VFS_XATTR;
-#endif
-	vfs_clearmntopt(vfsp, MNTOPT_NOXATTR);
-	vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL, 0);
-}
-
-/*
- * Property callback for "recordsize".
- *
- * arg is the zfsvfs_t of the mount.  A value that lies outside
- * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE] or that fails ISP2() is replaced
- * by SPA_MAXBLOCKSIZE.  The result becomes both z_max_blksz and the
- * mount's vfs_bsize.
- */
-static void
-blksz_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-	uint64_t blksz = newval;
-
-	if (!(blksz >= SPA_MINBLOCKSIZE && blksz <= SPA_MAXBLOCKSIZE &&
-	    ISP2(blksz)))
-		blksz = SPA_MAXBLOCKSIZE;
-
-	zfsvfs->z_max_blksz = blksz;
-	zfsvfs->z_vfs->vfs_bsize = blksz;
-}
-
-/*
- * Property callback for "readonly".
- *
- * arg is the zfsvfs_t of the mount.  Any non-zero value sets VFS_RDONLY
- * and the ro mount option; zero clears the flag and selects rw.
- */
-static void
-readonly_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-	vfs_t *vfsp = zfsvfs->z_vfs;
-
-	if (!newval) {
-		/* XXX locking on vfs_flag? */
-		vfsp->vfs_flag &= ~VFS_RDONLY;
-		vfs_clearmntopt(vfsp, MNTOPT_RO);
-		vfs_setmntopt(vfsp, MNTOPT_RW, NULL, 0);
-		return;
-	}
-
-	/* XXX locking on vfs_flag? */
-	vfsp->vfs_flag |= VFS_RDONLY;
-	vfs_clearmntopt(vfsp, MNTOPT_RW);
-	vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
-}
-
-/*
- * Property callback for "setuid".
- *
- * arg is the zfsvfs_t of the mount.  A value equal to FALSE sets
- * VFS_NOSETUID and the nosetuid mount option; any other value clears
- * the flag and selects setuid.
- */
-static void
-setuid_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-	vfs_t *vfsp = zfsvfs->z_vfs;
-
-	if (newval != FALSE) {
-		vfsp->vfs_flag &= ~VFS_NOSETUID;
-		vfs_clearmntopt(vfsp, MNTOPT_NOSETUID);
-		vfs_setmntopt(vfsp, MNTOPT_SETUID, NULL, 0);
-		return;
-	}
-
-	vfsp->vfs_flag |= VFS_NOSETUID;
-	vfs_clearmntopt(vfsp, MNTOPT_SETUID);
-	vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
-}
-
-/*
- * Property callback for "exec".
- *
- * arg is the zfsvfs_t of the mount.  A value equal to FALSE sets
- * VFS_NOEXEC and the noexec mount option; any other value clears the
- * flag and selects exec.
- */
-static void
-exec_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-	vfs_t *vfsp = zfsvfs->z_vfs;
-
-	if (newval != FALSE) {
-		vfsp->vfs_flag &= ~VFS_NOEXEC;
-		vfs_clearmntopt(vfsp, MNTOPT_NOEXEC);
-		vfs_setmntopt(vfsp, MNTOPT_EXEC, NULL, 0);
-		return;
-	}
-
-	vfsp->vfs_flag |= VFS_NOEXEC;
-	vfs_clearmntopt(vfsp, MNTOPT_EXEC);
-	vfs_setmntopt(vfsp, MNTOPT_NOEXEC, NULL, 0);
-}
-
-/*
- * Property callback for "snapdir": caches the new value in the
- * z_show_ctldir field of the zfsvfs_t passed as arg.
- */
-static void
-snapdir_changed_cb(void *arg, uint64_t newval)
-{
-	((zfsvfs_t *)arg)->z_show_ctldir = newval;
-}
-
-/*
- * Property callback for "aclmode": caches the new value in the
- * z_acl_mode field of the zfsvfs_t passed as arg.
- */
-static void
-acl_mode_changed_cb(void *arg, uint64_t newval)
-{
-	((zfsvfs_t *)arg)->z_acl_mode = newval;
-}
-
-/*
- * Property callback for "aclinherit": caches the new value in the
- * z_acl_inherit field of the zfsvfs_t passed as arg.
- */
-static void
-acl_inherit_changed_cb(void *arg, uint64_t newval)
-{
-	((zfsvfs_t *)arg)->z_acl_inherit = newval;
-}
-
-/*
- * Re-apply the temporary properties implied by the mount options that
- * are currently set on vfsp (used on remount).
- *
- * Returns 0, or EROFS when "rw" is requested for a snapshot.
- */
-static int
-zfs_refresh_properties(vfs_t *vfsp)
-{
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-
-	/*
-	 * Without an explicit "ro" a remount means "rw" -- except for
-	 * snapshots, which stay as they are and reject an explicit "rw".
-	 */
-	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
-		readonly_changed_cb(zfsvfs, B_TRUE);
-	else if (!dmu_objset_is_snapshot(zfsvfs->z_os))
-		readonly_changed_cb(zfsvfs, B_FALSE);
-	else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
-		return (EROFS);
-
-	/* either spelling of "no setuid" wins over "setuid" */
-	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL) ||
-	    vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
-		setuid_changed_cb(zfsvfs, B_FALSE);
-	else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
-		setuid_changed_cb(zfsvfs, B_TRUE);
-
-	/* "noexec" is consulted before "exec" */
-	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
-		exec_changed_cb(zfsvfs, B_FALSE);
-	else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
-		exec_changed_cb(zfsvfs, B_TRUE);
-
-	/* "atime" is consulted before "noatime" */
-	if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
-		atime_changed_cb(zfsvfs, B_TRUE);
-	else if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
-		atime_changed_cb(zfsvfs, B_FALSE);
-
-	/* "xattr" is consulted before "noxattr" */
-	if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
-		xattr_changed_cb(zfsvfs, B_TRUE);
-	else if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
-		xattr_changed_cb(zfsvfs, B_FALSE);
-
-	return (0);
-}
-
-/*
- * Register the property-change callbacks of this mount on the dataset
- * behind it, while keeping any readonly/setuid/exec/xattr overrides that
- * were given as mount options.
- *
- * vfsp - mount whose vfs_data is the zfsvfs_t (z_os must be set)
- *
- * Returns 0 on success.  If any dsl_prop_register() call fails, all
- * callbacks are unregistered again and that first error is returned.
- */
-static int
-zfs_register_callbacks(vfs_t *vfsp)
-{
- struct dsl_dataset *ds = NULL;
- objset_t *os = NULL;
- zfsvfs_t *zfsvfs = NULL;
- /* each value below is only meaningful when its do_* flag is set */
- int readonly, do_readonly = FALSE;
- int setuid, do_setuid = FALSE;
- int exec, do_exec = FALSE;
- int xattr, do_xattr = FALSE;
- int error = 0;
-
- ASSERT(vfsp);
- zfsvfs = vfsp->vfs_data;
- ASSERT(zfsvfs);
- os = zfsvfs->z_os;
-
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
- readonly = B_TRUE;
- do_readonly = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- readonly = B_FALSE;
- do_readonly = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
- setuid = B_TRUE;
- do_setuid = B_TRUE;
- }
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
- exec = B_FALSE;
- do_exec = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
- exec = B_TRUE;
- do_exec = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
- xattr = B_FALSE;
- do_xattr = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
- xattr = B_TRUE;
- do_xattr = B_TRUE;
- }
-
- /*
- * Register property callbacks.
- *
- * It would probably be fine to just check for i/o error from
- * the first prop_register(), but I guess I like to go
- * overboard...
- *
- * Each "error ? error : ..." step skips the remaining
- * registrations once one of them has failed.
- */
- ds = dmu_objset_ds(os);
- error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "xattr", xattr_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "recordsize", blksz_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "readonly", readonly_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "setuid", setuid_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "exec", exec_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "snapdir", snapdir_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "aclmode", acl_mode_changed_cb, zfsvfs);
- error = error ? error : dsl_prop_register(ds,
- "aclinherit", acl_inherit_changed_cb, zfsvfs);
- if (error)
- goto unregister;
-
- /*
- * Invoke our callbacks to restore temporary mount options.
- */
- if (do_readonly)
- readonly_changed_cb(zfsvfs, readonly);
- if (do_setuid)
- setuid_changed_cb(zfsvfs, setuid);
- if (do_exec)
- exec_changed_cb(zfsvfs, exec);
- if (do_xattr)
- xattr_changed_cb(zfsvfs, xattr);
-
- return (0);
-
-unregister:
- /*
- * We may attempt to unregister some callbacks that are not
- * registered, but this is OK; it will simply return ENOMSG,
- * which we will ignore.
- */
- (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
- zfsvfs);
- return (error);
-
-}
-
-/*
- * Perform the actual mount of the dataset named osname on vfsp.
- *
- * vfsp   - mount structure being set up; vfs_data is pointed at the
- *          newly allocated zfsvfs_t
- * osname - name of the object set (dataset) to open
- * td     - calling thread; its td_ucred is used for zfs_init_fs()
- *
- * Returns 0 on success, in which case zfs_active_fs_count is bumped.
- * On failure the object set (if opened) is closed, the zfsvfs_t is
- * freed and the error is returned.
- */
-static int
-zfs_domount(vfs_t *vfsp, char *osname, kthread_t *td)
-{
- cred_t *cr = td->td_ucred;
- uint64_t recordsize, readonly;
- int error = 0;
- int mode;
- zfsvfs_t *zfsvfs;
- znode_t *zp = NULL; /* only handed to zfs_init_fs(); not used afterwards */
-
- ASSERT(vfsp);
- ASSERT(osname);
-
- /*
- * Initialize the zfs-specific filesystem structure.
- * Should probably make this a kmem cache, shuffle fields,
- * and just bzero up to z_hold_mtx[].
- */
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
- zfsvfs->z_vfs = vfsp;
- zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_assign = TXG_NOWAIT;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
- zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
-
- mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
- rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
-
- /* assignment inside the condition is intentional: non-zero is an error */
- if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
- NULL))
- goto out;
- zfsvfs->z_vfs->vfs_bsize = recordsize;
-
- /*
- * NOTE(review): vfs_data is published here, but the error path
- * below frees zfsvfs without resetting it -- confirm the caller
- * never looks at vfs_data after a failed mount.
- */
- vfsp->vfs_data = zfsvfs;
- vfsp->mnt_flag |= MNT_LOCAL;
- vfsp->mnt_kern_flag |= MNTK_MPSAFE;
- vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
-
- if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
- goto out;
-
- if (readonly)
- mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
- else
- mode = DS_MODE_PRIMARY;
-
- /* an EROFS result from the first open is retried in read-only mode */
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
- if (error == EROFS) {
- mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
- error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
- &zfsvfs->z_os);
- }
-
- if (error)
- goto out;
-
- if (error = zfs_init_fs(zfsvfs, &zp, cr))
- goto out;
-
- if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
- uint64_t xattr;
-
- /*
- * Snapshots get no property callbacks registered: atime is
- * forced off, readonly forced on, and xattr is read once.
- */
- ASSERT(mode & DS_MODE_READONLY);
- atime_changed_cb(zfsvfs, B_FALSE);
- readonly_changed_cb(zfsvfs, B_TRUE);
- if (error = dsl_prop_get_integer(osname, "xattr", &xattr, NULL))
- goto out;
- xattr_changed_cb(zfsvfs, xattr);
- zfsvfs->z_issnap = B_TRUE;
- } else {
- error = zfs_register_callbacks(vfsp);
- if (error)
- goto out;
-
- zfs_unlinked_drain(zfsvfs);
-
- /*
- * Parse and replay the intent log.
- */
- zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
- zfs_replay_vector);
-
- /* z_log stays NULL when the ZIL is disabled */
- if (!zil_disable)
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
- }
-
- vfs_mountedfrom(vfsp, osname);
-
- /* the .zfs control directory is only created for non-snapshots */
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-out:
- if (error) {
- /*
- * NOTE(review): only z_um_lock and z_znodes_lock are torn
- * down here; if zfs_init_fs() set up further state (for
- * example z_hold_mtx[], which zfs_freevfs() destroys) it is
- * not undone on this path -- confirm.
- */
- if (zfsvfs->z_os)
- dmu_objset_close(zfsvfs->z_os);
- rw_destroy(&zfsvfs->z_um_lock);
- mutex_destroy(&zfsvfs->z_znodes_lock);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- } else {
- atomic_add_32(&zfs_active_fs_count, 1);
- }
-
- return (error);
-
-}
-
-/*
- * Unregister every property callback that zfs_register_callbacks()
- * installs for this mount.  Nothing is done for snapshots.  Each
- * dsl_prop_unregister() call is wrapped in VERIFY(... == 0), so a
- * callback that is not registered is treated as fatal here.
- */
-void
-zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
-{
- objset_t *os = zfsvfs->z_os;
- struct dsl_dataset *ds;
-
- /*
- * Unregister properties.
- */
- if (!dmu_objset_is_snapshot(os)) {
- ds = dmu_objset_ds(os);
- VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclinherit",
- acl_inherit_changed_cb, zfsvfs) == 0);
- }
-}
-
-/*
- * VFS mount entry point.
- *
- * A remount (MS_REMOUNT set in vfs_flag) only refreshes the temporary
- * properties.  Otherwise the dataset name is taken from the "from"
- * mount option and handed to zfs_domount().
- *
- * Returns EINVAL when the "from" option cannot be fetched, otherwise
- * the result of zfs_refresh_properties() or zfs_domount().
- */
-/*ARGSUSED*/
-static int
-zfs_mount(vfs_t *vfsp, kthread_t *td)
-{
- char *from;
- int error;
-
- /*
- * When doing a remount, we simply refresh our temporary properties
- * according to those options set in the current VFS options.
- */
- if (vfsp->vfs_flag & MS_REMOUNT)
- return (zfs_refresh_properties(vfsp));
-
- /* a non-zero return from vfs_getopt() is reported as EINVAL */
- if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&from, NULL))
- return (EINVAL);
-
- /* zfs_domount() runs between the paired DROP_GIANT/PICKUP_GIANT macros */
- DROP_GIANT();
- error = zfs_domount(vfsp, from, td);
- PICKUP_GIANT();
- return (error);
-}
-
-/*
- * VFS statfs entry point: fill *statp with block and object counts
- * obtained from dmu_objset_space(), plus the fs type name and the
- * mount names copied from vfsp->mnt_stat.  Always returns 0 once the
- * file system has been entered.
- */
-static int
-zfs_statfs(vfs_t *vfsp, struct statfs *statp, kthread_t *td)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- uint64_t refdbytes, availbytes, usedobjs, availobjs;
-
- statp->f_version = STATFS_VERSION;
-
- ZFS_ENTER(zfsvfs);
-
- dmu_objset_space(zfsvfs->z_os,
- &refdbytes, &availbytes, &usedobjs, &availobjs);
-
- /*
- * The underlying storage pool actually uses multiple block sizes.
- * Both f_bsize and f_iosize are reported as the mount's vfs_bsize.
- */
- statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
- statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;
-
- /*
- * The following report "total" blocks of various kinds in the
- * file system, expressed in units of f_bsize.
- */
-
- statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
- statp->f_bfree = availbytes / statp->f_bsize;
- statp->f_bavail = statp->f_bfree; /* no root reservation */
-
- /*
- * statvfs() should really be called statufs(), because it assumes
- * static metadata. ZFS doesn't preallocate files, so the best
- * we can do is report the max that could possibly fit in f_files,
- * and that minus the number actually used in f_ffree.
- * For f_ffree, report the smaller of the number of objects available
- * and the number of blocks (each object will take at least a block).
- */
- statp->f_ffree = MIN(availobjs, statp->f_bfree);
- statp->f_files = statp->f_ffree + usedobjs;
-
- /*
- * We're a zfs filesystem.
- */
- (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
-
- /* mount source and mount point come from the mount's cached mnt_stat */
- strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
- sizeof(statp->f_mntfromname));
- strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
- sizeof(statp->f_mntonname));
-
- statp->f_namemax = ZFS_MAXNAMELEN;
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * VFS root entry point: return the root vnode of the file system.
- *
- * vfsp  - mounted file system; its vfs_data is the zfsvfs_t
- * flags - lock flags handed to vn_lock()
- * vpp   - receives the locked root vnode on success; set to NULL when
- *         the vnode was obtained but could not be locked
- * td    - calling thread (not used here)
- *
- * Returns 0 on success, otherwise the error from zfs_zget() or
- * vn_lock().
- */
-static int
-zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp, kthread_t *td)
-{
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	znode_t *rootzp;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
-
-	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
-	if (error == 0) {
-		vnode_t *vp = ZTOV(rootzp);
-
-		error = vn_lock(vp, flags);
-		if (error == 0) {
-			/* only touch v_vflag once the vnode is locked */
-			vp->v_vflag |= VV_ROOT;
-			*vpp = vp;
-		} else {
-			/*
-			 * The lock could not be taken: release the vnode
-			 * obtained through zfs_zget() instead of leaking
-			 * it, and do not hand an unlocked vnode back.
-			 */
-			VN_RELE(vp);
-			*vpp = NULL;
-		}
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * VFS unmount entry point.
- *
- * vfsp  - file system being unmounted
- * fflag - unmount flags; MS_FORCE selects a forced unmount
- * td    - calling thread; its td_ucred is used for the policy check
- *
- * Order of operations: policy check, name-cache purge, teardown of the
- * .zfs control directory (including snapshots mounted below it), vnode
- * flush, and finally closing the object set and freeing the zfsvfs_t.
- * Returns 0 on success or the first error encountered.
- */
-/*ARGSUSED*/
-static int
-zfs_umount(vfs_t *vfsp, int fflag, kthread_t *td)
-{
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- cred_t *cr = td->td_ucred;
- int ret;
-
- if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
- return (ret);
-
- (void) dnlc_purge_vfsp(vfsp, 0);
-
- /*
- * Unmount any snapshots mounted under .zfs before unmounting the
- * dataset itself.
- */
- if (zfsvfs->z_ctldir != NULL) {
- if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
- return (ret);
- /*
- * NOTE(review): this non-forced flush is expected to report
- * EBUSY, presumably because the control directory vnode is
- * still referenced at this point -- confirm.
- */
- ret = vflush(vfsp, 0, 0, td);
- ASSERT(ret == EBUSY);
- if (!(fflag & MS_FORCE)) {
- /* more than one reference means .zfs is still in use */
- if (zfsvfs->z_ctldir->v_count > 1)
- return (EBUSY);
- ASSERT(zfsvfs->z_ctldir->v_count == 1);
- }
- zfsctl_destroy(zfsvfs);
- ASSERT(zfsvfs->z_ctldir == NULL);
- }
-
- /*
- * Flush all the files.
- */
- ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
- if (ret != 0) {
- /* the unmount is abandoned, so bring the control dir back */
- if (!zfsvfs->z_issnap) {
- zfsctl_create(zfsvfs);
- ASSERT(zfsvfs->z_ctldir != NULL);
- }
- return (ret);
- }
-
- if (fflag & MS_FORCE) {
- MNT_ILOCK(vfsp);
- vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
- MNT_IUNLOCK(vfsp);
- zfsvfs->z_unmounted1 = B_TRUE;
-
- /*
- * Wait for all zfs threads to leave zfs.
- * Grabbing a rwlock as reader in all vops and
- * as writer here doesn't work because it is too easy to get
- * multiple reader enters as zfs can re-enter itself.
- * This can lead to deadlock if there is an intervening
- * rw_enter as writer.
- * So a file system threads ref count (z_op_cnt) is used.
- * A polling loop on z_op_cnt may seem inefficient, but
- * - this saves all threads on exit from having to grab a
- * mutex in order to cv_signal
- * - only occurs on forced unmount in the rare case when
- * there are outstanding threads within the file system.
- */
- while (zfsvfs->z_op_cnt) {
- delay(1);
- }
- }
-
- zfs_objset_close(zfsvfs);
- VFS_RELE(vfsp);
- zfs_freevfs(vfsp);
-
- return (0);
-}
-
-/*
- * VFS vget entry point: look up the vnode for object number ino.
- *
- * vfsp  - mounted file system; its vfs_data is the zfsvfs_t
- * ino   - object number to look up
- * flags - lock flags handed to vn_lock()
- * vpp   - receives the locked vnode on success, NULL on any failure
- *
- * Returns 0 on success, EINVAL when the znode is already unlinked,
- * otherwise the error from zfs_zget() or vn_lock().
- */
-static int
-zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
-{
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	znode_t *zp;
-	int err;
-
-	ZFS_ENTER(zfsvfs);
-	err = zfs_zget(zfsvfs, ino, &zp);
-	if (err == 0 && zp->z_unlinked) {
-		VN_RELE(ZTOV(zp));
-		err = EINVAL;
-	}
-	if (err == 0) {
-		*vpp = ZTOV(zp);
-		/*
-		 * A failed lock must not be reported as success: drop
-		 * the vnode again and pass the error to the caller.
-		 */
-		err = vn_lock(*vpp, flags);
-		if (err != 0)
-			VN_RELE(*vpp);
-	}
-	if (err != 0)
-		*vpp = NULL;
-	ZFS_EXIT(zfsvfs);
-	return (err);
-}
-
-/*
- * VFS fhtovp entry point: translate a file identifier into a vnode.
- *
- * vfsp - mounted file system; its vfs_data is the zfsvfs_t
- * fidp - file identifier; fid_len must be SHORT_FID_LEN or LONG_FID_LEN
- * vpp  - receives the exclusively locked vnode on success, NULL otherwise
- *
- * The object number and generation are stored in the fid as byte
- * arrays and are reassembled least-significant byte first.  A long fid
- * additionally carries an object set id, which redirects the lookup to
- * the zfsvfs_t returned by zfsctl_lookup_objset().
- *
- * Returns 0 on success, EINVAL for a malformed or stale fid, or the
- * error from zfs_zget().
- */
-static int
-zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
-{
- kthread_t *td = curthread;
- zfsvfs_t *zfsvfs = vfsp->vfs_data;
- znode_t *zp;
- uint64_t object = 0;
- uint64_t fid_gen = 0;
- uint64_t gen_mask;
- uint64_t zp_gen;
- int i, err;
-
- *vpp = NULL;
-
- ZFS_ENTER(zfsvfs);
-
- if (fidp->fid_len == LONG_FID_LEN) {
- zfid_long_t *zlfid = (zfid_long_t *)fidp;
- uint64_t objsetid = 0;
- uint64_t setgen = 0;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
-
- /* NOTE(review): setgen is assembled but never used below */
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
-
- /* leave the original fs before switching to the looked-up one */
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
- if (err)
- return (EINVAL);
- ZFS_ENTER(zfsvfs);
- }
-
- if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
- zfid_short_t *zfid = (zfid_short_t *)fidp;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
-
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
- } else {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /* A zero fid_gen means we are in the .zfs control directories */
- if (fid_gen == 0 &&
- (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
- *vpp = zfsvfs->z_ctldir;
- ASSERT(*vpp != NULL);
- if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
- 0, NULL, NULL) == 0);
- } else {
- VN_HOLD(*vpp);
- }
- ZFS_EXIT(zfsvfs);
- /* XXX: LK_RETRY? */
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- return (0);
- }
-
- /*
- * i still holds sizeof (zfid->zf_gen) from the loop above, so the
- * mask keeps exactly as many low-order bytes as the fid stores.
- */
- gen_mask = -1ULL >> (64 - 8 * i);
-
- /* NOTE(review): fid_gen is a uint64_t but is printed with %u */
- dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
- if (err = zfs_zget(zfsvfs, object, &zp)) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- zp_gen = zp->z_phys->zp_gen & gen_mask;
- /* a masked generation of 0 is compared as 1 (0 denotes the .zfs dirs) */
- if (zp_gen == 0)
- zp_gen = 1;
- if (zp->z_unlinked || zp_gen != fid_gen) {
- /* NOTE(review): both arguments are uint64_t but printed with %u */
- dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- VN_RELE(ZTOV(zp));
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- *vpp = ZTOV(zp);
- /* XXX: LK_RETRY? */
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- vnode_create_vobject(*vpp, zp->z_phys->zp_size, td);
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Tear down the object-set side of a mount during unmount: release
- * dbuf holds still recorded on znodes, unregister the property
- * callbacks, switch zfs_inactive to its objset-less behaviour, close
- * the intent log, evict dbufs and finally close the object set.
- */
-static void
-zfs_objset_close(zfsvfs_t *zfsvfs)
-{
- znode_t *zp, *nextzp;
- objset_t *os = zfsvfs->z_os;
-
- /*
- * For forced unmount, at this point all vops except zfs_inactive
- * are erroring EIO. We need to now suspend zfs_inactive threads
- * while we are freeing dbufs before switching zfs_inactive
- * to use behaviour without an objset.
- */
- rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
-
- /*
- * Release all holds on dbufs
- * Note, although we have stopped all other vop threads and
- * zfs_inactive(), the dmu can callback via znode_pageout_func()
- * which can zfs_znode_free() the znode.
- * So we lock z_all_znodes; search the list for a held
- * dbuf; drop the lock (we know zp can't disappear if we hold
- * a dbuf lock); then regrab the lock and restart.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
- nextzp = list_next(&zfsvfs->z_all_znodes, zp);
- if (zp->z_dbuf_held) {
- /* dbufs should only be held when force unmounting */
- zp->z_dbuf_held = 0;
- mutex_exit(&zfsvfs->z_znodes_lock);
- dmu_buf_rele(zp->z_dbuf, NULL);
- /* Start again */
- mutex_enter(&zfsvfs->z_znodes_lock);
- nextzp = list_head(&zfsvfs->z_all_znodes);
- }
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- /*
- * Unregister properties.
- */
- if (!dmu_objset_is_snapshot(os))
- zfs_unregister_callbacks(zfsvfs);
-
- /*
- * Switch zfs_inactive to behaviour without an objset.
- * It just tosses cached pages and frees the znode & vnode.
- * Then re-enable zfs_inactive threads in that new behaviour.
- */
- zfsvfs->z_unmounted2 = B_TRUE;
- rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
-
- /*
- * Close the zil. Can't close the zil while zfs_inactive
- * threads are blocked as zil_close can call zfs_inactive.
- */
- if (zfsvfs->z_log) {
- zil_close(zfsvfs->z_log);
- zfsvfs->z_log = NULL;
- }
-
- /*
- * Evict all dbufs so that cached znodes will be freed.
- * If the first attempt returns non-zero, wait for a sync and
- * try once more, ignoring the second result.
- */
- if (dmu_objset_evict_dbufs(os, 1)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(os, 0);
- }
-
- /*
- * Finally close the objset
- */
- dmu_objset_close(os);
-}
-
-/*
- * Release the zfsvfs_t attached to vfsp: destroy every z_hold_mtx[]
- * entry, the z_um_lock rwlock and the z_znodes_lock mutex, free the
- * structure itself and drop the active file system count by one.
- */
-static void
-zfs_freevfs(vfs_t *vfsp)
-{
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	int idx = 0;
-
-	while (idx != ZFS_OBJ_MTX_SZ) {
-		mutex_destroy(&zfsvfs->z_hold_mtx[idx]);
-		idx++;
-	}
-
-	rw_destroy(&zfsvfs->z_um_lock);
-	mutex_destroy(&zfsvfs->z_znodes_lock);
-	kmem_free(zfsvfs, sizeof (zfsvfs_t));
-
-	atomic_add_32(&zfs_active_fs_count, -1);
-}
-
-#ifdef __i386__
-/* value of desiredvnodes saved by zfs_vnodes_adjust() for later restore */
-static int desiredvnodes_backup;
-#endif
-
-/*
- * On i386 only: remember the current desiredvnodes and, if it still
- * equals the default computed below, lower it to three quarters of its
- * value.  On other architectures this function does nothing.
- */
-static void
-zfs_vnodes_adjust(void)
-{
-#ifdef __i386__
- int val;
-
- desiredvnodes_backup = desiredvnodes;
-
- /*
- * We calculate newdesiredvnodes the same way it is done in
- * vntblinit(). If it is equal to desiredvnodes, it means that
- * it wasn't tuned by the administrator and we can tune it down.
- */
- val = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
- (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
- if (desiredvnodes == val)
- desiredvnodes = (3 * desiredvnodes) / 4;
-#endif
-}
-
-/*
- * On i386 only: restore desiredvnodes to the value saved in
- * desiredvnodes_backup.  The saved value is written back
- * unconditionally, whatever desiredvnodes currently holds.
- */
-static void
-zfs_vnodes_adjust_back(void)
-{
-
-#ifdef __i386__
- desiredvnodes = desiredvnodes_backup;
-#endif
-}
-
-/*
- * One-time initialization of the ZFS file system layer: print the
- * version banner, then set up the .zfs control directory support, the
- * znode layer and the vnode limit adjustment, in that order.
- */
-void
-zfs_init(void)
-{
-
- printf("ZFS filesystem version " ZFS_VERSION_STRING "\n");
-
- /*
- * Initialize .zfs directory structures
- */
- zfsctl_init();
-
- /*
- * Initialize znode cache, vnode ops, etc...
- */
- zfs_znode_init();
-
- /*
- * Reduce number of vnodes. Originally number of vnodes is calculated
- * with UFS inode in mind. We reduce it here, because it's too big for
- * ZFS/i386.
- */
- zfs_vnodes_adjust();
-}
-
-/*
- * Counterpart of zfs_init(): tear down the .zfs control directory
- * support and the znode layer, then restore the vnode limit that
- * zfs_vnodes_adjust() may have lowered.
- */
-void
-zfs_fini(void)
-{
- zfsctl_fini();
- zfs_znode_fini();
- zfs_vnodes_adjust_back();
-}
-
-/*
- * Report whether any ZFS file system is still counted as active.
- * Returns 1 while zfs_active_fs_count is non-zero, 0 otherwise.
- */
-int
-zfs_busy(void)
-{
-	if (zfs_active_fs_count == 0)
-		return (0);
-
-	return (1);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
deleted file mode 100644
index 088103a..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ /dev/null
@@ -1,3623 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/kmem.h>
-#include <sys/taskq.h>
-#include <sys/uio.h>
-#include <sys/atomic.h>
-#include <sys/namei.h>
-#include <sys/mman.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/zfs_vfsops.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/fs/zfs.h>
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/dbuf.h>
-#include <sys/zap.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/sunddi.h>
-#include <sys/filio.h>
-#include <sys/zfs_ctldir.h>
-#include <sys/dnlc.h>
-#include <sys/zfs_rlock.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/sf_buf.h>
-#include <sys/sched.h>
-
-/*
- * Programming rules.
- *
- * Each vnode op performs some logical unit of work. To do this, the ZPL must
- * properly lock its in-core state, create a DMU transaction, do the work,
- * record this work in the intent log (ZIL), commit the DMU transaction,
- * and wait for the intent log to commit if it is a synchronous operation.
- * Moreover, the vnode ops must work in both normal and log replay context.
- * The ordering of events is important to avoid deadlocks and references
- * to freed memory. The example below illustrates the following Big Rules:
- *
- * (1) A check must be made in each zfs thread for a mounted file system.
- * This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns.
- *
- * (2) VN_RELE() should always be the last thing except for zil_commit()
- * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
- * First, if it's the last reference, the vnode/znode
- * can be freed, so the zp may point to freed memory. Second, the last
- * reference will call zfs_zinactive(), which may induce a lot of work --
- * pushing cached pages (which acquires range locks) and syncing out
- * cached atime changes. Third, zfs_zinactive() may require a new tx,
- * which could deadlock the system if you were already holding one.
- *
- * (3) All range locks must be grabbed before calling dmu_tx_assign(),
- * as they can span dmu_tx_assign() calls.
- *
- * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
- * In normal operation, this will be TXG_NOWAIT. During ZIL replay,
- * it will be a specific txg. Either way, dmu_tx_assign() never blocks.
- * This is critical because we don't want to block while holding locks.
- * Note, in particular, that if a lock is sometimes acquired before
- * the tx assigns, and sometimes after (e.g. z_lock), then failing to
- * use a non-blocking assign can deadlock the system. The scenario:
- *
- * Thread A has grabbed a lock before calling dmu_tx_assign().
- * Thread B is in an already-assigned tx, and blocks for this lock.
- * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
- * forever, because the previous txg can't quiesce until B's tx commits.
- *
- * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- * then drop all locks, call dmu_tx_wait(), and try again.
- *
- * (5) If the operation succeeded, generate the intent log entry for it
- * before dropping locks. This ensures that the ordering of events
- * in the intent log matches the order in which they actually occurred.
- *
- * (6) At the end of each vnode op, the DMU tx must always commit,
- * regardless of whether there were any errors.
- *
- * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid)
- * to ensure that synchronous semantics are provided when necessary.
- *
- * In general, this is how things should be ordered in each vnode op:
- *
- * ZFS_ENTER(zfsvfs); // exit if unmounted
- * top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
- * rw_enter(...); // grab any other locks you need
- * tx = dmu_tx_create(...); // get DMU tx
- * dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign
- * if (error) {
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- * dmu_tx_wait(tx);
- * dmu_tx_abort(tx);
- * goto top;
- * }
- * dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // really out of space
- * }
- * error = do_real_work(); // do whatever this VOP does
- * if (error == 0)
- * zfs_log_*(...); // on success, make ZIL entry
- * dmu_tx_commit(tx); // commit DMU tx -- error or not
- * rw_exit(...); // drop locks
- * zfs_dirent_unlock(dl); // unlock directory entry
- * VN_RELE(...); // release held vnodes
- * zil_commit(zilog, seq, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
- * return (error); // done, report error
- */
-/*
- * Open a file. The only work done here is bookkeeping: when the file is
- * opened with FSYNC or FDSYNC the znode's count of synchronous opens is
- * bumped. Always returns 0.
- */
-/* ARGSUSED */
-static int
-zfs_open(vnode_t **vpp, int flag, cred_t *cr)
-{
-	znode_t *node = VTOZ(*vpp);
-
-	if ((flag & (FSYNC | FDSYNC)) != 0) {
-		/* Track synchronous opens in the znode. */
-		atomic_inc_32(&node->z_sync_cnt);
-	}
-	return (0);
-}
-
-/*
- * Close a file: undo the synchronous-open accounting done by zfs_open()
- * and clean up the locks and shares this process holds on the vnode.
- * Always returns 0.
- */
-/* ARGSUSED */
-static int
-zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
-{
-	znode_t *node = VTOZ(vp);
-
-	if ((flag & (FSYNC | FDSYNC)) != 0) {
-		/* One fewer synchronous open on this znode. */
-		atomic_dec_32(&node->z_sync_cnt);
-	}
-
-	/* Clean up any locks held by this process on the vp. */
-	cleanlocks(vp, ddi_get_pid(), 0);
-	cleanshares(vp, ddi_get_pid());
-
-	return (0);
-}
-
-/*
- * lseek() helper that locates the next hole (cmd == _FIO_SEEK_HOLE) or
- * the next data region (cmd == _FIO_SEEK_DATA) at or after *off.
- *
- * "off" is in/out: on success it is advanced to the offset found (it is
- * never moved backwards). ENXIO is returned when *off is at or beyond the
- * file size, or when data is requested and none follows. A hole request
- * that runs off the end reports the virtual hole at end of file.
- */
-static int
-zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
-{
-	znode_t *zp = VTOZ(vp);
-	uint64_t next = (uint64_t)*off;	/* candidate new offset */
-	uint64_t size = zp->z_phys->zp_size;
-	boolean_t want_hole;
-	int rc;
-
-	if (next >= size)
-		return (ENXIO);
-
-	want_hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
-
-	rc = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, want_hole, &next);
-
-	if (rc != ESRCH && next <= size) {
-		/* Found something inside the file; never seek backwards. */
-		if (next >= *off)
-			*off = next;
-		return (rc);
-	}
-
-	/* Ran off the end of the file. */
-	if (!want_hole)
-		return (ENXIO);
-
-	/* Handle the virtual hole at the end of file. */
-	*off = size;
-	return (0);
-}
-
-/*
- * Handle the small set of ioctls ZFS accepts on a vnode.
- *
- * _FIOFFS, _FIOGDIO and _FIOSDIO are accepted and ignored (the latter two
- * are used by bfu and are faked out to avoid bfu errors).
- * _FIO_SEEK_DATA and _FIO_SEEK_HOLE copy an offset in from "data", hand it
- * to zfs_holey() and copy the updated offset back out.
- * Every other command returns ENOTTY.
- */
-/* ARGSUSED */
-static int
-zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
-    int *rvalp)
-{
-	zfsvfs_t *fs;
-	offset_t pos;
-	int rc;
-
-	switch (com) {
-	case _FIOFFS:
-	case _FIOGDIO:
-	case _FIOSDIO:
-		/* Nothing to do; report success. */
-		return (0);
-
-	case _FIO_SEEK_DATA:
-	case _FIO_SEEK_HOLE:
-		if (ddi_copyin((void *)data, &pos, sizeof (pos), flag))
-			return (EFAULT);
-
-		fs = VTOZ(vp)->z_zfsvfs;
-		ZFS_ENTER(fs);
-
-		/* The offset parameter is in/out. */
-		rc = zfs_holey(vp, com, &pos);
-		ZFS_EXIT(fs);
-
-		if (rc == 0 &&
-		    ddi_copyout(&pos, (void *)data, sizeof (pos), flag))
-			rc = EFAULT;
-		return (rc);
-
-	default:
-		break;
-	}
-	return (ENOTTY);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Write: If we find a memory mapped page, we write to *both*
- * the page and the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- *
- * IN: vp - vnode of the file being written; its v_object is used.
- * nbytes - number of bytes to take from uio.
- * uio - supplies the data and the starting file offset.
- * tx - DMU transaction all DMU writes are issued in.
- *
- * RETURN: 0 if success
- * error code from dmu_write_uio() or uiomove() if failure
- */
-static int
-mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
-{
- znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
- vm_object_t obj;
- vm_page_t m;
- struct sf_buf *sf;
- int64_t start, off;
- int len = nbytes;
- int error = 0;
- uint64_t dirbytes;
-
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
-
- /*
- * "off" is the offset into the first page only; it is reset to 0 at
- * the end of each iteration. "dirbytes" accumulates the length of
- * ranges for which no valid page was found, so they can be written
- * to the DMU with a single dmu_write_uio() call.
- */
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- dirbytes = 0;
- VM_OBJECT_LOCK(obj);
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- uint64_t bytes = MIN(PAGESIZE - off, len);
- uint64_t fsize;
-
-again:
- if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
- vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
- uint64_t woff;
- caddr_t va;
-
- /* The page may have changed while we slept; look it up again. */
- if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
- goto again;
- fsize = obj->un_pager.vnp.vnp_size;
- vm_page_busy(m);
- vm_page_lock_queues();
- vm_page_undirty(m);
- vm_page_unlock_queues();
- VM_OBJECT_UNLOCK(obj);
- /*
- * Flush the bytes accumulated for preceding page-less
- * ranges first, so that uio is consumed in file order.
- */
- if (dirbytes > 0) {
- error = dmu_write_uio(os, zp->z_id, uio,
- dirbytes, tx);
- dirbytes = 0;
- }
- if (error == 0) {
- sched_pin();
- sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
- va = (caddr_t)sf_buf_kva(sf);
- /* woff: uio offset rounded down by the in-page offset. */
- woff = uio->uio_loffset - off;
- error = uiomove(va + off, bytes, UIO_WRITE, uio);
- /*
- * The uiomove() above could have been partially
- * successful, that's why we call dmu_write()
- * below unconditionally. The page was marked
- * non-dirty above and we would lose the changes
- * without doing so. If the uiomove() failed
- * entirely, well, we just write what we got
- * before one more time.
- */
- dmu_write(os, zp->z_id, woff,
- MIN(PAGESIZE, fsize - woff), va, tx);
- sf_buf_free(sf);
- sched_unpin();
- }
- VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- } else {
- /*
- * No valid page covers this range.
- * NOTE(review): presumably this drops a cached copy of
- * the page so a stale version is not reused later --
- * confirm against vm_page_cache_free().
- */
- if (__predict_false(obj->cache != NULL)) {
- vm_page_cache_free(obj, OFF_TO_IDX(start),
- OFF_TO_IDX(start) + 1);
- }
- dirbytes += bytes;
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- VM_OBJECT_UNLOCK(obj);
- /* Write out whatever page-less range is still pending. */
- if (error == 0 && dirbytes > 0)
- error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
- return (error);
-}
-
-/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Read: We "read" preferentially from memory mapped pages,
- * else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
- *
- * IN: vp - vnode of the file being read; its v_object is used.
- * nbytes - number of bytes to transfer.
- * uio - supplies the starting file offset and the destination.
- *
- * RETURN: 0 if success
- * error code from dmu_read_uio(), dmu_read() or uiomove() if failure
- */
-static int
-mappedread(vnode_t *vp, int nbytes, uio_t *uio)
-{
- znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
- vm_object_t obj;
- vm_page_t m;
- struct sf_buf *sf;
- int64_t start, off;
- caddr_t va;
- int len = nbytes;
- int error = 0;
- uint64_t dirbytes;
-
- ASSERT(vp->v_mount != NULL);
- obj = vp->v_object;
- ASSERT(obj != NULL);
-
- /*
- * "off" is the offset into the first page only; it is reset to 0 at
- * the end of each iteration. "dirbytes" accumulates the length of
- * ranges that have to be read from the DMU, so they can be fetched
- * with a single dmu_read_uio() call.
- */
- start = uio->uio_loffset;
- off = start & PAGEOFFSET;
- dirbytes = 0;
- VM_OBJECT_LOCK(obj);
- for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
- uint64_t bytes = MIN(PAGESIZE - off, len);
-
-again:
- if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
- vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
- /* Valid page found: copy out of the page itself. */
- if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
- goto again;
- vm_page_busy(m);
- VM_OBJECT_UNLOCK(obj);
- /* Satisfy the pending DMU-only range first (file order). */
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0) {
- sched_pin();
- sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
- va = (caddr_t)sf_buf_kva(sf);
- error = uiomove(va + off, bytes, UIO_READ, uio);
- sf_buf_free(sf);
- sched_unpin();
- }
- VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
- /*
- * The code below is here to make sendfile(2) work
- * correctly with ZFS. As pointed out by ups@
- * sendfile(2) should be changed to use VOP_GETPAGES(),
- * but that would pessimize performance of sendfile/UFS,
- * which is why this special case is handled in ZFS code.
- *
- * The page exists but is not valid for this range, so
- * the data is read from the DMU directly into the page.
- */
- if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
- goto again;
- vm_page_busy(m);
- VM_OBJECT_UNLOCK(obj);
- if (dirbytes > 0) {
- error = dmu_read_uio(os, zp->z_id, uio,
- dirbytes);
- dirbytes = 0;
- }
- if (error == 0) {
- sched_pin();
- sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
- va = (caddr_t)sf_buf_kva(sf);
- error = dmu_read(os, zp->z_id, start + off,
- bytes, (void *)(va + off));
- sf_buf_free(sf);
- sched_unpin();
- }
- VM_OBJECT_LOCK(obj);
- vm_page_wakeup(m);
- /*
- * NOTE(review): only uio_resid is adjusted here;
- * uio_loffset is not advanced in this branch. Confirm
- * that UIO_NOCOPY callers do not depend on it.
- */
- if (error == 0)
- uio->uio_resid -= bytes;
- } else {
- dirbytes += bytes;
- }
- len -= bytes;
- off = 0;
- if (error)
- break;
- }
- VM_OBJECT_UNLOCK(obj);
- /* Read whatever DMU-only range is still pending. */
- if (error == 0 && dirbytes > 0)
- error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
- return (error);
-}
-
-/*
- * zfs_read() never lets a single mappedread()/dmu_read_uio() call cross
- * a multiple of this many bytes.
- */
-offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
-
-/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: vp - vnode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - SYNC flags; used to provide FRSYNC semantics.
- * cr - credentials of caller.
- * ct - caller context, passed through to chklock().
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Side Effects:
- * vp - atime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os = zfsvfs->z_os;
- ssize_t n, nbytes;
- int error;
- rl_t *rl;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
- if (error = chklock(vp, FREAD,
- uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- */
- if (ioflag & FRSYNC)
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_phys->zp_size) {
- error = 0;
- goto out;
- }
-
- /* Clip the request to the bytes that exist before end-of-file. */
- ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
- n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
-
- while (n > 0) {
- /* Stop each transfer at the next zfs_read_chunk_size boundary. */
- nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
-
- /* Go through the VM pages when vn_has_cached_data() says so. */
- if (vn_has_cached_data(vp))
- error = mappedread(vp, nbytes, uio);
- else
- error = dmu_read_uio(os, zp->z_id, uio, nbytes);
- if (error)
- break;
-
- n -= nbytes;
- }
-
-out:
- zfs_range_unlock(rl);
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Best-effort pre-faulting of the user pages backing the first n bytes
- * described by uio. One byte per page is read with fubyte(), plus the last
- * byte of every segment in case the segment straddles a page boundary.
- * The uio itself is left unmodified, and the first failing fubyte() simply
- * ends the attempt. Only UIO_USERSPACE uios are touched.
- * This is a copy of ufs_trans_touch().
- */
-static void
-zfs_prefault_write(ssize_t n, struct uio *uio)
-{
-	struct iovec *seg;
-	ulong_t left, step;
-	caddr_t addr;
-
-	if (uio->uio_segflg != UIO_USERSPACE)
-		return;
-
-	for (seg = uio->uio_iov; n != 0; seg++) {
-		left = MIN(seg->iov_len, n);
-		if (left == 0)
-			continue;	/* empty iov entry */
-		n -= left;
-
-		/* Touch each page in this segment. */
-		for (addr = seg->iov_base; left != 0;
-		    addr += step, left -= step) {
-			if (fubyte(addr) == -1)
-				return;
-			step = MIN(left, PAGESIZE);
-		}
-
-		/* Touch the last byte in case it straddles a page. */
-		addr--;
-		if (fubyte(addr) == -1)
-			return;
-	}
-}
-
-/*
- * Write the bytes to a file.
- *
- * IN: vp - vnode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - IO_APPEND flag set if in append mode.
- * cr - credentials of caller.
- * ct - caller context, passed through to chklock().
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 if success (including a partial write outside replay)
- * error code if failure
- *
- * Timestamps:
- * vp - ctime|mtime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- rlim64_t limit = MAXOFFSET_T;
- ssize_t start_resid = uio->uio_resid;
- ssize_t tx_bytes;
- uint64_t end_size;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- offset_t woff;
- ssize_t n, nbytes;
- rl_t *rl;
- int max_blksz = zfsvfs->z_max_blksz;
- int error;
-
- /*
- * Fasttrack empty write
- */
- n = start_resid;
- if (n == 0)
- return (0);
-
- /*
- * "limit" is initialized to MAXOFFSET_T above and nothing changes it
- * before this point, so this check cannot alter it as written.
- */
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- */
- zfs_prefault_write(n, uio);
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- if (ioflag & IO_APPEND) {
- /*
- * Range lock for a file append:
- * The value for the start of range will be determined by
- * zfs_range_lock() (to guarantee append semantics).
- * If this write will cause the block size to increase,
- * zfs_range_lock() will lock the entire file, so we must
- * later reduce the range after we grow the block size.
- */
- rl = zfs_range_lock(zp, 0, n, RL_APPEND);
- if (rl->r_len == UINT64_MAX) {
- /* overlocked, zp_size can't change */
- woff = uio->uio_loffset = zp->z_phys->zp_size;
- } else {
- woff = uio->uio_loffset = rl->r_off;
- }
- } else {
- woff = uio->uio_loffset;
- /*
- * Validate file offset
- */
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * If we need to grow the block size then zfs_range_lock()
- * will lock a wider range than we request here.
- * Later after growing the block size we reduce the range.
- */
- rl = zfs_range_lock(zp, woff, n, RL_WRITER);
- }
-
- if (woff >= limit) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (EFBIG);
- }
-
- /* Clip the write so that it ends at "limit". */
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /*
- * Check for mandatory locks
- */
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- /* File size once the whole write has completed. */
- end_size = MAX(zp->z_phys->zp_size, woff + n);
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- /*
- * Start a transaction.
- */
- woff = uio->uio_loffset;
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- /*
- * ERESTART with TXG_NOWAIT: wait, abort this tx and
- * retry the same chunk. n and uio are untouched, so
- * "continue" restarts this iteration from scratch.
- */
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- continue;
- }
- dmu_tx_abort(tx);
- break;
- }
-
- /*
- * If zfs_range_lock() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since zfs_range_reduce() will
- * shrink down r_len to the appropriate size.
- */
- if (rl->r_len == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_range_reduce(rl, woff, n);
- }
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
- /* Extend the pager's idea of the file size before writing. */
- if (woff + nbytes > zp->z_phys->zp_size)
- vnode_pager_setsize(vp, woff + nbytes);
-
- rw_enter(&zp->z_map_lock, RW_READER);
-
- /* tx_bytes becomes the number of bytes uio actually consumed. */
- tx_bytes = uio->uio_resid;
- if (vn_has_cached_data(vp)) {
- /* z_map_lock is dropped before mappedwrite() is called. */
- rw_exit(&zp->z_map_lock);
- error = mappedwrite(vp, nbytes, uio, tx);
- } else {
- error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
- uio, nbytes, tx);
- rw_exit(&zp->z_map_lock);
- }
- tx_bytes -= uio->uio_resid;
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the execute bits is set.
- *
- * It would be nice to do this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- */
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- (zp->z_phys->zp_mode & S_ISUID) != 0 &&
- zp->z_phys->zp_uid == 0) != 0) {
- zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
- }
- mutex_exit(&zp->z_acl_lock);
-
- /*
- * Update time stamp. NOTE: This marks the bonus buffer as
- * dirty, so we don't have to do it again for zp_size.
- */
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
- (void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
- uio->uio_loffset);
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
- }
-
- zfs_range_unlock(rl);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /* Synchronous write: commit the intent log before returning. */
- if (ioflag & (FSYNC | FDSYNC))
- zil_commit(zilog, zp->z_last_itx, zp->z_id);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Completion callback handed to dmu_sync() by zfs_get_data().
- *
- * Under the VFS Giant lock (when VFS_LOCK_GIANT() takes it) this releases
- * the dbuf hold, drops the range lock, releases the vnode, records the
- * vdev of the written block with zil_add_vdev() and frees the zgd_t that
- * carried the state.
- */
-void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
-{
-	zgd_t *state = (zgd_t *)vzgd;
-	rl_t *range = state->zgd_rl;
-	vnode_t *node = ZTOV(range->r_zp);
-	int giant;
-
-	giant = VFS_LOCK_GIANT(node->v_vfsp);
-
-	/* vzgd doubles as the tag the dbuf hold was taken with. */
-	dmu_buf_rele(db, vzgd);
-	zfs_range_unlock(range);
-	VN_RELE(node);
-	zil_add_vdev(state->zgd_zilog,
-	    DVA_GET_VDEV(BP_IDENTITY(state->zgd_bp)));
-	kmem_free(state, sizeof (zgd_t));
-
-	VFS_UNLOCK_GIANT(giant);
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- *
- * IN: arg - the zfsvfs_t of the file system.
- * lr - write log record; lr_foid, lr_offset and lr_length
- * identify the data.
- * buf - destination for an immediate write, or NULL to request
- * an indirect write.
- * zio - parent zio handed to dmu_sync() for indirect writes.
- *
- * OUT: buf - filled with the file data (immediate write).
- * lr - lr_blkoff is set and lr_blkptr is passed to dmu_sync()
- * (indirect write).
- *
- * RETURN: 0 if success, or if dmu_sync() returned EINPROGRESS, in which
- * case cleanup is deferred to zfs_get_done()
- * ENOENT if the file is gone, unlinked or truncated below lr_offset
- * other error code from dmu_sync()
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t off = lr->lr_offset;
- dmu_buf_t *db;
- rl_t *rl;
- zgd_t *zgd;
- int dlen = lr->lr_length; /* length of user data */
- int error = 0;
-
- ASSERT(zio);
- ASSERT(dlen != 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
- return (ENOENT);
- if (zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
- return (ENOENT);
- }
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- rl = zfs_range_lock(zp, off, dlen, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
- error = ENOENT;
- goto out;
- }
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
- } else { /* indirect write */
- uint64_t boff; /* block starting offset */
-
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- if (ISP2(zp->z_blksz)) {
- boff = P2ALIGN_TYPED(off, zp->z_blksz,
- uint64_t);
- } else {
- boff = 0;
- }
- dlen = zp->z_blksz;
- rl = zfs_range_lock(zp, boff, dlen, RL_READER);
- if (zp->z_blksz == dlen)
- break;
- zfs_range_unlock(rl);
- }
- /* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
- error = ENOENT;
- goto out;
- }
- /*
- * zgd carries the range lock, zilog and block pointer to
- * zfs_get_done(); it is also used as the tag of the dbuf hold.
- */
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_rl = rl;
- zgd->zgd_zilog = zfsvfs->z_log;
- zgd->zgd_bp = &lr->lr_blkptr;
- VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
- ASSERT(boff == db->db_offset);
- lr->lr_blkoff = off - boff;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zfs_get_done, zgd);
- ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
- if (error == 0) {
- zil_add_vdev(zfsvfs->z_log,
- DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
- }
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zfs_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- /* Not in progress: release the hold and the zgd ourselves. */
- dmu_buf_rele(db, zgd);
- kmem_free(zgd, sizeof (zgd_t));
- }
-out:
- zfs_range_unlock(rl);
- VN_RELE(ZTOV(zp));
- return (error);
-}
-
-/*
- * Check whether the caller identified by cr may access vp with the
- * requested mode. The decision is made by zfs_zaccess_rwx(), called
- * between ZFS_ENTER() and ZFS_EXIT(); its result is returned unchanged.
- */
-/*ARGSUSED*/
-static int
-zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
-{
-	znode_t *node = VTOZ(vp);
-	zfsvfs_t *fs = node->z_zfsvfs;
-	int rc;
-
-	ZFS_ENTER(fs);
-	rc = zfs_zaccess_rwx(node, mode, cr);
-	ZFS_EXIT(fs);
-
-	return (rc);
-}
-
-/*
- * Lookup an entry in a directory, or an extended attribute directory.
- * If it exists, return a held vnode reference for it.
- *
- * IN: dvp - vnode of directory to search.
- * nm - name of entry to lookup.
- * cnp - componentname of the lookup; cn_flags and cn_lkflags are
- * read, and SAVENAME may be set in cn_flags.
- * nameiop - namei operation; CREATE, RENAME and DELETE are
- * special-cased for the last path component.
- * cr - credentials of caller.
- * td - calling thread [UNUSED].
- *
- * OUT: vpp - vnode of located entry, NULL if not found. Unless the
- * name is ".", the vnode is returned locked according to
- * cnp->cn_lkflags.
- *
- * RETURN: 0 if success
- * EJUSTRETURN if the last component does not exist and nameiop
- * is CREATE or RENAME
- * error code if failure
- *
- * Timestamps:
- * NA
- *
- * NOTE: the LOOKUP_XATTR handling under "#ifdef TODO" refers to a "flags"
- * variable that is not a parameter of this function.
- */
-/* ARGSUSED */
-static int
-zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
- int nameiop, cred_t *cr, kthread_t *td)
-{
-
- znode_t *zdp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- *vpp = NULL;
-
-#ifdef TODO
- if (flags & LOOKUP_XATTR) {
- /*
- * If the xattr property is off, refuse the lookup request.
- */
- if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * We don't allow recursive attributes..
- * Maybe someday we will.
- */
- if (zdp->z_phys->zp_flags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Do we have permission to get into attribute directory?
- */
-
- if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
- VN_RELE(*vpp);
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-#endif /* TODO */
-
- if (dvp->v_type != VDIR) {
- ZFS_EXIT(zfsvfs);
- return (ENOTDIR);
- }
-
- /*
- * Check accessibility of directory.
- */
-
- if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
-
- /*
- * Convert device special files
- */
- if (IS_DEVVP(*vpp)) {
- vnode_t *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- /*
- * NOTE(review): when specvp() returns NULL, *vpp is
- * left pointing at the vnode just released above;
- * zfs_create() stores svp unconditionally instead.
- */
- if (svp == NULL)
- error = ENOSYS;
- else
- *vpp = svp;
- }
- }
-
- ZFS_EXIT(zfsvfs);
-
- /* Translate errors and add SAVENAME when needed. */
- if (cnp->cn_flags & ISLASTCN) {
- switch (nameiop) {
- case CREATE:
- case RENAME:
- if (error == ENOENT) {
- error = EJUSTRETURN;
- cnp->cn_flags |= SAVENAME;
- break;
- }
- /* FALLTHROUGH */
- case DELETE:
- if (error == 0)
- cnp->cn_flags |= SAVENAME;
- break;
- }
- }
- /* Lock the result, except when the name is "." (nm is exactly "."). */
- if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
- int ltype = 0;
-
- /*
- * For "..", dvp is unlocked while the result is being locked
- * and then relocked with the lock type it had before.
- */
- if (cnp->cn_flags & ISDOTDOT) {
- ltype = VOP_ISLOCKED(dvp);
- VOP_UNLOCK(dvp, 0);
- }
- error = vn_lock(*vpp, cnp->cn_lkflags);
- if (cnp->cn_flags & ISDOTDOT)
- vn_lock(dvp, ltype | LK_RETRY);
- if (error != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- return (error);
- }
- }
-
-#ifdef FREEBSD_NAMECACHE
- /*
- * Insert name into cache (as non-existent) if appropriate.
- */
- if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
- cache_enter(dvp, *vpp, cnp);
- /*
- * Insert name into cache if appropriate.
- */
- if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
- if (!(cnp->cn_flags & ISLASTCN) ||
- (nameiop != DELETE && nameiop != RENAME)) {
- cache_enter(dvp, *vpp, cnp);
- }
- }
-#endif
-
- return (error);
-}
-
-/*
- * Attempt to create a new entry in a directory. If the entry
- * already exists, truncate the file if permissible, else return
- * an error. Return the vp of the created or trunc'd file.
- *
- * IN: dvp - vnode of directory to put new file entry in.
- * name - name of new file entry; an empty name refers to the
- * directory itself.
- * vap - attributes of new file.
- * excl - flag indicating exclusive or non-exclusive mode.
- * mode - mode to open file with.
- * cr - credentials of caller.
- *
- * OUT: vpp - vnode of created or trunc'd entry; on success it is
- * locked with LK_EXCLUSIVE.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime updated if new entry created
- * vp - ctime|mtime always, atime if new
- */
-/* ARGSUSED */
-static int
-zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
- vnode_t **vpp, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- objset_t *os = zfsvfs->z_os;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
- uint64_t zoid;
-
- ZFS_ENTER(zfsvfs);
-
- /* Restart point after an ERESTART from tx assignment or truncation. */
-top:
- *vpp = NULL;
-
- /* Drop the sticky bit when secpolicy_vnode_stky_modify() objects. */
- if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
- vap->va_mode &= ~VSVTX;
-
- if (*name == '\0') {
- /*
- * Null component name refers to the directory itself.
- */
- VN_HOLD(dvp);
- zp = dzp;
- dl = NULL;
- error = 0;
- } else {
- /* possible VN_HOLD(zp) */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
- if (strcmp(name, "..") == 0)
- error = EISDIR;
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- zoid = zp ? zp->z_id : -1ULL;
-
- if (zp == NULL) {
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
- goto out;
- }
-
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
- if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
- (vap->va_type != VREG)) {
- error = EINVAL;
- goto out;
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- /* zp is NULL here, so only the dirent lock is dropped. */
- zfs_dirent_unlock(dl);
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
- ASSERT(zp->z_id == zoid);
- (void) zfs_link_create(dl, zp, tx, ZNEW);
- zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
- dmu_tx_commit(tx);
- } else {
- /*
- * A directory entry already exists for this name.
- */
- /*
- * Can't truncate an existing file if in exclusive mode.
- */
- if (excl == EXCL) {
- error = EEXIST;
- goto out;
- }
- /*
- * Can't open a directory for writing.
- */
- if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
- error = EISDIR;
- goto out;
- }
- /*
- * Verify requested access to file.
- */
- if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
- goto out;
- }
-
- mutex_enter(&dzp->z_lock);
- dzp->z_seq++;
- mutex_exit(&dzp->z_lock);
-
- /*
- * Truncate regular files if requested.
- */
- if ((ZTOV(zp)->v_type == VREG) &&
- (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
- error = zfs_freesp(zp, 0, 0, mode, TRUE);
- if (error == ERESTART &&
- zfsvfs->z_assign == TXG_NOWAIT) {
- /* NB: we already did dmu_tx_wait() */
- zfs_dirent_unlock(dl);
- VN_RELE(ZTOV(zp));
- goto top;
- }
- }
- }
-out:
-
- /* On success hand back the vnode, locked exclusively. */
- if (error == 0) {
- *vpp = ZTOV(zp);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- }
-
- if (dl)
- zfs_dirent_unlock(dl);
-
- if (error) {
- if (zp)
- VN_RELE(ZTOV(zp));
- } else {
- /* Same value as already stored in *vpp above. */
- *vpp = ZTOV(zp);
- /*
- * If vnode is for a device return a specfs vnode instead.
- */
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL) {
- error = ENOSYS;
- }
- *vpp = svp;
- }
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Remove an entry from a directory.
- *
- * IN: dvp - vnode of directory to remove entry from.
- * name - name of entry to remove.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Fails with EPERM if the entry is a directory (rmdir must be used
- * instead).
- *
- * In this version may_delete_now is forced to FALSE and the check that
- * would set delete_now is written as "0 && unlinked", so delete_now
- * never becomes true: when the last link goes away the znode is always
- * handed to zfs_unlinked_add() rather than deleted inline.
- *
- * Timestamps:
- * dvp - ctime|mtime
- * vp - ctime (if nlink > 0)
- */
-static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- znode_t *xzp = NULL;
- vnode_t *vp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- uint64_t acl_obj, xattr_obj;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
- boolean_t unlinked;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
-top: /* restart point after dmu_tx_wait() on ERESTART */
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- /*
- * Need to use rmdir for removing directories.
- */
- if (vp->v_type == VDIR) {
- error = EPERM;
- goto out;
- }
-
- vnevent_remove(vp);
-
- dnlc_remove(dvp, name);
-
- may_delete_now = FALSE; /* immediate deletion is disabled in this version */
-
- /*
- * We may delete the znode now, or we may put it in the unlinked set;
- * it depends on whether we're the last link, and on whether there are
- * other holds on the vnode. So we dmu_tx_hold() the right things to
- * allow for either case.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
- if (may_delete_now) /* always false here, so this hold is never taken */
- dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
-
- /* are there any extended attributes? */
- if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
- /* XXX - do we need this if we are deleting? */
- dmu_tx_hold_bonus(tx, xattr_obj);
- }
-
- /* are there any additional acls */
- if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
- may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-
- /* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp); /* drop our hold on the entry's vnode before retrying or returning */
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Remove the directory entry.
- * "unlinked" is filled in by zfs_link_destroy().
- */
- error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
-
- if (error) {
- dmu_tx_commit(tx); /* the assigned tx is committed even though the unlink failed */
- goto out;
- }
-
- if (0 && unlinked) { /* constant-false condition: this block never runs */
- VI_LOCK(vp);
- delete_now = may_delete_now &&
- vp->v_count == 1 && !vn_has_cached_data(vp) &&
- zp->z_phys->zp_xattr == xattr_obj &&
- zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
- VI_UNLOCK(vp);
- }
-
- if (delete_now) { /* not reachable while delete_now stays FALSE */
- if (zp->z_phys->zp_xattr) {
- error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
- ASSERT3U(error, ==, 0);
- ASSERT3U(xzp->z_phys->zp_links, ==, 2);
- dmu_buf_will_dirty(xzp->z_dbuf, tx);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = 1;
- xzp->z_phys->zp_links = 0;
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
- zp->z_phys->zp_xattr = 0; /* probably unnecessary */
- }
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- vp->v_count--;
- ASSERT3U(vp->v_count, ==, 0);
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- zfs_znode_delete(zp, tx);
- VFS_RELE(zfsvfs->z_vfs);
- } else if (unlinked) {
- zfs_unlinked_add(zp, tx); /* defer the actual deletion via the unlinked set */
- }
-
- zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
-
- dmu_tx_commit(tx);
-out:
- zfs_dirent_unlock(dl);
-
- if (!delete_now) {
- VN_RELE(vp);
- } else if (xzp) {
- /* this rele delayed to prevent nesting transactions */
- VN_RELE(ZTOV(xzp));
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Create a new directory and insert it into dvp using the name
- * provided. Return a pointer to the inserted directory.
- *
- * IN: dvp - vnode of directory to add subdir to.
- * dirname - name of new directory.
- * vap - attributes of new directory.
- * cr - credentials of caller.
- *
- * OUT: vpp - vnode of created directory.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Fails with EINVAL if dvp is an extended attribute directory
- * (ZFS_XATTR set in its flags). On success *vpp is returned locked
- * with LK_EXCLUSIVE; on every failure path *vpp is NULL or untouched.
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- * vp - ctime|mtime|atime updated
- */
-static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- zfs_dirlock_t *dl;
- uint64_t zoid = 0;
- dmu_tx_t *tx;
- int error;
-
- ASSERT(vap->va_type == VDIR);
-
- ZFS_ENTER(zfsvfs);
-
- if (dzp->z_phys->zp_flags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL); /* returns before *vpp has been written */
- }
-top: /* restart point after dmu_tx_wait() on ERESTART */
- *vpp = NULL;
-
- /*
- * First make sure the new directory doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
- zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Add a new entry to the directory.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, SPA_MAXBLOCKSIZE);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl); /* entry lock is dropped before waiting or returning */
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Create new node.
- */
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
-
- /*
- * Now put new name in parent dir.
- */
- (void) zfs_link_create(dl, zp, tx, ZNEW); /* result deliberately discarded */
-
- *vpp = ZTOV(zp);
-
- zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
- dmu_tx_commit(tx);
-
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); /* new vnode is locked before the dirent lock is dropped */
-
- zfs_dirent_unlock(dl);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Remove a directory subdir entry. If the current working
- * directory is the same as the subdir to be removed, the
- * remove will fail.
- *
- * IN: dvp - vnode of directory to remove from.
- * name - name of directory to be removed.
- * cwd - vnode of current working directory.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Fails with ENOTDIR if the entry is not a directory and with EINVAL
- * if the entry's vnode is cwd.
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
-{
- znode_t *dzp = VTOZ(dvp);
- znode_t *zp;
- vnode_t *vp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
-top: /* restart point after dmu_tx_wait() on ERESTART */
- zp = NULL;
-
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
-
- if (error = zfs_zaccess_delete(dzp, zp, cr)) {
- goto out;
- }
-
- if (vp->v_type != VDIR) {
- error = ENOTDIR;
- goto out;
- }
-
- if (vp == cwd) {
- error = EINVAL;
- goto out;
- }
-
- vnevent_rmdir(vp);
-
- /*
- * Grab a lock on the directory to make sure that no one is
- * trying to add (or lookup) entries while we are removing it.
- */
- rw_enter(&zp->z_name_lock, RW_WRITER);
-
- /*
- * Grab a lock on the parent pointer to make sure we play well
- * with the treewalk and directory rename code.
- */
- rw_enter(&zp->z_parent_lock, RW_WRITER);
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
- dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- rw_exit(&zp->z_parent_lock); /* locks released in reverse order of acquisition */
- rw_exit(&zp->z_name_lock);
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
-#ifdef FREEBSD_NAMECACHE
- cache_purge(dvp);
-#endif
-
- error = zfs_link_destroy(dl, zp, tx, 0, NULL);
-
- if (error == 0) /* the removal is logged only if the link was destroyed */
- zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
-
- dmu_tx_commit(tx); /* committed whether or not zfs_link_destroy() succeeded */
-
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
-#ifdef FREEBSD_NAMECACHE
- cache_purge(vp);
-#endif
-out:
- zfs_dirent_unlock(dl);
-
- VN_RELE(vp);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Read as many directory entries as will fit into the provided
- * buffer from the given directory cursor position (specified in
- * the uio structure).
- *
- * IN: vp - vnode of directory to read.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * cr - credentials of caller.
- * ncookies - if non-NULL, a cookie array is allocated and the
- * number of cookies actually filled in is stored here.
- *
- * OUT: uio - updated offset and range, buffer filled.
- * eofp - set to true if end-of-file detected.
- * cookies - when ncookies is non-NULL, receives a malloc'ed
- * (M_TEMP) array holding the offset that follows each
- * returned entry; freed and reset to NULL on error.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Fails with EINVAL if the first iovec has zero length or if not even
- * one entry fits in the buffer, and with ENXIO on a malformed ZAP entry.
- *
- * Timestamps:
- * vp - atime updated
- *
- * Note that the low 4 bits of the cookie returned by zap is always zero.
- * This allows us to use the low range for "special" directory entries:
- * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
- * we use the offset 2 for the '.zfs' directory.
- *
- * NOTE(review): the cookie array is allocated when ncookies != NULL but
- * the error-path cleanup tests cookies != NULL and then writes through
- * both pointers; this presumes callers pass both or neither -- confirm.
- */
-/* ARGSUSED */
-static int
-zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
-{
- znode_t *zp = VTOZ(vp);
- iovec_t *iovp;
- dirent64_t *odp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os;
- caddr_t outbuf;
- size_t bufsize;
- zap_cursor_t zc;
- zap_attribute_t zap;
- uint_t bytes_wanted;
- uint64_t offset; /* must be unsigned; checks for < 1 */
- int local_eof;
- int outcount;
- int error;
- uint8_t prefetch;
- uint8_t type;
- int ncooks; /* cookie slots still unused; only set when ncookies != NULL */
- u_long *cooks = NULL; /* next free cookie slot, or NULL if no cookies wanted */
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * If we are not given an eof variable,
- * use a local one.
- */
- if (eofp == NULL)
- eofp = &local_eof;
-
- /*
- * Check for valid iov_len.
- */
- if (uio->uio_iov->iov_len <= 0) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Quit if directory has been removed (posix)
- */
- if ((*eofp = zp->z_unlinked) != 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- error = 0;
- os = zfsvfs->z_os;
- offset = uio->uio_loffset;
- prefetch = zp->z_zn_prefetch;
-
- /*
- * Initialize the iterator cursor.
- */
- if (offset <= 3) {
- /*
- * Start iteration from the beginning of the directory.
- */
- zap_cursor_init(&zc, os, zp->z_id);
- } else {
- /*
- * The offset is a serialized cursor.
- */
- zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
- }
-
- /*
- * Get space to change directory entries into fs independent format.
- * A temporary buffer is used unless the request is a single
- * UIO_SYSSPACE iovec, in which case entries are written straight
- * into the caller's buffer.
- */
- iovp = uio->uio_iov;
- bytes_wanted = iovp->iov_len;
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
- bufsize = bytes_wanted;
- outbuf = kmem_alloc(bufsize, KM_SLEEP);
- odp = (struct dirent64 *)outbuf;
- } else {
- bufsize = bytes_wanted;
- odp = (struct dirent64 *)iovp->iov_base; /* outbuf is left unset on this path */
- }
-
- if (ncookies != NULL) {
- /*
- * Minimum entry size is dirent size and 1 byte for a file name.
- * Dividing the residual by that size bounds the number of
- * entries, and hence cookies, that can be produced.
- */
- ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
- cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
- *cookies = cooks;
- *ncookies = ncooks;
- }
-
- /*
- * Transform to file-system independent format
- */
- outcount = 0;
- while (outcount < bytes_wanted) {
- ino64_t objnum;
- ushort_t reclen;
-
- /*
- * Special case `.', `..', and `.zfs'.
- */
- if (offset == 0) {
- (void) strcpy(zap.za_name, ".");
- objnum = zp->z_id;
- type = DT_DIR;
- } else if (offset == 1) {
- (void) strcpy(zap.za_name, "..");
- objnum = zp->z_phys->zp_parent;
- type = DT_DIR;
- } else if (offset == 2 && zfs_show_ctldir(zp)) {
- (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
- objnum = ZFSCTL_INO_ROOT;
- type = DT_DIR;
- } else {
- /*
- * Grab next entry.
- * ENOENT from the cursor means end of directory: it
- * sets *eofp and ends the loop normally. Any other
- * error skips the copy-out and goes to "update".
- */
- if (error = zap_cursor_retrieve(&zc, &zap)) {
- if ((*eofp = (error == ENOENT)) != 0)
- break;
- else
- goto update;
- }
-
- if (zap.za_integer_length != 8 ||
- zap.za_num_integers != 1) {
- cmn_err(CE_WARN, "zap_readdir: bad directory "
- "entry, obj = %lld, offset = %lld\n",
- (u_longlong_t)zp->z_id,
- (u_longlong_t)offset);
- error = ENXIO;
- goto update;
- }
-
- objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
- /*
- * MacOS X can extract the object type here such as:
- * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- */
- type = ZFS_DIRENT_TYPE(zap.za_first_integer);
- }
- reclen = DIRENT64_RECLEN(strlen(zap.za_name));
-
- /*
- * Will this entry fit in the buffer?
- */
- if (outcount + reclen > bufsize) {
- /*
- * Did we manage to fit anything in the buffer?
- */
- if (!outcount) {
- error = EINVAL;
- goto update;
- }
- break;
- }
- /*
- * Add this entry:
- */
- odp->d_ino = objnum;
- odp->d_reclen = reclen;
- odp->d_namlen = strlen(zap.za_name);
- (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
- odp->d_type = type;
- outcount += reclen;
- odp = (dirent64_t *)((intptr_t)odp + reclen); /* step to where the next record starts */
-
- ASSERT(outcount <= bufsize);
-
- /* Prefetch znode */
- if (prefetch)
- dmu_prefetch(os, objnum, 0, 0);
-
- /*
- * Move to the next entry, fill in the previous offset.
- * The condition mirrors the branch above that read from the
- * ZAP cursor; the special entries just count 0, 1, 2.
- */
- if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
- zap_cursor_advance(&zc);
- offset = zap_cursor_serialize(&zc);
- } else {
- offset += 1;
- }
-
- if (cooks != NULL) {
- *cooks++ = offset; /* cookie is the offset following this entry */
- ncooks--;
- KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
- }
- }
- zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
-
- /* Subtract unused cookies */
- if (ncookies != NULL)
- *ncookies -= ncooks;
-
- if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
- iovp->iov_base += outcount; /* entries were written in place; just advance the uio */
- iovp->iov_len -= outcount;
- uio->uio_resid -= outcount;
- } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
- /*
- * Reset the pointer.
- */
- offset = uio->uio_loffset;
- }
-
-update:
- zap_cursor_fini(&zc);
- if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
- kmem_free(outbuf, bufsize); /* same condition as the kmem_alloc() above */
-
- if (error == ENOENT)
- error = 0; /* end of directory is not reported as a failure */
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-
- uio->uio_loffset = offset;
- ZFS_EXIT(zfsvfs);
- if (error != 0 && cookies != NULL) {
- free(*cookies, M_TEMP);
- *cookies = NULL;
- *ncookies = 0;
- }
- return (error);
-}
-/*
- * Commit the intent log for a file.
- *
- * Calls zil_commit() on the filesystem's log with the znode's
- * z_last_itx and object id. syncflag and cr are not used.
- *
- * RETURN: 0 (always)
- */
-static int
-zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ZFS_ENTER(zfsvfs);
- zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Get the requested file attributes and place them in the provided
- * vattr structure.
- *
- * IN: vp - vnode of file.
- * vap - va_mask identifies requested attributes.
- * flags - [UNUSED]
- * cr - credentials of caller.
- *
- * OUT: vap - attribute values.
- *
- * RETURN: 0 if success
- * error code from zfs_zaccess() if the ACL is not trivial, the
- * caller is not the owner, and ACE_READ_ATTRIBUTES is denied
- *
- * Note that the attribute fields of vap are filled in before the
- * access check runs, so they are already populated when the check
- * fails, and that zfs_zaccess() is called with zp->z_lock held.
- */
-/* ARGSUSED */
-static int
-zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- znode_phys_t *pzp = zp->z_phys;
- uint32_t blksize;
- u_longlong_t nblocks;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Return all attributes. It's cheaper to provide the answer
- * than to determine whether we were asked the question.
- */
- mutex_enter(&zp->z_lock);
-
- vap->va_type = IFTOVT(pzp->zp_mode);
- vap->va_mode = pzp->zp_mode & ~S_IFMT; /* permission bits only; type is in va_type */
- vap->va_uid = zp->z_phys->zp_uid;
- vap->va_gid = zp->z_phys->zp_gid;
- vap->va_nodeid = zp->z_id;
- vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX); /* nlink_t limit! */
- vap->va_size = pzp->zp_size;
- vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
- vap->va_seq = zp->z_seq;
- vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
-
- ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
- ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
- ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
- ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
-
- /*
- * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
- * Also, if we are the owner don't bother, since owner should
- * always be allowed to read basic attributes of file.
- */
- if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
- (zp->z_phys->zp_uid != crgetuid(cr))) {
- if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
- mutex_exit(&zp->z_lock);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- }
-
- mutex_exit(&zp->z_lock);
-
- dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); /* done after z_lock is dropped */
- vap->va_blksize = blksize;
- vap->va_bytes = nblocks << 9; /* nblocks * 512 */
-
- if (zp->z_blksz == 0) {
- /*
- * Block size hasn't been set; suggest maximal I/O transfers.
- */
- vap->va_blksize = zfsvfs->z_max_blksz;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
- * Set the file attributes to the values contained in the
- * vattr structure.
- *
- * IN: vp - vnode of file to be modified.
- * vap - new attribute values.
- * flags - ATTR_UTIME set if non-default time values provided.
- * cr - credentials of caller.
- * ct - not referenced in this function.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Returns 0 without doing anything when va_mask is 0; EINVAL when
- * va_mask contains AT_NOSET bits or AT_SIZE is requested on something
- * other than a regular file or FIFO; EISDIR when AT_SIZE is requested
- * on a directory; EROFS when VFS_RDONLY is set on the vfs.
- *
- * When the transaction assignment returns ERESTART under TXG_NOWAIT
- * the function jumps back to "top", so the permission checks and, for
- * AT_SIZE, the zfs_freesp() call are performed again.
- *
- * Timestamps:
- * vp - ctime updated, mtime updated if size changed.
- */
-/* ARGSUSED */
-static int
-zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- struct znode *zp = VTOZ(vp);
- znode_phys_t *pzp = zp->z_phys;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- dmu_tx_t *tx;
- vattr_t oldva;
- uint_t mask = vap->va_mask;
- uint_t saved_mask;
- int trim_mask = 0; /* attribute bits already authorized by an ACL check */
- uint64_t new_mode;
- znode_t *attrzp; /* znode of the xattr object, if uid/gid must be mirrored to it */
- int need_policy = FALSE; /* nonzero when secpolicy_vnode_setattr() must decide */
- int err;
-
- if (mask == 0)
- return (0);
-
- if (mask & AT_NOSET)
- return (EINVAL);
-
- if (mask & AT_SIZE && vp->v_type == VDIR)
- return (EISDIR);
-
- if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
- return (EINVAL);
-
- ZFS_ENTER(zfsvfs);
-
-top: /* restart point after dmu_tx_wait() on ERESTART */
- attrzp = NULL;
-
- if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- ZFS_EXIT(zfsvfs);
- return (EROFS);
- }
-
- /*
- * First validate permissions
- */
-
- if (mask & AT_SIZE) {
- err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- /*
- * XXX - Note, we are not providing any open
- * mode flags here (like FNDELAY), so we may
- * block if there are locks present... this
- * should be addressed in openat().
- */
- do {
- err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
- /* NB: we already did dmu_tx_wait() if necessary */
- } while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- if (mask & (AT_ATIME|AT_MTIME))
- need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr); /* nonzero result defers to the policy check */
-
- if (mask & (AT_UID|AT_GID)) {
- int idmask = (mask & (AT_UID|AT_GID));
- int take_owner;
- int take_group;
-
- /*
- * NOTE: even if a new mode is being set,
- * we may clear S_ISUID/S_ISGID bits.
- */
-
- if (!(mask & AT_MODE))
- vap->va_mode = pzp->zp_mode;
-
- /*
- * Take ownership or chgrp to group we are a member of
- */
-
- take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
- take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
-
- /*
- * If both AT_UID and AT_GID are set then take_owner and
- * take_group must both be set in order to allow taking
- * ownership.
- *
- * Otherwise, send the check through secpolicy_vnode_setattr()
- *
- */
-
- if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
- ((idmask == AT_UID) && take_owner) ||
- ((idmask == AT_GID) && take_group)) {
- if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
- /*
- * Remove setuid/setgid for non-privileged users
- */
- secpolicy_setid_clear(vap, cr);
- trim_mask = (mask & (AT_UID|AT_GID));
- } else {
- need_policy = TRUE;
- }
- } else {
- need_policy = TRUE;
- }
- }
-
- mutex_enter(&zp->z_lock);
- oldva.va_mode = pzp->zp_mode; /* snapshot of current values for the policy checks below */
- oldva.va_uid = zp->z_phys->zp_uid;
- oldva.va_gid = zp->z_phys->zp_gid;
- mutex_exit(&zp->z_lock);
-
- if (mask & AT_MODE) {
- if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
- err = secpolicy_setid_setsticky_clear(vp, vap,
- &oldva, cr);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- trim_mask |= AT_MODE;
- } else {
- need_policy = TRUE;
- }
- }
-
- if (need_policy) {
- /*
- * If trim_mask is set then take ownership
- * has been granted or write_acl is present and user
- * has the ability to modify mode. In that case remove
- * UID|GID and/or MODE from mask so that
- * secpolicy_vnode_setattr() doesn't revoke it.
- *
- * NOTE(review): if secpolicy_vnode_setattr() fails, the
- * function returns before the saved bits are OR-ed back,
- * leaving the caller's vap->va_mask trimmed -- confirm that
- * callers do not rely on va_mask after an error.
- */
-
- if (trim_mask) {
- saved_mask = vap->va_mask;
- vap->va_mask &= ~trim_mask;
-
- }
- err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
- (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
- if (err) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- if (trim_mask)
- vap->va_mask |= saved_mask; /* put back the bits trimmed above */
- }
-
- /*
- * secpolicy_vnode_setattr, or take ownership may have
- * changed va_mask
- */
- mask = vap->va_mask;
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
-
- if (mask & AT_MODE) {
- uint64_t pmode = pzp->zp_mode;
-
- new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); /* keep the file type, replace the rest */
-
- if (zp->z_phys->zp_acl.z_acl_extern_obj)
- dmu_tx_hold_write(tx,
- pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
- else
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
- }
-
- if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
- err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
- if (err) {
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- dmu_tx_hold_bonus(tx, attrzp->z_id);
- }
-
- err = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (err) {
- if (attrzp)
- VN_RELE(ZTOV(attrzp)); /* hold is re-acquired after "goto top" if still needed */
- if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
-
- dmu_buf_will_dirty(zp->z_dbuf, tx);
-
- /*
- * Set each attribute requested.
- * We group settings according to the locks they need to acquire.
- *
- * Note: you cannot set ctime directly, although it will be
- * updated as a side-effect of calling this function.
- */
-
- mutex_enter(&zp->z_lock);
-
- if (mask & AT_MODE) {
- err = zfs_acl_chmod_setattr(zp, new_mode, tx);
- ASSERT3U(err, ==, 0);
- }
-
- if (attrzp)
- mutex_enter(&attrzp->z_lock); /* taken while zp->z_lock is already held */
-
- if (mask & AT_UID) {
- zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
- if (attrzp) {
- attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
- }
- }
-
- if (mask & AT_GID) {
- zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
- if (attrzp)
- attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
- }
-
- if (attrzp)
- mutex_exit(&attrzp->z_lock);
-
- if (mask & AT_ATIME)
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
- if (mask & AT_MTIME)
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
- if (mask & AT_SIZE)
- zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
- else if (mask != 0)
- zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
-
- if (mask != 0)
- zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
-
- mutex_exit(&zp->z_lock);
-
- if (attrzp)
- VN_RELE(ZTOV(attrzp));
-
- dmu_tx_commit(tx);
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-/*
- * One node of the singly linked list of locks built by
- * zfs_rename_lock() and torn down by zfs_rename_unlock().
- */
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
-/*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
- *
- * Walks the list starting at *zlpp; for each node it releases the held
- * znode's vnode (if any), exits the rwlock, and frees the node. On
- * return *zlpp is NULL.
- */
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
-
- while ((zl = *zlpp) != NULL) {
- if (zl->zl_znode != NULL)
- VN_RELE(ZTOV(zl->zl_znode));
- rw_exit(zl->zl_rwlock);
- *zlpp = zl->zl_next; /* unlink the node before freeing it */
- kmem_free(zl, sizeof (*zl));
- }
-}
-
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- *
- * IN: szp - znode being renamed.
- * tdzp - target directory; the walk starts here.
- * sdzp - source directory; the walk stops on reaching it.
- *
- * OUT: zlpp - list of locks taken, newest first.
- *
- * RETURN: 0 if the walk reached the root or sdzp
- * EINVAL if szp was found on the path (target is a descendant)
- * error code from zfs_zget() if a parent could not be obtained
- *
- * Every lock acquired is pushed onto *zlpp before any of the return
- * statements inside the loop, so locks remain on the list on error
- * returns as well and must be dropped with zfs_rename_unlock().
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t *oidp = &zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
-
- /*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
- */
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- * (rw is RW_READER only after at least one
- * pass, so zl is the node most recently pushed
- * and therefore the head of the list.)
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = &zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
- }
- }
-
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp; /* push onto the front of the list */
- *zlpp = zl;
-
- if (*oidp == szp->z_id) /* We're a descendant of szp */
- return (EINVAL);
-
- if (*oidp == rootid) /* We've hit the top */
- return (0);
-
- if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp; /* recorded so zfs_rename_unlock() releases it */
- }
- oidp = &zp->z_phys->zp_parent;
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
-
- } while (zp->z_id != sdzp->z_id);
-
- return (0);
-}
-
-/*
- * Move an entry from the provided source directory to the target
- * directory. Change the entry name as indicated.
- *
- * IN: sdvp - Source directory containing the "old entry".
- * snm - Old entry name.
- * tdvp - Target directory to contain the "new entry".
- * tnm - New entry name.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Fails with EXDEV if the two directories are on different vfs
- * instances, with EINVAL if exactly one of them has ZFS_XATTR set or
- * if the source name is "." or ".." or the target name is "..", with
- * ENOTDIR or EISDIR if an existing target is not of the same kind
- * (directory or non-directory) as the source. Returns 0 with no
- * action if source and target name the same entry or the same object.
- *
- * Timestamps:
- * sdvp,tdvp - ctime|mtime updated
- */
-static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
-{
- znode_t *tdzp, *szp, *tzp;
- znode_t *sdzp = VTOZ(sdvp);
- zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- vnode_t *realvp;
- zfs_dirlock_t *sdl, *tdl;
- dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr, error;
-
- ZFS_ENTER(zfsvfs);
-
- /*
- * Make sure we have the real vp for the target directory.
- */
- if (VOP_REALVP(tdvp, &realvp) == 0)
- tdvp = realvp;
-
- if (tdvp->v_vfsp != sdvp->v_vfsp) {
- ZFS_EXIT(zfsvfs);
- return (EXDEV);
- }
-
- tdzp = VTOZ(tdvp);
-top: /* restart point after dmu_tx_wait() on ERESTART */
- szp = NULL;
- tzp = NULL;
- zl = NULL;
-
- /*
- * This is to prevent the creation of links into attribute space
- * by renaming a linked file into/outof an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
- */
- if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
- (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- cmp = strcmp(snm, tnm);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- }
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
- terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
- } else {
- terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
- }
-
- if (serr) {
- /*
- * Source entry invalid or not there.
- * Undo the target lock if that one succeeded.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- VN_RELE(ZTOV(tzp));
- }
- if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
- serr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- zfs_dirent_unlock(sdl); /* source lock succeeded, so undo it */
- VN_RELE(ZTOV(szp));
- if (strcmp(tnm, "..") == 0)
- terr = EINVAL;
- ZFS_EXIT(zfsvfs);
- return (terr);
- }
-
- /*
- * Must have write access at the source to remove the old entry
- * and write access at the target to create the new entry.
- * Note that if target and source are the same, this can be
- * done in a single check.
- */
-
- if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto out;
-
- if (ZTOV(szp)->v_type == VDIR) {
- /*
- * Check to make sure rename is valid.
- * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
- * Locks taken by zfs_rename_lock() are left on zl even
- * when it fails; they are dropped at "out".
- */
- if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
- goto out;
- }
-
- /*
- * Does target exist?
- */
- if (tzp) {
- /*
- * Source and target must be the same type.
- */
- if (ZTOV(szp)->v_type == VDIR) {
- if (ZTOV(tzp)->v_type != VDIR) {
- error = ENOTDIR;
- goto out;
- }
- } else {
- if (ZTOV(tzp)->v_type == VDIR) {
- error = EISDIR;
- goto out;
- }
- }
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
- }
-
- vnevent_rename_src(ZTOV(szp));
- if (tzp)
- vnevent_rename_dest(ZTOV(tzp));
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
- dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
- dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
- dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
- if (sdzp != tdzp)
- dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- if (tzp)
- dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
- dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top; /* all locks and holds were dropped above */
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
-
- if (error == 0) { /* error is 0 here when there was no target to remove */
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
- if (error == 0) {
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- ASSERT(error == 0);
- zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
- }
-#ifdef FREEBSD_NAMECACHE
- if (error == 0) {
- cache_purge(sdvp);
- cache_purge(tdvp);
- }
-#endif
- }
-
- dmu_tx_commit(tx); /* committed on both success and failure of the link updates */
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
-
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
-
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*
- * Insert the indicated symbolic reference entry into the directory.
- *
- * IN: dvp - Directory to contain new symbolic link.
- * link - Name for new symlink entry.
- * vap - Attributes of new entry.
- * target - Target path of new symlink.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * dvp - ctime|mtime updated
- */
-static int
-zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
-{
- znode_t *zp, *dzp = VTOZ(dvp);
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- uint64_t zoid;
- int len = strlen(link);
- int error;
-
- ASSERT(vap->va_type == VLNK);
-
- ZFS_ENTER(zfsvfs);
-top:
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (len > MAXPATHLEN) {
- ZFS_EXIT(zfsvfs);
- return (ENAMETOOLONG);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
- dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
- /*
- * Create a new object for the symlink.
- * Put the link content into bonus buffer if it will fit;
- * otherwise, store it just like any other file data.
- */
- zoid = 0;
- if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
- if (len != 0)
- bcopy(link, zp->z_phys + 1, len);
- } else {
- dmu_buf_t *dbp;
-
- zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
-
- /*
- * Nothing can access the znode yet so no locking needed
- * for growing the znode's blocksize.
- */
- zfs_grow_blocksize(zp, len, tx);
-
- VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
- dmu_buf_will_dirty(dbp, tx);
-
- ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp, FTAG);
- }
- zp->z_phys->zp_size = len;
-
- /*
- * Insert the new object into the directory.
- */
- (void) zfs_link_create(dl, zp, tx, ZNEW);
-out:
- if (error == 0) {
- zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
- *vpp = ZTOV(zp);
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- }
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Return, in the buffer contained in the provided uio structure,
- * the symbolic path referred to by vp.
- *
- * IN: vp - vnode of symbolic link.
- * uoip - structure to contain the link path.
- * cr - credentials of caller.
- *
- * OUT: uio - structure to contain the link path.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- size_t bufsz;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- bufsz = (size_t)zp->z_phys->zp_size;
- if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
- error = uiomove(zp->z_phys + 1,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- } else {
- dmu_buf_t *dbp;
- error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
- if (error) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- error = uiomove(dbp->db_data,
- MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp, FTAG);
- }
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Insert a new entry into directory tdvp referencing svp.
- *
- * IN: tdvp - Directory to contain new entry.
- * svp - vnode of new entry.
- * name - name of new entry.
- * cr - credentials of caller.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * tdvp - ctime|mtime updated
- * svp - ctime updated
- */
-/* ARGSUSED */
-static int
-zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
-{
- znode_t *dzp = VTOZ(tdvp);
- znode_t *tzp, *szp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- zfs_dirlock_t *dl;
- dmu_tx_t *tx;
- vnode_t *realvp;
- int error;
-
- ASSERT(tdvp->v_type == VDIR);
-
- ZFS_ENTER(zfsvfs);
-
- if (VOP_REALVP(svp, &realvp) == 0)
- svp = realvp;
-
- if (svp->v_vfsp != tdvp->v_vfsp) {
- ZFS_EXIT(zfsvfs);
- return (EXDEV);
- }
-
- szp = VTOZ(svp);
-top:
- /*
- * We do not support links between attributes and non-attributes
- * because of the potential security risk of creating links
- * into "normal" file space in order to circumvent restrictions
- * imposed in attribute space.
- */
- if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
- (dzp->z_phys->zp_flags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (EINVAL);
- }
-
- /*
- * POSIX dictates that we return EPERM here.
- * Better choices include ENOTSUP or EISDIR.
- */
- if (svp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (EPERM);
- }
-
- if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
- secpolicy_basic_link(cr) != 0) {
- ZFS_EXIT(zfsvfs);
- return (EPERM);
- }
-
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * Attempt to lock directory; fail if entry already exists.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, szp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- error = zfs_link_create(dl, szp, tx, 0);
-
- if (error == 0)
- zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
-
- dmu_tx_commit(tx);
-
- zfs_dirent_unlock(dl);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-void
-zfs_inactive(vnode_t *vp, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- rw_enter(&zfsvfs->z_um_lock, RW_READER);
- if (zfsvfs->z_unmounted2) {
- ASSERT(zp->z_dbuf_held == 0);
-
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- vp->v_count = 0; /* count arrives as 1 */
- VI_UNLOCK(vp);
- if (zp->z_dbuf == NULL) {
- mutex_exit(&zp->z_lock);
- zfs_znode_free(zp);
- } else {
- mutex_exit(&zp->z_lock);
- }
- rw_exit(&zfsvfs->z_um_lock);
- VFS_RELE(zfsvfs->z_vfs);
- return;
- }
-
- if (zp->z_atime_dirty && zp->z_unlinked == 0) {
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
- dmu_tx_hold_bonus(tx, zp->z_id);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_buf_will_dirty(zp->z_dbuf, tx);
- mutex_enter(&zp->z_lock);
- zp->z_atime_dirty = 0;
- mutex_exit(&zp->z_lock);
- dmu_tx_commit(tx);
- }
- }
-
- zfs_zinactive(zp);
- rw_exit(&zfsvfs->z_um_lock);
-}
-
-CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
-CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
-
-static int
-zfs_fid(vnode_t *vp, fid_t *fidp)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint32_t gen = (uint32_t)zp->z_phys->zp_gen;
- uint64_t object = zp->z_id;
- zfid_short_t *zfid;
- int size, i;
-
- ZFS_ENTER(zfsvfs);
-
- size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
- fidp->fid_len = size;
-
- zfid = (zfid_short_t *)fidp;
-
- zfid->zf_len = size;
-
- for (i = 0; i < sizeof (zfid->zf_object); i++)
- zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
- /* Must have a non-zero generation number to distinguish from .zfs */
- if (gen == 0)
- gen = 1;
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
- zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
-
- if (size == LONG_FID_LEN) {
- uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
- zfid_long_t *zlfid;
-
- zlfid = (zfid_long_t *)fidp;
-
- for (i = 0; i < sizeof (zlfid->zf_setid); i++)
- zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
-
- /* XXX - this should be the generation number for the objset */
- for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
- zlfid->zf_setgen[i] = 0;
- }
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-static int
-zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
-{
- znode_t *zp, *xzp;
- zfsvfs_t *zfsvfs;
- zfs_dirlock_t *dl;
- int error;
-
- switch (cmd) {
- case _PC_LINK_MAX:
- *valp = INT_MAX;
- return (0);
-
- case _PC_FILESIZEBITS:
- *valp = 64;
- return (0);
-
-#if 0
- case _PC_XATTR_EXISTS:
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- *valp = 0;
- error = zfs_dirent_lock(&dl, zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED);
- if (error == 0) {
- zfs_dirent_unlock(dl);
- if (!zfs_dirempty(xzp))
- *valp = 1;
- VN_RELE(ZTOV(xzp));
- } else if (error == ENOENT) {
- /*
- * If there aren't extended attributes, it's the
- * same as having zero of them.
- */
- error = 0;
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-#endif
-
- case _PC_ACL_EXTENDED:
- *valp = 0; /* TODO */
- return (0);
-
- case _PC_MIN_HOLE_SIZE:
- *valp = (int)SPA_MINBLOCKSIZE;
- return (0);
-
- default:
- return (EOPNOTSUPP);
- }
-}
-
-#ifdef TODO
-/*ARGSUSED*/
-static int
-zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- error = zfs_getacl(zp, vsecp, cr);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-#endif /* TODO */
-
-#ifdef TODO
-/*ARGSUSED*/
-static int
-zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- error = zfs_setacl(zp, vsecp, cr);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* TODO */
-
-static int
-zfs_freebsd_open(ap)
- struct vop_open_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- int error;
-
- error = zfs_open(&vp, ap->a_mode, ap->a_cred);
- if (error == 0)
- vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
- return (error);
-}
-
-static int
-zfs_freebsd_close(ap)
- struct vop_close_args /* {
- struct vnode *a_vp;
- int a_fflag;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
-}
-
-static int
-zfs_freebsd_ioctl(ap)
- struct vop_ioctl_args /* {
- struct vnode *a_vp;
- u_long a_command;
- caddr_t a_data;
- int a_fflag;
- struct ucred *cred;
- struct thread *td;
- } */ *ap;
-{
-
- return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
- ap->a_fflag, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_read(ap)
- struct vop_read_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_write(ap)
- struct vop_write_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- int a_ioflag;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_access(ap)
- struct vop_access_args /* {
- struct vnode *a_vp;
- int a_mode;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
-}
-
-static int
-zfs_freebsd_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- char nm[NAME_MAX + 1];
-
- ASSERT(cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
-
- return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
- cnp->cn_cred, cnp->cn_thread));
-}
-
-static int
-zfs_freebsd_create(ap)
- struct vop_create_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vattr_t *vap = ap->a_vap;
- int mode;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- vattr_init_mask(vap);
- mode = vap->va_mode & ALLPERMS;
-
- return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
- ap->a_vpp, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_remove(ap)
- struct vop_remove_args /* {
- struct vnode *a_dvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
-
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
-
- return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_mkdir(ap)
- struct vop_mkdir_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
-{
- vattr_t *vap = ap->a_vap;
-
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
-
- vattr_init_mask(vap);
-
- return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
- ap->a_cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_rmdir(ap)
- struct vop_rmdir_args /* {
- struct vnode *a_dvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_readdir(ap)
- struct vop_readdir_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- int *a_eofflag;
- int *a_ncookies;
- u_long **a_cookies;
- } */ *ap;
-{
-
- return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
- ap->a_ncookies, ap->a_cookies));
-}
-
-static int
-zfs_freebsd_fsync(ap)
- struct vop_fsync_args /* {
- struct vnode *a_vp;
- int a_waitfor;
- struct thread *a_td;
- } */ *ap;
-{
-
- vop_stdfsync(ap);
- return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
-}
-
-static int
-zfs_freebsd_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
-
- return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
-}
-
-static int
-zfs_freebsd_setattr(ap)
- struct vop_setattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vattr_t *vap = ap->a_vap;
-
- /* No support for FreeBSD's chflags(2). */
- if (vap->va_flags != VNOVAL)
- return (EOPNOTSUPP);
-
- vattr_init_mask(vap);
- vap->va_mask &= ~AT_NOSET;
-
- return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
-}
-
-static int
-zfs_freebsd_rename(ap)
- struct vop_rename_args /* {
- struct vnode *a_fdvp;
- struct vnode *a_fvp;
- struct componentname *a_fcnp;
- struct vnode *a_tdvp;
- struct vnode *a_tvp;
- struct componentname *a_tcnp;
- } */ *ap;
-{
- vnode_t *fdvp = ap->a_fdvp;
- vnode_t *fvp = ap->a_fvp;
- vnode_t *tdvp = ap->a_tdvp;
- vnode_t *tvp = ap->a_tvp;
- int error;
-
- ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
- ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
-
- error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
- ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
-
- if (tdvp == tvp)
- VN_RELE(tdvp);
- else
- VN_URELE(tdvp);
- if (tvp)
- VN_URELE(tvp);
- VN_RELE(fdvp);
- VN_RELE(fvp);
-
- return (error);
-}
-
-static int
-zfs_freebsd_symlink(ap)
- struct vop_symlink_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- char *a_target;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
- vattr_t *vap = ap->a_vap;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
- vattr_init_mask(vap);
-
- return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
- ap->a_target, cnp->cn_cred, cnp->cn_thread));
-}
-
-static int
-zfs_freebsd_readlink(ap)
- struct vop_readlink_args /* {
- struct vnode *a_vp;
- struct uio *a_uio;
- struct ucred *a_cred;
- } */ *ap;
-{
-
- return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
-}
-
-static int
-zfs_freebsd_link(ap)
- struct vop_link_args /* {
- struct vnode *a_tdvp;
- struct vnode *a_vp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- struct componentname *cnp = ap->a_cnp;
-
- ASSERT(cnp->cn_flags & SAVENAME);
-
- return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
-}
-
-static int
-zfs_freebsd_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
-
- zfs_inactive(vp, ap->a_td->td_ucred);
- return (0);
-}
-
-static int
-zfs_freebsd_reclaim(ap)
- struct vop_reclaim_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs;
- int rele = 1;
-
- ASSERT(zp != NULL);
-
- /*
- * Destroy the vm object and flush associated pages.
- */
- vnode_destroy_vobject(vp);
-
- mutex_enter(&zp->z_lock);
- ASSERT(zp->z_phys);
- ASSERT(zp->z_dbuf_held);
- zfsvfs = zp->z_zfsvfs;
- if (!zp->z_unlinked) {
- zp->z_dbuf_held = 0;
- ZTOV(zp) = NULL;
- mutex_exit(&zp->z_lock);
- dmu_buf_rele(zp->z_dbuf, NULL);
- } else {
- mutex_exit(&zp->z_lock);
- }
- VI_LOCK(vp);
- if (vp->v_count > 0)
- rele = 0;
- vp->v_data = NULL;
- ASSERT(vp->v_holdcnt >= 1);
- VI_UNLOCK(vp);
- if (!zp->z_unlinked && rele)
- VFS_RELE(zfsvfs->z_vfs);
- return (0);
-}
-
-static int
-zfs_freebsd_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
-{
-
- return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
-}
-
-static int
-zfs_freebsd_pathconf(ap)
- struct vop_pathconf_args /* {
- struct vnode *a_vp;
- int a_name;
- register_t *a_retval;
- } */ *ap;
-{
- ulong_t val;
- int error;
-
- error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
- if (error == 0)
- *ap->a_retval = val;
- else if (error == EOPNOTSUPP)
- error = vop_stdpathconf(ap);
- return (error);
-}
-
-/*
- * Advisory record locking support
- */
-static int
-zfs_freebsd_advlock(ap)
- struct vop_advlock_args /* {
- struct vnode *a_vp;
- caddr_t a_id;
- int a_op;
- struct flock *a_fl;
- int a_flags;
- } */ *ap;
-{
- znode_t *zp = VTOZ(ap->a_vp);
-
- return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
-}
-
-/*
- * Advisory record locking support
- */
-static int
-zfs_freebsd_advlockasync(ap)
- struct vop_advlockasync_args /* {
- struct vnode *a_vp;
- caddr_t a_id;
- int a_op;
- struct flock *a_fl;
- int a_flags;
- struct task *a_task;
- } */ *ap;
-{
- znode_t *zp = VTOZ(ap->a_vp);
-
- return (lf_advlockasync(ap, &(zp->z_lockf), zp->z_phys->zp_size));
-}
-
-struct vop_vector zfs_vnodeops;
-struct vop_vector zfs_fifoops;
-
-struct vop_vector zfs_vnodeops = {
- .vop_default = &default_vnodeops,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_access = zfs_freebsd_access,
-#ifdef FREEBSD_NAMECACHE
- .vop_lookup = vfs_cache_lookup,
- .vop_cachedlookup = zfs_freebsd_lookup,
-#else
- .vop_lookup = zfs_freebsd_lookup,
-#endif
- .vop_getattr = zfs_freebsd_getattr,
- .vop_setattr = zfs_freebsd_setattr,
- .vop_create = zfs_freebsd_create,
- .vop_mknod = zfs_freebsd_create,
- .vop_mkdir = zfs_freebsd_mkdir,
- .vop_readdir = zfs_freebsd_readdir,
- .vop_fsync = zfs_freebsd_fsync,
- .vop_open = zfs_freebsd_open,
- .vop_close = zfs_freebsd_close,
- .vop_rmdir = zfs_freebsd_rmdir,
- .vop_ioctl = zfs_freebsd_ioctl,
- .vop_link = zfs_freebsd_link,
- .vop_symlink = zfs_freebsd_symlink,
- .vop_readlink = zfs_freebsd_readlink,
- .vop_read = zfs_freebsd_read,
- .vop_write = zfs_freebsd_write,
- .vop_remove = zfs_freebsd_remove,
- .vop_rename = zfs_freebsd_rename,
- .vop_advlock = zfs_freebsd_advlock,
- .vop_advlockasync = zfs_freebsd_advlockasync,
- .vop_pathconf = zfs_freebsd_pathconf,
- .vop_bmap = VOP_EOPNOTSUPP,
- .vop_fid = zfs_freebsd_fid,
-};
-
-struct vop_vector zfs_fifoops = {
- .vop_default = &fifo_specops,
- .vop_fsync = VOP_PANIC,
- .vop_access = zfs_freebsd_access,
- .vop_getattr = zfs_freebsd_getattr,
- .vop_inactive = zfs_freebsd_inactive,
- .vop_read = VOP_PANIC,
- .vop_reclaim = zfs_freebsd_reclaim,
- .vop_setattr = zfs_freebsd_setattr,
- .vop_write = VOP_PANIC,
- .vop_fid = zfs_freebsd_fid,
-};
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
deleted file mode 100644
index 46e501c..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Portions Copyright 2007 Jeremy Teo */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef _KERNEL
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-#include <sys/systm.h>
-#include <sys/sysmacros.h>
-#include <sys/resource.h>
-#include <sys/mntent.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/errno.h>
-#include <sys/unistd.h>
-#include <sys/atomic.h>
-#include <sys/zfs_dir.h>
-#include <sys/zfs_acl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_rlock.h>
-#include <sys/fs/zfs.h>
-#endif /* _KERNEL */
-
-#include <sys/dmu.h>
-#include <sys/refcount.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/zfs_znode.h>
-#include <sys/refcount.h>
-
-/* Used by fstat(1). */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
- "sizeof(znode_t)");
-
-/*
- * Functions needed for userland (ie: libzpool) are not put under
- * #ifdef_KERNEL; the rest of the functions have dependencies
- * (such as VFS logic) that will not compile easily in userland.
- */
-#ifdef _KERNEL
-struct kmem_cache *znode_cache = NULL;
-
-/*ARGSUSED*/
-static void
-znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
-{
- znode_t *zp = user_ptr;
- vnode_t *vp;
-
- mutex_enter(&zp->z_lock);
- vp = ZTOV(zp);
- if (vp == NULL) {
- mutex_exit(&zp->z_lock);
- zfs_znode_free(zp);
- } else if (vp->v_count == 0) {
- ZTOV(zp) = NULL;
- vhold(vp);
- mutex_exit(&zp->z_lock);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- vrecycle(vp, curthread);
- VOP_UNLOCK(vp, 0);
- vdrop(vp);
- zfs_znode_free(zp);
- } else {
- /* signal force unmount that this znode can be freed */
- zp->z_dbuf = NULL;
- mutex_exit(&zp->z_lock);
- }
-}
-
-extern struct vop_vector zfs_vnodeops;
-extern struct vop_vector zfs_fifoops;
-
-/*
- * XXX: We cannot use this function as a cache constructor, because
- * there is one global cache for all file systems and we need
- * to pass vfsp here, which is not possible, because argument
- * 'cdrarg' is defined at kmem_cache_create() time.
- */
-static int
-zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
-{
- znode_t *zp = buf;
- vnode_t *vp;
- vfs_t *vfsp = cdrarg;
- int error;
-
- if (cdrarg != NULL) {
- error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
- ASSERT(error == 0);
- zp->z_vnode = vp;
- vp->v_data = (caddr_t)zp;
- VN_LOCK_AREC(vp);
- VN_LOCK_ASHARE(vp);
- } else {
- zp->z_vnode = NULL;
- }
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
- mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
-
- mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zp->z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
-
- zp->z_dbuf_held = 0;
- zp->z_dirlocks = 0;
- zp->z_lockf = NULL;
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-zfs_znode_cache_destructor(void *buf, void *cdarg)
-{
- znode_t *zp = buf;
-
- ASSERT(zp->z_dirlocks == 0);
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_map_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
- mutex_destroy(&zp->z_acl_lock);
- mutex_destroy(&zp->z_range_lock);
- avl_destroy(&zp->z_range_avl);
-
- ASSERT(zp->z_dbuf_held == 0);
-}
-
-void
-zfs_znode_init(void)
-{
- /*
- * Initialize zcache
- */
- ASSERT(znode_cache == NULL);
- znode_cache = kmem_cache_create("zfs_znode_cache",
- sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
- zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
-}
-
-void
-zfs_znode_fini(void)
-{
- /*
- * Cleanup zcache
- */
- if (znode_cache)
- kmem_cache_destroy(znode_cache);
- znode_cache = NULL;
-}
-
-/*
- * zfs_init_fs - Initialize the zfsvfs struct and the file system
- * incore "master" object. Verify version compatibility.
- */
-int
-zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
-{
- objset_t *os = zfsvfs->z_os;
- uint64_t version = ZPL_VERSION;
- int i, error;
- dmu_object_info_t doi;
- uint64_t fsid_guid;
-
- *zpp = NULL;
-
- /*
- * XXX - hack to auto-create the pool root filesystem at
- * the first attempted mount.
- */
- if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
- dmu_tx_t *tx = dmu_tx_create(os);
-
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
- error = dmu_tx_assign(tx, TXG_WAIT);
- ASSERT3U(error, ==, 0);
- zfs_create_fs(os, cr, tx);
- dmu_tx_commit(tx);
- }
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
- &version);
- if (error) {
- return (error);
- } else if (version != ZPL_VERSION) {
- (void) printf("Mismatched versions: File system "
- "is version %lld on-disk format, which is "
- "incompatible with this software version %lld!",
- (u_longlong_t)version, ZPL_VERSION);
- return (ENOTSUP);
- }
-
- /*
- * The fsid is 64 bits, composed of an 8-bit fs type, which
- * separates our fsid from any other filesystem types, and a
- * 56-bit objset unique ID. The objset unique ID is unique to
- * all objsets open on this system, provided by unique_create().
- * The 8-bit fs type must be put in the low bits of fsid[1]
- * because that's where other Solaris filesystems put it.
- */
- fsid_guid = dmu_objset_fsid_guid(os);
- ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
- zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
- zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
- zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
- &zfsvfs->z_root);
- if (error)
- return (error);
- ASSERT(zfsvfs->z_root != 0);
-
- /*
- * Create the per mount vop tables.
- */
-
- /*
- * Initialize zget mutex's
- */
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
-
- error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
- if (error)
- return (error);
- ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
-
- error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
- &zfsvfs->z_unlinkedobj);
- if (error)
- return (error);
-
- return (0);
-}
-
-/*
- * define a couple of values we need available
- * for both 64 and 32 bit environments.
- */
-#ifndef NBITSMINOR64
-#define NBITSMINOR64 32
-#endif
-#ifndef MAXMAJ64
-#define MAXMAJ64 0xffffffffUL
-#endif
-#ifndef MAXMIN64
-#define MAXMIN64 0xffffffffUL
-#endif
-#ifndef major
-#define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number */
-#endif
-#ifndef minor
-#define minor(x) ((int)((x)&0xffff00ff)) /* minor number */
-#endif
-
-/*
- * Create special expldev for ZFS private use.
- * Can't use standard expldev since it doesn't do
- * what we want. The standard expldev() takes a
- * dev32_t in LP64 and expands it to a long dev_t.
- * We need an interface that takes a dev32_t in ILP32
- * and expands it to a long dev_t.
- */
-static uint64_t
-zfs_expldev(dev_t dev)
-{
- return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
-}
-/*
- * Special cmpldev for ZFS private use.
- * Can't use standard cmpldev since it takes
- * a long dev_t and compresses it to dev32_t in
- * LP64. We need to do a compaction of a long dev_t
- * to a dev32_t in ILP32.
- */
-dev_t
-zfs_cmpldev(uint64_t dev)
-{
- return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
-}
-
-/*
- * Construct a new znode/vnode and intialize.
- *
- * This does not do a call to dmu_set_user() that is
- * up to the caller to do, in case you don't want to
- * return the znode
- */
-static znode_t *
-zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
-{
- znode_t *zp;
- vnode_t *vp;
- int error;
-
- zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
-
- ASSERT(zp->z_dirlocks == NULL);
-
- zp->z_phys = db->db_data;
- zp->z_zfsvfs = zfsvfs;
- zp->z_unlinked = 0;
- zp->z_atime_dirty = 0;
- zp->z_dbuf_held = 0;
- zp->z_mapcnt = 0;
- zp->z_last_itx = 0;
- zp->z_dbuf = db;
- zp->z_id = obj_num;
- zp->z_blksz = blksz;
- zp->z_seq = 0x7A4653;
- zp->z_sync_cnt = 0;
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_insert_tail(&zfsvfs->z_all_znodes, zp);
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- vp = ZTOV(zp);
- if (vp == NULL)
- return (zp);
-
- error = insmntque(vp, zfsvfs->z_vfs);
- KASSERT(error == 0, ("insmntque() failed: error %d", error));
-
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
- switch (vp->v_type) {
- case VDIR:
- zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
- break;
- case VFIFO:
- vp->v_op = &zfs_fifoops;
- break;
- }
-
- return (zp);
-}
-
-static void
-zfs_znode_dmu_init(znode_t *zp)
-{
- znode_t *nzp;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_buf_t *db = zp->z_dbuf;
-
- mutex_enter(&zp->z_lock);
-
- nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
-
- /*
- * there should be no
- * concurrent zgets on this object.
- */
- ASSERT3P(nzp, ==, NULL);
-
- /*
- * Slap on VROOT if we are the root znode
- */
- if (zp->z_id == zfsvfs->z_root) {
- ZTOV(zp)->v_flag |= VROOT;
- }
-
- ASSERT(zp->z_dbuf_held == 0);
- zp->z_dbuf_held = 1;
- VFS_HOLD(zfsvfs->z_vfs);
- mutex_exit(&zp->z_lock);
-}
-
-/*
- * Create a new DMU object to hold a zfs znode.
- *
- * IN: dzp - parent directory for new znode
- * vap - file attributes for new znode
- * tx - dmu transaction id for zap operations
- * cr - credentials of caller
- * flag - flags:
- * IS_ROOT_NODE - new object will be root
- * IS_XATTR - new object is an attribute
- * IS_REPLAY - intent log replay
- * bonuslen - extra bytes added to sizeof (znode_phys_t) when sizing
- * the object's bonus buffer
- *
- * OUT: oid - ID of created object
- * zpp - if non-NULL, receives the new in-core znode; if NULL the
- * in-core znode is released again before returning
- *
- */
-void
-zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
- uint_t flag, znode_t **zpp, int bonuslen)
-{
- dmu_buf_t *dbp;
- znode_phys_t *pzp;
- znode_t *zp;
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- timestruc_t now;
- uint64_t gen;
- int err;
-
- ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
-
- /*
- * During replay the object id, creation time and generation are
- * taken from the attributes instead of being generated here.
- */
- if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */
- *oid = vap->va_nodeid;
- flag |= IS_REPLAY;
- now = vap->va_ctime; /* see zfs_replay_create() */
- gen = vap->va_nblocks; /* ditto */
- } else {
- *oid = 0;
- gethrestime(&now);
- gen = dmu_tx_get_txg(tx);
- }
-
- /*
- * Create a new DMU object.
- */
- /*
- * There's currently no mechanism for pre-reading the blocks that will
- * be needed to allocate a new object, so we accept the small chance
- * that there will be an i/o error and we will fail one of the
- * assertions below.
- */
- /*
- * Directories are created as ZAP objects, everything else as plain
- * file objects; replay claims the recorded object id instead of
- * allocating a fresh one.
- */
- if (vap->va_type == VDIR) {
- if (flag & IS_REPLAY) {
- err = zap_create_claim(zfsvfs->z_os, *oid,
- DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- ASSERT3U(err, ==, 0);
- } else {
- *oid = zap_create(zfsvfs->z_os,
- DMU_OT_DIRECTORY_CONTENTS,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- }
- } else {
- if (flag & IS_REPLAY) {
- err = dmu_object_claim(zfsvfs->z_os, *oid,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- ASSERT3U(err, ==, 0);
- } else {
- *oid = dmu_object_alloc(zfsvfs->z_os,
- DMU_OT_PLAIN_FILE_CONTENTS, 0,
- DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
- }
- }
- VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
- dmu_buf_will_dirty(dbp, tx);
-
- /*
- * Initialize the znode physical data to zero.
- */
- ASSERT(dbp->db_size >= sizeof (znode_phys_t));
- bzero(dbp->db_data, dbp->db_size);
- pzp = dbp->db_data;
-
- /*
- * If this is the root, fix up the half-initialized parent pointer
- * to reference the just-allocated physical data area.
- */
- if (flag & IS_ROOT_NODE) {
- dzp->z_phys = pzp;
- dzp->z_id = *oid;
- }
-
- /*
- * If parent is an xattr, so am I.
- */
- if (dzp->z_phys->zp_flags & ZFS_XATTR)
- flag |= IS_XATTR;
-
- if (vap->va_type == VBLK || vap->va_type == VCHR) {
- pzp->zp_rdev = zfs_expldev(vap->va_rdev);
- }
-
- if (vap->va_type == VDIR) {
- pzp->zp_size = 2; /* contents ("." and "..") */
- pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
- }
-
- pzp->zp_parent = dzp->z_id;
- if (flag & IS_XATTR)
- pzp->zp_flags |= ZFS_XATTR;
-
- pzp->zp_gen = gen;
-
- ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
- ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
-
- /* Caller-supplied atime/mtime win over the creation time. */
- if (vap->va_mask & AT_ATIME) {
- ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
- } else {
- ZFS_TIME_ENCODE(&now, pzp->zp_atime);
- }
-
- if (vap->va_mask & AT_MTIME) {
- ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
- } else {
- ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
- }
-
- pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
- zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
-
- zfs_perm_init(zp, dzp, flag, vap, tx, cr);
-
- if (zpp) {
- kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
-
- /* Attach the znode to its dbuf under the per-object mutex. */
- mutex_enter(hash_mtx);
- zfs_znode_dmu_init(zp);
- mutex_exit(hash_mtx);
-
- *zpp = zp;
- } else {
- /*
- * The caller does not want the znode back: drop the bonus
- * hold and free the in-core znode.
- */
- if (ZTOV(zp) != NULL)
- ZTOV(zp)->v_count = 0;
- dmu_buf_rele(dbp, NULL);
- zfs_znode_free(zp);
- }
-}
-
-/*
- * Look up (or instantiate) the in-core znode for object obj_num.
- *
- * IN: zfsvfs - filesystem the object belongs to
- * obj_num - object number to look up
- *
- * OUT: zpp - set to NULL on entry, and to the znode on success
- *
- * RETURN: 0 if success
- * error from dmu_bonus_hold() if the bonus buffer cannot be held
- * EINVAL if the object's bonus buffer is not a znode
- * ENOENT if an existing in-core znode is marked unlinked
- */
-int
-zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
-{
- dmu_object_info_t doi;
- dmu_buf_t *db;
- znode_t *zp;
- vnode_t *vp;
- int err;
-
- *zpp = NULL;
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
-
- err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (err);
- }
-
- /* Reject objects whose bonus buffer cannot contain a znode_phys_t. */
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_ZNODE ||
- doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (EINVAL);
- }
-
- ASSERT(db->db_object == obj_num);
- ASSERT(db->db_offset == -1);
- ASSERT(db->db_data != NULL);
-
- /* A non-NULL dbuf user means an in-core znode already exists. */
- zp = dmu_buf_get_user(db);
-
- if (zp != NULL) {
- mutex_enter(&zp->z_lock);
-
- ASSERT3U(zp->z_id, ==, obj_num);
- if (zp->z_unlinked) {
- dmu_buf_rele(db, NULL);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (ENOENT);
- } else if (zp->z_dbuf_held) {
- /* The znode already owns a hold; drop the one taken above. */
- dmu_buf_rele(db, NULL);
- } else {
- /* Hand the hold taken above over to the znode. */
- zp->z_dbuf_held = 1;
- VFS_HOLD(zfsvfs->z_vfs);
- }
-
- if (ZTOV(zp) != NULL)
- VN_HOLD(ZTOV(zp));
- else {
- /* The znode has no vnode attached; build a fresh one. */
- err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
- &zp->z_vnode);
- /* NOTE(review): failure is only caught by ASSERT; vp is used below. */
- ASSERT(err == 0);
- vp = ZTOV(zp);
- vp->v_data = (caddr_t)zp;
- VN_LOCK_AREC(vp);
- VN_LOCK_ASHARE(vp);
- vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
- if (vp->v_type == VDIR)
- zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
- err = insmntque(vp, zfsvfs->z_vfs);
- KASSERT(err == 0, ("insmntque() failed: error %d", err));
- }
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- *zpp = zp;
- return (0);
- }
-
- /*
- * Not found create new znode/vnode
- */
- zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
- ASSERT3U(zp->z_id, ==, obj_num);
- zfs_znode_dmu_init(zp);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- *zpp = zp;
- return (0);
-}
-
-/*
- * Free the on-disk object backing zp (and its external ACL object, if it
- * has one) in transaction tx, then release the znode's bonus dbuf hold.
- */
-void
-zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
- /* A non-zero z_acl_extern_obj names a separate object to free too. */
- if (zp->z_phys->zp_acl.z_acl_extern_obj) {
- error = dmu_object_free(zfsvfs->z_os,
- zp->z_phys->zp_acl.z_acl_extern_obj, tx);
- ASSERT3U(error, ==, 0);
- }
- error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
- ASSERT3U(error, ==, 0);
- zp->z_dbuf_held = 0;
- ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
- /* The dbuf hold is dropped only after the object hold is released. */
- dmu_buf_rele(zp->z_dbuf, NULL);
-}
-
-/*
- * Handle a znode whose vnode is going inactive.
- *
- * If the vnode has picked up a new reference nothing is done. If the
- * znode is unlinked, the vnode is detached and recycled and the file is
- * removed via zfs_rmnode(). Otherwise the znode is left as it is.
- */
-void
-zfs_zinactive(znode_t *zp)
-{
- vnode_t *vp = ZTOV(zp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t z_id = zp->z_id;
-
- ASSERT(zp->z_dbuf_held && zp->z_phys);
-
- /*
- * Don't allow a zfs_zget() while we're trying to release this znode
- */
- ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
-
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- if (vp->v_count > 0) {
- /*
- * If the hold count is greater than zero, somebody has
- * obtained a new reference on this znode while we were
- * processing it here, so we are done.
- */
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- return;
- }
- VI_UNLOCK(vp);
-
- /*
- * If this was the last reference to a file with no links,
- * remove the file from the file system.
- */
- if (zp->z_unlinked) {
- /* Detach the vnode before dropping the locks. */
- ZTOV(zp) = NULL;
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- ASSERT(vp->v_count == 0);
- vrecycle(vp, curthread);
- zfs_rmnode(zp);
- VFS_RELE(zfsvfs->z_vfs);
- return;
- }
- ASSERT(zp->z_phys);
- ASSERT(zp->z_dbuf_held);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
-}
-
-/*
- * Unlink zp from its filesystem's list of all znodes (under
- * z_znodes_lock) and return it to the znode kmem cache.
- */
-void
-zfs_znode_free(znode_t *zp)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- mutex_enter(&zfsvfs->z_znodes_lock);
- list_remove(&zfsvfs->z_all_znodes, zp);
- mutex_exit(&zfsvfs->z_znodes_lock);
-
- kmem_cache_free(znode_cache, zp);
-}
-
-/*
- * Stamp the timestamps selected by flag (AT_ATIME, AT_MTIME, AT_CTIME)
- * with the current time. The caller must hold zp->z_lock.
- *
- * With a transaction the bonus buffer is dirtied in it, the pending-atime
- * marker is cleared and z_seq is bumped; without one the znode is only
- * marked as having a dirty atime.
- */
-void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
-	timestruc_t stamp;
-
-	ASSERT(MUTEX_HELD(&zp->z_lock));
-
-	gethrestime(&stamp);
-
-	if (tx == NULL) {
-		zp->z_atime_dirty = 1;
-	} else {
-		dmu_buf_will_dirty(zp->z_dbuf, tx);
-		zp->z_atime_dirty = 0;
-		zp->z_seq++;
-	}
-
-	if ((flag & AT_ATIME) != 0) {
-		ZFS_TIME_ENCODE(&stamp, zp->z_phys->zp_atime);
-	}
-	if ((flag & AT_MTIME) != 0) {
-		ZFS_TIME_ENCODE(&stamp, zp->z_phys->zp_mtime);
-	}
-	if ((flag & AT_CTIME) != 0) {
-		ZFS_TIME_ENCODE(&stamp, zp->z_phys->zp_ctime);
-	}
-}
-
-/*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- * 1 - Only the ACCESS time is ever updated outside of a transaction.
- * 2 - Multiple consecutive updates will be collapsed into a single
- * znode update by the transaction grouping semantics of the DMU.
- *
- * IN: zp - znode to stamp
- * flag - which of AT_ATIME, AT_MTIME, AT_CTIME to update
- * tx - open transaction, or NULL when outside a transaction
- *
- * This is the self-locking wrapper around zfs_time_stamper_locked().
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
- mutex_enter(&zp->z_lock);
- zfs_time_stamper_locked(zp, flag, tx);
- mutex_exit(&zp->z_lock);
-}
-
-/*
- * Grow the block size for a file.
- *
- * IN: zp - znode of file whose block size is to be grown.
- * size - requested block size
- * tx - open transaction.
- *
- * NOTE: this function assumes that the znode is write locked.
- *
- * This is a no-op when size does not exceed the current block size, when
- * the file already extends past its first block, or when the DMU answers
- * ENOTSUP.
- */
-void
-zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
-{
- int error;
- u_longlong_t dummy;
-
- if (size <= zp->z_blksz)
- return;
- /*
- * If the file size is already greater than the current blocksize,
- * we will not grow. If there is more than one block in a file,
- * the blocksize cannot change.
- */
- if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
- return;
-
- error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
- size, 0, tx);
- if (error == ENOTSUP)
- return;
- ASSERT3U(error, ==, 0);
-
- /* What blocksize did we actually get? */
- /* Only z_blksz is kept; the second output lands in dummy, unused. */
- dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
-}
-
-/*
- * Free space in a file.
- *
- * IN: zp - znode of file to free data in.
- * off - start of section to free.
- * len - length of section to free (0 => to EOF).
- * flag - current file open mode flags (not referenced in this body).
- * log - if true, update the timestamps and log a TX_TRUNCATE
- * record for this operation.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * The only failure reported is the one from dmu_tx_assign(); the result
- * of vinvalbuf() at the end is stored but not returned.
- */
-int
-zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
-{
- vnode_t *vp = ZTOV(zp);
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog = zfsvfs->z_log;
- rl_t *rl;
- uint64_t end = off + len;
- uint64_t size, new_blksz;
- int error;
-
- if (ZTOV(zp)->v_type == VFIFO)
- return (0);
-
- /*
- * If we will change zp_size then lock the whole file,
- * otherwise just lock the range being freed.
- */
- if (len == 0 || off + len > zp->z_phys->zp_size) {
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
- } else {
- rl = zfs_range_lock(zp, off, len, RL_WRITER);
- /* recheck, in case zp_size changed */
- if (off + len > zp->z_phys->zp_size) {
- /* lost race: file size changed, lock whole file */
- zfs_range_unlock(rl);
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
- }
- }
-
- /*
- * Nothing to do if file already at desired length.
- */
- size = zp->z_phys->zp_size;
- if (len == 0 && size == off && off != 0) {
- zfs_range_unlock(rl);
- return (0);
- }
-
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_bonus(tx, zp->z_id);
- new_blksz = 0;
- if (end > size &&
- (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
- /*
- * We are growing the file past the current block size.
- */
- if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
- } else {
- new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
- }
- /* new_blksz is already <= end, so this MIN yields new_blksz. */
- dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
- } else if (off < size) {
- /*
- * If len == 0, we are truncating the file.
- */
- dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
- }
-
- error = dmu_tx_assign(tx, zfsvfs->z_assign);
- if (error) {
- /* Wait here on ERESTART/TXG_NOWAIT; the error is still returned. */
- if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- zfs_range_unlock(rl);
- return (error);
- }
-
- if (new_blksz)
- zfs_grow_blocksize(zp, new_blksz, tx);
-
- /* Extending, or truncating to off (end == off when len == 0). */
- if (end > size || len == 0)
- zp->z_phys->zp_size = end;
-
- if (off < size) {
- objset_t *os = zfsvfs->z_os;
- uint64_t rlen = len;
-
- /* Free to end of object, or clamp the range to the old size. */
- if (len == 0)
- rlen = -1;
- else if (end > size)
- rlen = size - off;
- VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
- }
-
- if (log) {
- zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
- zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
- }
-
- zfs_range_unlock(rl);
-
- dmu_tx_commit(tx);
-
- /*
- * Clear any mapped pages in the truncated region. This has to
- * happen outside of the transaction to avoid the possibility of
- * a deadlock with someone trying to push a page that we are
- * about to invalidate.
- */
- rw_enter(&zp->z_map_lock, RW_WRITER);
- if (end > size)
- vnode_pager_setsize(vp, end);
- else if (len == 0) {
-#if 0
- error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
-#else
- error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
- vnode_pager_setsize(vp, end);
-#endif
- }
- rw_exit(&zp->z_map_lock);
-
- return (0);
-}
-
-/*
- * Populate an empty objset with the objects a ZFS filesystem needs: the
- * master node (holding the ZPL version), the unlinked set (delete queue)
- * and the root directory znode, all within transaction tx.
- *
- * IN: os - objset to populate
- * cr - credentials passed through to zfs_mknode()
- * tx - open transaction
- */
-void
-zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
-{
- zfsvfs_t zfsvfs;
- uint64_t moid, doid, roid = 0;
- uint64_t version = ZPL_VERSION;
- int error;
- znode_t *rootzp = NULL;
- vattr_t vattr;
-
- /*
- * First attempt to create master node.
- */
- /*
- * In an empty objset, there are no blocks to read and thus
- * there can be no i/o errors (which we assert below).
- */
- moid = MASTER_NODE_OBJ;
- error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- /*
- * Set starting attributes.
- */
-
- error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
- ASSERT(error == 0);
-
- /*
- * Create a delete queue.
- */
- doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
-
- error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
- ASSERT(error == 0);
-
- /*
- * Create root znode. Create minimal znode/vnode/zfsvfs
- * to allow zfs_mknode to work.
- */
- vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
- vattr.va_type = VDIR;
- vattr.va_mode = S_IFDIR|0755;
- vattr.va_uid = UID_ROOT;
- vattr.va_gid = GID_WHEEL;
-
- /* rootzp is a throwaway parent; IS_ROOT_NODE makes zfs_mknode fill it in. */
- rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(rootzp, NULL, 0);
- rootzp->z_zfsvfs = &zfsvfs;
- rootzp->z_unlinked = 0;
- rootzp->z_atime_dirty = 0;
- rootzp->z_dbuf_held = 0;
-
- /* The zfsvfs lives on the stack for the duration of this call only. */
- bzero(&zfsvfs, sizeof (zfsvfs_t));
-
- zfsvfs.z_os = os;
- zfsvfs.z_assign = TXG_NOWAIT;
- zfsvfs.z_parent = &zfsvfs;
-
- mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
- offsetof(znode_t, z_link_node));
-
- /* zpp is NULL, so zfs_mknode releases the new in-core znode itself. */
- zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
- ASSERT3U(rootzp->z_id, ==, roid);
- error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
- ASSERT(error == 0);
-
- mutex_destroy(&zfsvfs.z_znodes_lock);
- kmem_cache_free(znode_cache, rootzp);
-}
-#endif /* _KERNEL */
-
-/*
- * Look up the parent of object obj.
- *
- * On success *pobjp receives the parent's object number and *is_xattrdir
- * is non-zero when obj is a directory flagged ZFS_XATTR. Returns 0, the
- * error from dmu_bonus_hold(), or EINVAL when the object's bonus buffer
- * is not a znode.
- */
-static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
-{
-	dmu_object_info_t info;
-	znode_phys_t *phys;
-	dmu_buf_t *bonus;
-	int err;
-
-	err = dmu_bonus_hold(osp, obj, FTAG, &bonus);
-	if (err != 0)
-		return (err);
-
-	dmu_object_info_from_db(bonus, &info);
-	if (info.doi_bonus_type == DMU_OT_ZNODE &&
-	    info.doi_bonus_size >= sizeof (znode_phys_t)) {
-		phys = bonus->db_data;
-		*pobjp = phys->zp_parent;
-		*is_xattrdir = ((phys->zp_flags & ZFS_XATTR) != 0) &&
-		    S_ISDIR(phys->zp_mode);
-	} else {
-		/* The bonus buffer does not hold a znode_phys_t. */
-		err = EINVAL;
-	}
-
-	dmu_buf_rele(bonus, FTAG);
-	return (err);
-}
-
-/*
- * Build the path of object obj, relative to the root of its objset.
- *
- * The path is assembled right-to-left at the end of buf by walking parent
- * pointers until an object that is its own parent is reached, and is then
- * moved to the start of buf. Extended attribute directories appear as the
- * component "<xattrdir>".
- *
- * IN: osp - objset containing the object
- * obj - object number to resolve
- * len - size of buf in bytes
- *
- * OUT: buf - NUL-terminated path on success
- *
- * RETURN: 0 if success
- * EINVAL if buf is NULL or len is not positive
- * ENAMETOOLONG if the path does not fit in buf
- * error from zfs_obj_to_pobj() or zap_value_search()
- */
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
-{
-	char *path;
-	int error;
-
-	/* Without at least one byte there is no room for the terminator. */
-	if (buf == NULL || len <= 0)
-		return (EINVAL);
-
-	path = buf + len - 1;
-	*path = '\0';
-
-	for (;;) {
-		uint64_t pobj;
-		char component[MAXNAMELEN + 2];
-		size_t complen;
-		int is_xattrdir;
-
-		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
-		    &is_xattrdir)) != 0)
-			break;
-
-		/* An object that is its own parent ends the walk. */
-		if (pobj == obj) {
-			if (path[0] != '/') {
-				if (path == buf) {
-					error = ENAMETOOLONG;
-					break;
-				}
-				*--path = '/';
-			}
-			break;
-		}
-
-		component[0] = '/';
-		if (is_xattrdir) {
-			(void) sprintf(component + 1, "<xattrdir>");
-		} else {
-			error = zap_value_search(osp, pobj, obj, component + 1);
-			if (error != 0)
-				break;
-		}
-
-		/*
-		 * Check the remaining room before moving the pointer; an
-		 * ASSERT after the fact would let non-debug builds write
-		 * before the start of buf.
-		 */
-		complen = strlen(component);
-		if (complen > (size_t)(path - buf)) {
-			error = ENAMETOOLONG;
-			break;
-		}
-		path -= complen;
-		bcopy(component, path, complen);
-		obj = pobj;
-	}
-
-	if (error == 0)
-		(void) memmove(buf, path, buf + len - path);
-	return (error);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c
deleted file mode 100644
index 69ee509..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/dmu.h>
-#include <sys/zap.h>
-#include <sys/arc.h>
-#include <sys/stat.h>
-#include <sys/resource.h>
-#include <sys/zil.h>
-#include <sys/zil_impl.h>
-#include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
-#include <sys/dmu_tx.h>
-
-/*
- * The zfs intent log (ZIL) saves transaction records of system calls
- * that change the file system in memory with enough information
- * to be able to replay them. These are stored in memory until
- * either the DMU transaction group (txg) commits them to the stable pool
- * and they can be discarded, or they are flushed to the stable log
- * (also in the pool) due to a fsync, O_DSYNC or other synchronous
- * requirement. In the event of a panic or power fail then those log
- * records (transactions) are replayed.
- *
- * There is one ZIL per file system. Its on-disk (pool) format consists
- * of 3 parts:
- *
- * - ZIL header
- * - ZIL blocks
- * - ZIL records
- *
- * A log record holds a system call transaction. Log blocks can
- * hold many log records and the blocks are chained together.
- * Each ZIL block contains a block pointer (blkptr_t) to the next
- * ZIL block in the chain. The ZIL header points to the first
- * block in the chain. Note there is not a fixed place in the pool
- * to hold blocks. They are dynamically allocated and freed as
- * needed from the blocks available. Figure X shows the ZIL structure:
- */
-
-/*
- * This global ZIL switch affects all pools
- */
-int zil_disable = 0; /* disable intent logging */
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.zil_disable", &zil_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_disable, CTLFLAG_RW, &zil_disable, 0,
- "Disable ZFS Intent Log (ZIL)");
-
-/*
- * Tunable parameter for debugging or performance analysis. Setting
- * zfs_nocacheflush will cause corruption on power loss if a volatile
- * out-of-order write cache is enabled.
- */
-/* NOTE(review): boolean_t is registered through the *_INT macros here. */
-boolean_t zfs_nocacheflush = B_FALSE;
-TUNABLE_INT("vfs.zfs.cache_flush_disable", &zfs_nocacheflush);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
- &zfs_nocacheflush, 0, "Disable cache flush");
-
-/* kmem cache from which log write buffers (lwb_t) are allocated. */
-static kmem_cache_t *zil_lwb_cache;
-
-/*
- * AVL comparator for DVAs: order by vdev first, then by offset.
- * Returns -1, 0 or 1.
- */
-static int
-zil_dva_compare(const void *x1, const void *x2)
-{
-	const dva_t *lhs = x1;
-	const dva_t *rhs = x2;
-
-	if (DVA_GET_VDEV(lhs) != DVA_GET_VDEV(rhs))
-		return (DVA_GET_VDEV(lhs) < DVA_GET_VDEV(rhs) ? -1 : 1);
-
-	if (DVA_GET_OFFSET(lhs) != DVA_GET_OFFSET(rhs))
-		return (DVA_GET_OFFSET(lhs) < DVA_GET_OFFSET(rhs) ? -1 : 1);
-
-	return (0);
-}
-
-/*
- * Initialize t as an AVL tree of zil_dva_node_t entries ordered by
- * zil_dva_compare().
- */
-static void
-zil_dva_tree_init(avl_tree_t *t)
-{
- avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
- offsetof(zil_dva_node_t, zn_node));
-}
-
-/*
- * Free every node remaining in the DVA tree, then destroy the tree.
- */
-static void
-zil_dva_tree_fini(avl_tree_t *t)
-{
-	void *cookie = NULL;
-	zil_dva_node_t *node;
-
-	for (node = avl_destroy_nodes(t, &cookie); node != NULL;
-	    node = avl_destroy_nodes(t, &cookie)) {
-		kmem_free(node, sizeof (zil_dva_node_t));
-	}
-
-	avl_destroy(t);
-}
-
-/*
- * Record dva in the tree.
- *
- * Returns 0 if the DVA was inserted, or EEXIST if it was already there.
- */
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
-{
-	avl_index_t slot;
-	zil_dva_node_t *node;
-
-	if (avl_find(t, dva, &slot) == NULL) {
-		node = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
-		node->zn_dva = *dva;
-		avl_insert(t, node, slot);
-		return (0);
-	}
-
-	return (EEXIST);
-}
-
-/*
- * Return zilog's header pointer cast to a non-const zil_header_t *, so
- * that the caller can modify the header.
- */
-static zil_header_t *
-zil_header_in_syncing_context(zilog_t *zilog)
-{
- return ((zil_header_t *)zilog->zl_header);
-}
-
-/*
- * Seed the checksum words of the first block pointer of a log chain:
- * two random GUID words, the objset id, and sequence number 1.
- */
-static void
-zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
-{
-	bp->blk_cksum.zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
-	bp->blk_cksum.zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
-	bp->blk_cksum.zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
-	bp->blk_cksum.zc_word[ZIL_ZC_SEQ] = 1ULL;
-}
-
-/*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
- *
- * IN: zilog - log the block belongs to
- * bp - block pointer of the log block to read
- *
- * OUT: abufpp - ARC buffer holding the block on success, NULL otherwise
- *
- * RETURN: 0 if success
- * error from arc_read() if the read fails
- * ESTALE if the trailer's next-block checksum is not bp's
- * checksum with the sequence word incremented
- * ENOENT if the trailer's next block pointer is a hole
- * EOVERFLOW if zit_nused exceeds the space before the trailer
- */
-static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
-{
- blkptr_t blk = *bp;
- zbookmark_t zb;
- uint32_t aflags = ARC_WAIT;
- int error;
-
- zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- *abufpp = NULL;
-
- error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
- arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
-
- if (error == 0) {
- char *data = (*abufpp)->b_data;
- uint64_t blksz = BP_GET_LSIZE(bp);
- /* The trailer occupies the last sizeof (zil_trailer_t) bytes. */
- zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
- zio_cksum_t cksum = bp->blk_cksum;
-
- /*
- * Sequence numbers should be... sequential. The checksum
- * verifier for the next block should be bp's checksum plus 1.
- */
- cksum.zc_word[ZIL_ZC_SEQ]++;
-
- if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
- error = ESTALE;
- else if (BP_IS_HOLE(&ztp->zit_next_blk))
- error = ENOENT;
- else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
- error = EOVERFLOW;
-
- /* An invalid block is not handed back to the caller. */
- if (error) {
- VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
- *abufpp = NULL;
- }
- }
-
- dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
-
- return (error);
-}
-
-/*
- * Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
- *
- * IN: zilog - log to walk
- * parse_blk_func - called for every block pointer in the chain,
- * including the one whose read fails; may be NULL
- * parse_lr_func - called for every record in every valid block;
- * may be NULL
- * arg, txg - passed through to both callbacks
- *
- * zilog->zl_dva_tree is created before the walk and torn down after it,
- * so the callbacks can use it while parsing.
- */
-uint64_t
-zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
- zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
-{
- const zil_header_t *zh = zilog->zl_header;
- uint64_t claim_seq = zh->zh_claim_seq;
- uint64_t seq = 0;
- uint64_t max_seq = 0;
- blkptr_t blk = zh->zh_log;
- arc_buf_t *abuf;
- char *lrbuf, *lrp;
- zil_trailer_t *ztp;
- int reclen, error;
-
- if (BP_IS_HOLE(&blk))
- return (max_seq);
-
- /*
- * Starting at the block pointed to by zh_log we read the log chain.
- * For each block in the chain we strongly check that block to
- * ensure its validity. We stop when an invalid block is found.
- * For each block pointer in the chain we call parse_blk_func().
- * For each record in each valid block we call parse_lr_func().
- * If the log has been claimed, stop if we encounter a sequence
- * number greater than the highest claimed sequence number.
- */
- zil_dva_tree_init(&zilog->zl_dva_tree);
- for (;;) {
- seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- if (claim_seq != 0 && seq > claim_seq)
- break;
-
- ASSERT(max_seq < seq);
- max_seq = seq;
-
- error = zil_read_log_block(zilog, &blk, &abuf);
-
- /* The block callback runs even when the read above failed. */
- if (parse_blk_func != NULL)
- parse_blk_func(zilog, &blk, arg, txg);
-
- if (error)
- break;
-
- /* Advance blk to the next block before walking this one's records. */
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
-
- if (parse_lr_func == NULL) {
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- continue;
- }
-
- /*
- * NOTE(review): reclen is checked only by ASSERT3U; if that is
- * compiled out, a record with lrc_reclen == 0 would keep this
- * loop from advancing - confirm.
- */
- for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
- lr_t *lr = (lr_t *)lrp;
- reclen = lr->lrc_reclen;
- ASSERT3U(reclen, >=, sizeof (lr_t));
- parse_lr_func(zilog, lr, arg, txg);
- }
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- }
- zil_dva_tree_fini(&zilog->zl_dva_tree);
-
- return (max_seq);
-}
-
-/*
- * zil_parse() block callback used while claiming a log.
- *
- * Claims bp in first_txg unless it was born before first_txg or its DVA
- * has already been recorded in zilog->zl_dva_tree.
- */
-/* ARGSUSED */
-static void
-zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
-{
-	spa_t *spa = zilog->zl_spa;
-	int err;
-
-	/* Already committed: nothing to claim. */
-	if (bp->blk_birth < first_txg)
-		return;
-
-	/* Already claimed during this parse. */
-	if (zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) != 0)
-		return;
-
-	err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
-	ASSERT(err == 0);
-}
-
-/*
- * zil_parse() record callback used while claiming a log: for TX_WRITE
- * records, claim the block the record points at. All other record types
- * are ignored.
- */
-static void
-zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
-{
-	lr_write_t *wr;
-
-	if (lrc->lrc_txtype != TX_WRITE)
-		return;
-
-	wr = (lr_write_t *)lrc;
-	zil_claim_log_block(zilog, &wr->lr_blkptr, tx, first_txg);
-}
-
-/*
- * zil_parse() block callback used while destroying a log: free bp in the
- * txg of transaction tx. claim_txg is not used.
- */
-/* ARGSUSED */
-static void
-zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
-{
- zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
-}
-
-/*
- * zil_parse() record callback used while destroying a log.
- *
- * If the log was claimed (claim_txg != 0), free the block referenced by a
- * TX_WRITE record, provided it was born at or after claim_txg and its DVA
- * has not already been seen during this parse.
- */
-static void
-zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
-{
-	blkptr_t *bp;
-
-	/*
-	 * If we previously claimed it, we need to free it.
-	 */
-	if (claim_txg == 0 || lrc->lrc_txtype != TX_WRITE)
-		return;
-
-	bp = &((lr_write_t *)lrc)->lr_blkptr;
-	if (bp->blk_birth < claim_txg)
-		return;
-
-	if (zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
-		(void) arc_free(NULL, zilog->zl_spa,
-		    dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
-	}
-}
-
-/*
- * Create an on-disk intent log.
- *
- * If the header has no first log block yet, one of ZIL_MIN_BLKSZ bytes is
- * allocated in a new transaction. An in-memory log write buffer for the
- * first block is then appended to zl_lwb_list, unless that allocation
- * failed.
- */
-static void
-zil_create(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
- uint64_t txg = 0;
- dmu_tx_t *tx = NULL;
- blkptr_t blk;
- int error = 0;
-
- /*
- * Wait for any previous destroy to complete.
- */
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
-
- ASSERT(zh->zh_claim_txg == 0);
- ASSERT(zh->zh_replay_seq == 0);
-
- blk = zh->zh_log;
-
- /*
- * If we don't already have an initial log block, allocate one now.
- */
- if (BP_IS_HOLE(&blk)) {
- tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
- NULL, txg);
-
- if (error == 0)
- zil_init_log_chain(zilog, &blk);
- }
-
- /*
- * Allocate a log write buffer (lwb) for the first log block.
- */
- if (error == 0) {
- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- lwb->lwb_zilog = zilog;
- lwb->lwb_blk = blk;
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
- lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
- lwb->lwb_max_txg = txg;
- lwb->lwb_zio = NULL;
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
- }
-
- /*
- * If we just allocated the first log block, commit our transaction
- * and wait for zil_sync() to stuff the block pointer into zh_log.
- * (zh is part of the MOS, so we cannot modify it in open context.)
- */
- if (tx != NULL) {
- dmu_tx_commit(tx);
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
-
- /* NOTE(review): this is also reached when zio_alloc_blk() failed. */
- ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
-}
-
-/*
- * In one tx, free all log blocks and clear the log header.
- * If keep_first is set, then we're replaying a log with no content.
- * We want to keep the first block, however, so that the first
- * synchronous transaction doesn't require a txg_wait_synced()
- * in zil_create(). We don't need to txg_wait_synced() here either
- * when keep_first is set, because both zil_create() and zil_destroy()
- * will wait for any in-progress destroys to complete.
- *
- * Returns immediately if the header has no log block at all.
- */
-void
-zil_destroy(zilog_t *zilog, boolean_t keep_first)
-{
- const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
- dmu_tx_t *tx;
- uint64_t txg;
-
- /*
- * Wait for any previous destroy to complete.
- */
- txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
-
- if (BP_IS_HOLE(&zh->zh_log))
- return;
-
- tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- mutex_enter(&zilog->zl_lock);
-
- /* Record the destroy so later create/destroy calls wait for this txg. */
- ASSERT3U(zilog->zl_destroy_txg, <, txg);
- zilog->zl_destroy_txg = txg;
- zilog->zl_keep_first = keep_first;
-
- if (!list_is_empty(&zilog->zl_lwb_list)) {
- /* In-memory lwbs exist: free each buffer and its log block. */
- ASSERT(zh->zh_claim_txg == 0);
- ASSERT(!keep_first);
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
- kmem_cache_free(zil_lwb_cache, lwb);
- }
- } else {
- /* No lwbs: walk the on-disk chain and free what it references. */
- if (!keep_first) {
- (void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zh->zh_claim_txg);
- }
- }
- mutex_exit(&zilog->zl_lock);
-
- dmu_tx_commit(tx);
-
- if (keep_first) /* no need to wait in this case */
- return;
-
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- ASSERT(BP_IS_HOLE(&zh->zh_log));
-}
-
-/*
- * Claim the intent log blocks of the objset named osname.
- *
- * IN: osname - name of the objset whose log is to be claimed
- * txarg - dmu_tx_t * of the claiming transaction
- *
- * RETURN: always 0; a failure to open the objset is only reported with
- * a warning.
- */
-int
-zil_claim(char *osname, void *txarg)
-{
- dmu_tx_t *tx = txarg;
- uint64_t first_txg = dmu_tx_get_txg(tx);
- zilog_t *zilog;
- zil_header_t *zh;
- objset_t *os;
- int error;
-
- error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os);
- if (error) {
- cmn_err(CE_WARN, "can't process intent log for %s", osname);
- return (0);
- }
-
- zilog = dmu_objset_zil(os);
- zh = zil_header_in_syncing_context(zilog);
-
- /*
- * Claim all log blocks if we haven't already done so, and remember
- * the highest claimed sequence number. This ensures that if we can
- * read only part of the log now (e.g. due to a missing device),
- * but we can read the entire log later, we will not try to replay
- * or destroy beyond the last block we successfully claimed.
- */
- ASSERT3U(zh->zh_claim_txg, <=, first_txg);
- if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
- zh->zh_claim_txg = first_txg;
- zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
- zil_claim_log_record, tx, first_txg);
- /* The header was modified above; dirty the dataset in tx. */
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
- }
-
- ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
- dmu_objset_close(os);
- return (0);
-}
-
-/*
- * Remember that vdev needs its write cache flushed.
- *
- * Vdev ids that fit in zl_vdev_bmap are recorded with one atomic bit set;
- * larger ids go into zl_vdev_list, which is kept sorted and free of
- * duplicates under zl_lock. Nothing is recorded when zfs_nocacheflush is
- * set.
- */
-void
-zil_add_vdev(zilog_t *zilog, uint64_t vdev)
-{
- zil_vdev_t *zv, *new;
- /* Number of bits in the bitmap (bytes << 3). */
- uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3;
- uchar_t *cp;
-
- if (zfs_nocacheflush)
- return;
-
- if (vdev < bmap_sz) {
- cp = zilog->zl_vdev_bmap + (vdev / 8);
- atomic_or_8(cp, 1 << (vdev % 8));
- } else {
- /*
- * insert into ordered list
- */
- mutex_enter(&zilog->zl_lock);
- for (zv = list_head(&zilog->zl_vdev_list); zv != NULL;
- zv = list_next(&zilog->zl_vdev_list, zv)) {
- if (zv->vdev == vdev) {
- /* duplicate found - just return */
- mutex_exit(&zilog->zl_lock);
- return;
- }
- if (zv->vdev > vdev) {
- /* insert before this entry */
- new = kmem_alloc(sizeof (zil_vdev_t),
- KM_SLEEP);
- new->vdev = vdev;
- list_insert_before(&zilog->zl_vdev_list,
- zv, new);
- mutex_exit(&zilog->zl_lock);
- return;
- }
- }
- /* ran off end of list, insert at the end */
- ASSERT(zv == NULL);
- new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
- new->vdev = vdev;
- list_insert_tail(&zilog->zl_vdev_list, new);
- mutex_exit(&zilog->zl_lock);
- }
-}
-
-/*
- * start an async flush of the write cache for this vdev
- *
- * *zio is the root zio the flush is issued under; it is created here on
- * first use, and the caller waits on it.
- */
-void
-zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
-{
- vdev_t *vd;
-
- if (*zio == NULL)
- *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
- vd = vdev_lookup_top(spa, vdev);
- /* NOTE(review): a failed lookup is only caught by this ASSERT. */
- ASSERT(vd);
-
- (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
-}
-
-/*
- * Flush the write caches of every vdev recorded by zil_add_vdev(), from
- * both the bitmap and the overflow list, clearing both as it goes, and
- * wait for the flushes to finish.
- */
-void
-zil_flush_vdevs(zilog_t *zilog)
-{
-	spa_t *spa = zilog->zl_spa;
-	zio_t *root = NULL;
-	zil_vdev_t *ent;
-	uint64_t id;
-	uint8_t bits;
-	int byte, bit;
-
-	ASSERT(zilog->zl_writer);
-
-	/* Vdevs recorded in the bitmap: bit `bit` of byte `byte` is id 8*byte+bit. */
-	for (byte = 0; byte < sizeof (zilog->zl_vdev_bmap); byte++) {
-		bits = zilog->zl_vdev_bmap[byte];
-		if (bits != 0) {
-			for (bit = 0; bit < 8; bit++) {
-				if ((bits & (1 << bit)) != 0) {
-					id = (byte << 3) + bit;
-					zil_flush_vdev(spa, id, &root);
-				}
-			}
-			zilog->zl_vdev_bmap[byte] = 0;
-		}
-	}
-
-	/* Vdevs recorded in the overflow list. */
-	for (ent = list_head(&zilog->zl_vdev_list); ent != NULL;
-	    ent = list_head(&zilog->zl_vdev_list)) {
-		zil_flush_vdev(spa, ent->vdev, &root);
-		list_remove(&zilog->zl_vdev_list, ent);
-		kmem_free(ent, sizeof (zil_vdev_t));
-	}
-
-	/*
-	 * Wait for all the flushes to complete. Not all devices actually
-	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
-	 */
-	if (root != NULL)
-		(void) zio_wait(root);
-}
-
-/*
- * Completion callback for a log block write.
- *
- * Releases the txg hold taken when the block was started, frees the
- * block's data buffer and, if the write failed, flags the log as being
- * in error.
- */
-static void
-zil_lwb_write_done(zio_t *zio)
-{
-	lwb_t *done = zio->io_private;
-	zilog_t *log = done->lwb_zilog;
-
-	/*
-	 * Now that we've written this log block, we have a stable pointer
-	 * to the next block in the chain, so it's OK to let the txg in
-	 * which we allocated the next block sync.
-	 */
-	txg_rele_to_sync(&done->lwb_txgh);
-
-	zio_buf_free(done->lwb_buf, done->lwb_sz);
-
-	mutex_enter(&log->zl_lock);
-	done->lwb_buf = NULL;
-	if (zio->io_error)
-		log->zl_log_error = B_TRUE;
-	mutex_exit(&log->zl_lock);
-}
-
-/*
- * Initialize the io for a log block.
- *
- * Note, we should not initialize the IO until we are about
- * to use it, since zio_rewrite() does a spa_config_enter().
- */
-static void
-zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
-{
- zbookmark_t zb;
-
- zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- if (zilog->zl_root_zio == NULL) {
- zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
- }
- if (lwb->lwb_zio == NULL) {
- lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
- lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- }
-}
-
-/*
- * Start a log block write and advance to the next log block.
- * Calls are serialized.
- */
-static lwb_t *
-zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
-{
- lwb_t *nlwb;
- zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
- spa_t *spa = zilog->zl_spa;
- blkptr_t *bp = &ztp->zit_next_blk;
- uint64_t txg;
- uint64_t zil_blksz;
- int error;
-
- ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
-
- /*
- * Allocate the next block and save its address in this block
- * before writing it in order to establish the log chain.
- * Note that if the allocation of nlwb synced before we wrote
- * the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
- */
- txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
- txg_rele_to_quiesce(&lwb->lwb_txgh);
-
- /*
- * Pick a ZIL blocksize. We request a size that is the
- * maximum of the previous used size, the current used size and
- * the amount waiting in the queue.
- */
- zil_blksz = MAX(zilog->zl_prev_used,
- zilog->zl_cur_used + sizeof (*ztp));
- zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
- zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
- if (zil_blksz > ZIL_MAX_BLKSZ)
- zil_blksz = ZIL_MAX_BLKSZ;
-
- BP_ZERO(bp);
- /* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
- if (error) {
- dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
-
- /*
- * We dirty the dataset to ensure that zil_sync() will
- * be called to remove this lwb from our zl_lwb_list.
- * Failing to do so, may leave an lwb with a NULL lwb_buf
- * hanging around on the zl_lwb_list.
- */
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- dmu_tx_commit(tx);
-
- /*
- * Since we've just experienced an allocation failure so we
- * terminate the current lwb and send it on its way.
- */
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- zio_nowait(lwb->lwb_zio);
-
- /*
- * By returning NULL the caller will call tx_wait_synced()
- */
- return (NULL);
- }
-
- ASSERT3U(bp->blk_birth, ==, txg);
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
-
- /*
- * Allocate a new log write buffer (lwb).
- */
- nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-
- nlwb->lwb_zilog = zilog;
- nlwb->lwb_blk = *bp;
- nlwb->lwb_nused = 0;
- nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
- nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
- nlwb->lwb_max_txg = txg;
- nlwb->lwb_zio = NULL;
-
- /*
- * Put new lwb at the end of the log chain
- */
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, nlwb);
- mutex_exit(&zilog->zl_lock);
-
- /* Record the vdev for later flushing */
- zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))));
-
- /*
- * kick off the write for the old log block
- */
- dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
- ASSERT(lwb->lwb_zio);
- zio_nowait(lwb->lwb_zio);
-
- return (nlwb);
-}
-
-static lwb_t *
-zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
-{
- lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lr = (lr_write_t *)lrc;
- uint64_t txg = lrc->lrc_txg;
- uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen;
-
- if (lwb == NULL)
- return (NULL);
- ASSERT(lwb->lwb_buf != NULL);
-
- if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
- dlen = P2ROUNDUP_TYPED(
- lr->lr_length, sizeof (uint64_t), uint64_t);
- else
- dlen = 0;
-
- zilog->zl_cur_used += (reclen + dlen);
-
- zil_lwb_write_init(zilog, lwb);
-
- /*
- * If this record won't fit in the current log block, start a new one.
- */
- if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
- lwb = zil_lwb_write_start(zilog, lwb);
- if (lwb == NULL)
- return (NULL);
- zil_lwb_write_init(zilog, lwb);
- ASSERT(lwb->lwb_nused == 0);
- if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- return (lwb);
- }
- }
-
- /*
- * Update the lrc_seq, to be log record sequence number. See zil.h
- * Then copy the record to the log buffer.
- */
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
-
- /*
- * If it's a write, fetch the data or get its blkptr as appropriate.
- */
- if (lrc->lrc_txtype == TX_WRITE) {
- if (txg > spa_freeze_txg(zilog->zl_spa))
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- if (itx->itx_wr_state != WR_COPIED) {
- char *dbuf;
- int error;
-
- /* alignment is guaranteed */
- lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
- if (dlen) {
- ASSERT(itx->itx_wr_state == WR_NEED_COPY);
- dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
- lr->lr_common.lrc_reclen += dlen;
- } else {
- ASSERT(itx->itx_wr_state == WR_INDIRECT);
- dbuf = NULL;
- }
- error = zilog->zl_get_data(
- itx->itx_private, lr, dbuf, lwb->lwb_zio);
- if (error) {
- ASSERT(error == ENOENT || error == EEXIST ||
- error == EALREADY);
- return (lwb);
- }
- }
- }
-
- lwb->lwb_nused += reclen + dlen;
- lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
- ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
- ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
-
- return (lwb);
-}
-
-itx_t *
-zil_itx_create(int txtype, size_t lrsize)
-{
- itx_t *itx;
-
- lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
-
- itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
- itx->itx_lr.lrc_txtype = txtype;
- itx->itx_lr.lrc_reclen = lrsize;
- itx->itx_lr.lrc_seq = 0; /* defensive */
-
- return (itx);
-}
-
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
-{
- uint64_t seq;
-
- ASSERT(itx->itx_lr.lrc_seq == 0);
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz += itx->itx_lr.lrc_reclen;
- itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
- itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
-
- return (seq);
-}
-
-/*
- * Free up all in-memory intent log transactions that have now been synced.
- */
-static void
-zil_itx_clean(zilog_t *zilog)
-{
- uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
- uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
- list_t clean_list;
- itx_t *itx;
-
- list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
-
- mutex_enter(&zilog->zl_lock);
- /* wait for a log writer to finish walking list */
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- }
-
- /*
- * Move the sync'd log transactions to a separate list so we can call
- * kmem_free without holding the zl_lock.
- *
- * There is no need to set zl_writer as we don't drop zl_lock here
- */
- while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
- itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
- list_remove(&zilog->zl_itx_list, itx);
- zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
- list_insert_tail(&clean_list, itx);
- }
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
-
- /* destroy sync'd log transactions */
- while ((itx = list_head(&clean_list)) != NULL) {
- list_remove(&clean_list, itx);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
- }
- list_destroy(&clean_list);
-}
-
-/*
- * If there are any in-memory intent log transactions which have now been
- * synced then start up a taskq to free them.
- */
-void
-zil_clean(zilog_t *zilog)
-{
- itx_t *itx;
-
- mutex_enter(&zilog->zl_lock);
- itx = list_head(&zilog->zl_itx_list);
- if ((itx != NULL) &&
- (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
- (void) taskq_dispatch(zilog->zl_clean_taskq,
- (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
-{
- uint64_t txg;
- uint64_t reclen;
- uint64_t commit_seq = 0;
- itx_t *itx, *itx_next = (itx_t *)-1;
- lwb_t *lwb;
- spa_t *spa;
-
- zilog->zl_writer = B_TRUE;
- zilog->zl_root_zio = NULL;
- spa = zilog->zl_spa;
-
- if (zilog->zl_suspend) {
- lwb = NULL;
- } else {
- lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- /*
- * Return if there's nothing to flush before we
- * dirty the fs by calling zil_create()
- */
- if (list_is_empty(&zilog->zl_itx_list)) {
- zilog->zl_writer = B_FALSE;
- return;
- }
- mutex_exit(&zilog->zl_lock);
- zil_create(zilog);
- mutex_enter(&zilog->zl_lock);
- lwb = list_tail(&zilog->zl_lwb_list);
- }
- }
-
- /* Loop through in-memory log transactions filling log blocks. */
- DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
- for (;;) {
- /*
- * Find the next itx to push:
- * Push all transactions related to specified foid and all
- * other transactions except TX_WRITE, TX_TRUNCATE,
- * TX_SETATTR and TX_ACL for all other files.
- */
- if (itx_next != (itx_t *)-1)
- itx = itx_next;
- else
- itx = list_head(&zilog->zl_itx_list);
- for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
- if (foid == 0) /* push all foids? */
- break;
- if (itx->itx_sync) /* push all O_[D]SYNC */
- break;
- switch (itx->itx_lr.lrc_txtype) {
- case TX_SETATTR:
- case TX_WRITE:
- case TX_TRUNCATE:
- case TX_ACL:
- /* lr_foid is same offset for these records */
- if (((lr_write_t *)&itx->itx_lr)->lr_foid
- != foid) {
- continue; /* skip this record */
- }
- }
- break;
- }
- if (itx == NULL)
- break;
-
- reclen = itx->itx_lr.lrc_reclen;
- if ((itx->itx_lr.lrc_seq > seq) &&
- ((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) {
- break;
- }
-
- /*
- * Save the next pointer. Even though we soon drop
- * zl_lock all threads that may change the list
- * (another writer or zil_itx_clean) can't do so until
- * they have zl_writer.
- */
- itx_next = list_next(&zilog->zl_itx_list, itx);
- list_remove(&zilog->zl_itx_list, itx);
- mutex_exit(&zilog->zl_lock);
- txg = itx->itx_lr.lrc_txg;
- ASSERT(txg);
-
- if (txg > spa_last_synced_txg(spa) ||
- txg > spa_freeze_txg(spa))
- lwb = zil_lwb_commit(zilog, itx, lwb);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
- mutex_enter(&zilog->zl_lock);
- zilog->zl_itx_list_sz -= reclen;
- }
- DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
- /* determine commit sequence number */
- itx = list_head(&zilog->zl_itx_list);
- if (itx)
- commit_seq = itx->itx_lr.lrc_seq;
- else
- commit_seq = zilog->zl_itx_seq;
- mutex_exit(&zilog->zl_lock);
-
- /* write the last block out */
- if (lwb != NULL && lwb->lwb_zio != NULL)
- lwb = zil_lwb_write_start(zilog, lwb);
-
- zilog->zl_prev_used = zilog->zl_cur_used;
- zilog->zl_cur_used = 0;
-
- /*
- * Wait if necessary for the log blocks to be on stable storage.
- */
- if (zilog->zl_root_zio) {
- DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
- (void) zio_wait(zilog->zl_root_zio);
- DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
- if (!zfs_nocacheflush)
- zil_flush_vdevs(zilog);
- }
-
- if (zilog->zl_log_error || lwb == NULL) {
- zilog->zl_log_error = 0;
- txg_wait_synced(zilog->zl_dmu_pool, 0);
- }
-
- mutex_enter(&zilog->zl_lock);
- zilog->zl_writer = B_FALSE;
-
- ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
- zilog->zl_commit_seq = commit_seq;
-}
-
-/*
- * Push zfs transactions to stable storage up to the supplied sequence number.
- * If foid is 0 push out all transactions, otherwise push only those
- * for that file or might have been used to create that file.
- */
-void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
-{
- if (zilog == NULL || seq == 0)
- return;
-
- mutex_enter(&zilog->zl_lock);
-
- seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
-
- while (zilog->zl_writer) {
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- if (seq < zilog->zl_commit_seq) {
- mutex_exit(&zilog->zl_lock);
- return;
- }
- }
- zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
- /* wake up others waiting on the commit */
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
-}
-
-/*
- * Called in syncing context to free committed log blocks and update log header.
- */
-void
-zil_sync(zilog_t *zilog, dmu_tx_t *tx)
-{
- zil_header_t *zh = zil_header_in_syncing_context(zilog);
- uint64_t txg = dmu_tx_get_txg(tx);
- spa_t *spa = zilog->zl_spa;
- lwb_t *lwb;
-
- mutex_enter(&zilog->zl_lock);
-
- ASSERT(zilog->zl_stop_sync == 0);
-
- zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
-
- if (zilog->zl_destroy_txg == txg) {
- blkptr_t blk = zh->zh_log;
-
- ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
- ASSERT(spa_sync_pass(spa) == 1);
-
- bzero(zh, sizeof (zil_header_t));
- bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
-
- if (zilog->zl_keep_first) {
- /*
- * If this block was part of log chain that couldn't
- * be claimed because a device was missing during
- * zil_claim(), but that device later returns,
- * then this block could erroneously appear valid.
- * To guard against this, assign a new GUID to the new
- * log chain so it doesn't matter what blk points to.
- */
- zil_init_log_chain(zilog, &blk);
- zh->zh_log = blk;
- }
- }
-
- for (;;) {
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- mutex_exit(&zilog->zl_lock);
- return;
- }
- zh->zh_log = lwb->lwb_blk;
- if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
- break;
- list_remove(&zilog->zl_lwb_list, lwb);
- zio_free_blk(spa, &lwb->lwb_blk, txg);
- kmem_cache_free(zil_lwb_cache, lwb);
-
- /*
- * If we don't have anything left in the lwb list then
- * we've had an allocation failure and we need to zero
- * out the zil_header blkptr so that we don't end
- * up freeing the same block twice.
- */
- if (list_head(&zilog->zl_lwb_list) == NULL)
- BP_ZERO(&zh->zh_log);
- }
- mutex_exit(&zilog->zl_lock);
-}
-
-void
-zil_init(void)
-{
- zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
- sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-zil_fini(void)
-{
- kmem_cache_destroy(zil_lwb_cache);
-}
-
-zilog_t *
-zil_alloc(objset_t *os, zil_header_t *zh_phys)
-{
- zilog_t *zilog;
-
- zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
-
- zilog->zl_header = zh_phys;
- zilog->zl_os = os;
- zilog->zl_spa = dmu_objset_spa(os);
- zilog->zl_dmu_pool = dmu_objset_pool(os);
- zilog->zl_destroy_txg = TXG_INITIAL - 1;
-
- mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
- cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
-
- list_create(&zilog->zl_itx_list, sizeof (itx_t),
- offsetof(itx_t, itx_node));
-
- list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
- offsetof(lwb_t, lwb_node));
-
- list_create(&zilog->zl_vdev_list, sizeof (zil_vdev_t),
- offsetof(zil_vdev_t, vdev_seq_node));
-
- return (zilog);
-}
-
-void
-zil_free(zilog_t *zilog)
-{
- lwb_t *lwb;
- zil_vdev_t *zv;
-
- zilog->zl_stop_sync = 1;
-
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- list_remove(&zilog->zl_lwb_list, lwb);
- if (lwb->lwb_buf != NULL)
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- kmem_cache_free(zil_lwb_cache, lwb);
- }
- list_destroy(&zilog->zl_lwb_list);
-
- while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
- list_remove(&zilog->zl_vdev_list, zv);
- kmem_free(zv, sizeof (zil_vdev_t));
- }
- list_destroy(&zilog->zl_vdev_list);
-
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
- list_destroy(&zilog->zl_itx_list);
- cv_destroy(&zilog->zl_cv_suspend);
- cv_destroy(&zilog->zl_cv_writer);
- mutex_destroy(&zilog->zl_lock);
-
- kmem_free(zilog, sizeof (zilog_t));
-}
-
-/*
- * return true if the initial log block is not valid
- */
-static int
-zil_empty(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- arc_buf_t *abuf = NULL;
-
- if (BP_IS_HOLE(&zh->zh_log))
- return (1);
-
- if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
- return (1);
-
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- return (0);
-}
-
-/*
- * Open an intent log.
- */
-zilog_t *
-zil_open(objset_t *os, zil_get_data_t *get_data)
-{
- zilog_t *zilog = dmu_objset_zil(os);
-
- zilog->zl_get_data = get_data;
- zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
- 2, 2, TASKQ_PREPOPULATE);
-
- return (zilog);
-}
-
-/*
- * Close an intent log.
- */
-void
-zil_close(zilog_t *zilog)
-{
- /*
- * If the log isn't already committed, mark the objset dirty
- * (so zil_sync() will be called) and wait for that txg to sync.
- */
- if (!zil_is_committed(zilog)) {
- uint64_t txg;
- dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
- dmu_tx_commit(tx);
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- }
-
- taskq_destroy(zilog->zl_clean_taskq);
- zilog->zl_clean_taskq = NULL;
- zilog->zl_get_data = NULL;
-
- zil_itx_clean(zilog);
- ASSERT(list_head(&zilog->zl_itx_list) == NULL);
-}
-
-/*
- * Suspend an intent log. While in suspended mode, we still honor
- * synchronous semantics, but we rely on txg_wait_synced() to do it.
- * We suspend the log briefly when taking a snapshot so that the snapshot
- * contains all the data it's supposed to, and has an empty intent log.
- */
-int
-zil_suspend(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
-
- mutex_enter(&zilog->zl_lock);
- if (zh->zh_claim_txg != 0) { /* unplayed log */
- mutex_exit(&zilog->zl_lock);
- return (EBUSY);
- }
- if (zilog->zl_suspend++ != 0) {
- /*
- * Someone else already began a suspend.
- * Just wait for them to finish.
- */
- while (zilog->zl_suspending)
- cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
- ASSERT(BP_IS_HOLE(&zh->zh_log));
- mutex_exit(&zilog->zl_lock);
- return (0);
- }
- zilog->zl_suspending = B_TRUE;
- mutex_exit(&zilog->zl_lock);
-
- zil_commit(zilog, UINT64_MAX, 0);
-
- /*
- * Wait for any in-flight log writes to complete.
- */
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- mutex_exit(&zilog->zl_lock);
-
- zil_destroy(zilog, B_FALSE);
-
- mutex_enter(&zilog->zl_lock);
- ASSERT(BP_IS_HOLE(&zh->zh_log));
- zilog->zl_suspending = B_FALSE;
- cv_broadcast(&zilog->zl_cv_suspend);
- mutex_exit(&zilog->zl_lock);
-
- return (0);
-}
-
-void
-zil_resume(zilog_t *zilog)
-{
- mutex_enter(&zilog->zl_lock);
- ASSERT(zilog->zl_suspend != 0);
- zilog->zl_suspend--;
- mutex_exit(&zilog->zl_lock);
-}
-
-typedef struct zil_replay_arg {
- objset_t *zr_os;
- zil_replay_func_t **zr_replay;
- void *zr_arg;
- uint64_t *zr_txgp;
- boolean_t zr_byteswap;
- char *zr_lrbuf;
-} zil_replay_arg_t;
-
-static void
-zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
-{
- zil_replay_arg_t *zr = zra;
- const zil_header_t *zh = zilog->zl_header;
- uint64_t reclen = lr->lrc_reclen;
- uint64_t txtype = lr->lrc_txtype;
- char *name;
- int pass, error, sunk;
-
- if (zilog->zl_stop_replay)
- return;
-
- if (lr->lrc_txg < claim_txg) /* already committed */
- return;
-
- if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
- return;
-
- /*
- * Make a copy of the data so we can revise and extend it.
- */
- bcopy(lr, zr->zr_lrbuf, reclen);
-
- /*
- * The log block containing this lr may have been byteswapped
- * so that we can easily examine common fields like lrc_txtype.
- * However, the log is a mix of different data types, and only the
- * replay vectors know how to byteswap their records. Therefore, if
- * the lr was byteswapped, undo it before invoking the replay vector.
- */
- if (zr->zr_byteswap)
- byteswap_uint64_array(zr->zr_lrbuf, reclen);
-
- /*
- * If this is a TX_WRITE with a blkptr, suck in the data.
- */
- if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
- lr_write_t *lrw = (lr_write_t *)lr;
- blkptr_t *wbp = &lrw->lr_blkptr;
- uint64_t wlen = lrw->lr_length;
- char *wbuf = zr->zr_lrbuf + reclen;
-
- if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
- bzero(wbuf, wlen);
- } else {
- /*
- * A subsequent write may have overwritten this block,
- * in which case wbp may have been been freed and
- * reallocated, and our read of wbp may fail with a
- * checksum error. We can safely ignore this because
- * the later write will provide the correct data.
- */
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lrw->lr_foid;
- zb.zb_level = -1;
- zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
-
- (void) zio_wait(zio_read(NULL, zilog->zl_spa,
- wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
- (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
- }
- }
-
- /*
- * We must now do two things atomically: replay this log record,
- * and update the log header to reflect the fact that we did so.
- * We use the DMU's ability to assign into a specific txg to do this.
- */
- for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
- uint64_t replay_txg;
- dmu_tx_t *replay_tx;
-
- replay_tx = dmu_tx_create(zr->zr_os);
- error = dmu_tx_assign(replay_tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(replay_tx);
- break;
- }
-
- replay_txg = dmu_tx_get_txg(replay_tx);
-
- if (txtype == 0 || txtype >= TX_MAX_TYPE) {
- error = EINVAL;
- } else {
- /*
- * On the first pass, arrange for the replay vector
- * to fail its dmu_tx_assign(). That's the only way
- * to ensure that those code paths remain well tested.
- */
- *zr->zr_txgp = replay_txg - (pass == 1);
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
- zr->zr_byteswap);
- *zr->zr_txgp = TXG_NOWAIT;
- }
-
- if (error == 0) {
- dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
- zilog->zl_replay_seq[replay_txg & TXG_MASK] =
- lr->lrc_seq;
- }
-
- dmu_tx_commit(replay_tx);
-
- if (!error)
- return;
-
- /*
- * The DMU's dnode layer doesn't see removes until the txg
- * commits, so a subsequent claim can spuriously fail with
- * EEXIST. So if we receive any error other than ERESTART
- * we try syncing out any removes then retrying the
- * transaction.
- */
- if (error != ERESTART && !sunk) {
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
- sunk = B_TRUE;
- continue; /* retry */
- }
-
- if (error != ERESTART)
- break;
-
- if (pass != 1)
- txg_wait_open(spa_get_dsl(zilog->zl_spa),
- replay_txg + 1);
-
- dprintf("pass %d, retrying\n", pass);
- }
-
- ASSERT(error && error != ERESTART);
- name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dmu_objset_name(zr->zr_os, name);
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu\n",
- error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
- zilog->zl_stop_replay = 1;
- kmem_free(name, MAXNAMELEN);
-}
-
-/* ARGSUSED */
-static void
-zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
- zilog->zl_replay_blks++;
-}
-
-/*
- * If this dataset has a non-empty intent log, replay it and destroy it.
- */
-void
-zil_replay(objset_t *os, void *arg, uint64_t *txgp,
- zil_replay_func_t *replay_func[TX_MAX_TYPE])
-{
- zilog_t *zilog = dmu_objset_zil(os);
- const zil_header_t *zh = zilog->zl_header;
- zil_replay_arg_t zr;
-
- if (zil_empty(zilog)) {
- zil_destroy(zilog, B_TRUE);
- return;
- }
- //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
-
- zr.zr_os = os;
- zr.zr_replay = replay_func;
- zr.zr_arg = arg;
- zr.zr_txgp = txgp;
- zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
-
- /*
- * Wait for in-progress removes to sync before starting replay.
- */
- txg_wait_synced(zilog->zl_dmu_pool, 0);
-
- zilog->zl_stop_replay = 0;
- zilog->zl_replay_time = LBOLT;
- ASSERT(zilog->zl_replay_blks == 0);
- (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
- zh->zh_claim_txg);
- kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
-
- zil_destroy(zilog, B_FALSE);
- //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
-}
-
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
-{
- lwb_t *lwb;
- int ret;
-
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
- /* recent unpushed intent log transactions? */
- if (!list_is_empty(&zilog->zl_itx_list)) {
- ret = B_FALSE;
- goto out;
- }
-
- /* intent log never used? */
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- ret = B_TRUE;
- goto out;
- }
-
- /*
- * more than 1 log buffer means zil_sync() hasn't yet freed
- * entries after a txg has committed
- */
- if (list_next(&zilog->zl_lwb_list, lwb)) {
- ret = B_FALSE;
- goto out;
- }
-
- ASSERT(zil_empty(zilog));
- ret = B_TRUE;
-out:
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
- return (ret);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c
deleted file mode 100644
index b5dd35f..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ /dev/null
@@ -1,1861 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/fm/fs/zfs.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio_impl.h>
-#include <sys/zio_compress.h>
-#include <sys/zio_checksum.h>
-
-/*
- * ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
- 0, /* ZIO_PRIORITY_NOW */
- 0, /* ZIO_PRIORITY_SYNC_READ */
- 0, /* ZIO_PRIORITY_SYNC_WRITE */
- 6, /* ZIO_PRIORITY_ASYNC_READ */
- 4, /* ZIO_PRIORITY_ASYNC_WRITE */
- 4, /* ZIO_PRIORITY_FREE */
- 0, /* ZIO_PRIORITY_CACHE_FILL */
- 0, /* ZIO_PRIORITY_LOG_WRITE */
- 10, /* ZIO_PRIORITY_RESILVER */
- 20, /* ZIO_PRIORITY_SCRUB */
-};
-
-/*
- * ==========================================================================
- * I/O type descriptions
- * ==========================================================================
- */
-char *zio_type_name[ZIO_TYPES] = {
- "null", "read", "write", "free", "claim", "ioctl" };
-
-/* At or above this size, force gang blocking - for testing */
-uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
-
-/* Force an allocation failure when non-zero */
-uint16_t zio_zil_fail_shift = 0;
-
-typedef struct zio_sync_pass {
- int zp_defer_free; /* defer frees after this pass */
- int zp_dontcompress; /* don't compress after this pass */
- int zp_rewrite; /* rewrite new bps after this pass */
-} zio_sync_pass_t;
-
-zio_sync_pass_t zio_sync_pass = {
- 1, /* zp_defer_free */
- 4, /* zp_dontcompress */
- 1, /* zp_rewrite */
-};
-
-/*
- * ==========================================================================
- * I/O kmem caches
- * ==========================================================================
- */
-kmem_cache_t *zio_cache;
-#ifdef ZIO_USE_UMA
-kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-#endif
-
-#ifdef _KERNEL
-extern vmem_t *zio_alloc_arena;
-#endif
-
-void
-zio_init(void)
-{
-#ifdef ZIO_USE_UMA
- size_t c;
-#endif
-#if 0
- vmem_t *data_alloc_arena = NULL;
-
-#ifdef _KERNEL
- data_alloc_arena = zio_alloc_arena;
-#endif
-#endif
-
- zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
- NULL, NULL, NULL, NULL, NULL, 0);
-
-#ifdef ZIO_USE_UMA
- /*
- * For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
- * for each quarter-power of 2. For large buffers, we want
- * a cache for each multiple of PAGESIZE.
- */
- for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
- size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
- size_t p2 = size;
- size_t align = 0;
-
- while (p2 & (p2 - 1))
- p2 &= p2 - 1;
-
- if (size <= 4 * SPA_MINBLOCKSIZE) {
- align = SPA_MINBLOCKSIZE;
- } else if (P2PHASE(size, PAGESIZE) == 0) {
- align = PAGESIZE;
- } else if (P2PHASE(size, p2 >> 2) == 0) {
- align = p2 >> 2;
- }
-
- if (align != 0) {
- char name[36];
- (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
- zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
-
- (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
- zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, data_alloc_arena,
- KMC_NODEBUG);
-
- dprintf("creating cache for size %5lx align %5lx\n",
- size, align);
- }
- }
-
- while (--c != 0) {
- ASSERT(zio_buf_cache[c] != NULL);
- if (zio_buf_cache[c - 1] == NULL)
- zio_buf_cache[c - 1] = zio_buf_cache[c];
-
- ASSERT(zio_data_buf_cache[c] != NULL);
- if (zio_data_buf_cache[c - 1] == NULL)
- zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
- }
-#endif
-
- zio_inject_init();
-}
-
-void
-zio_fini(void)
-{
-#ifdef ZIO_USE_UMA
- size_t c;
- kmem_cache_t *last_cache = NULL;
- kmem_cache_t *last_data_cache = NULL;
-
- for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
- if (zio_buf_cache[c] != last_cache) {
- last_cache = zio_buf_cache[c];
- kmem_cache_destroy(zio_buf_cache[c]);
- }
- zio_buf_cache[c] = NULL;
-
- if (zio_data_buf_cache[c] != last_data_cache) {
- last_data_cache = zio_data_buf_cache[c];
- kmem_cache_destroy(zio_data_buf_cache[c]);
- }
- zio_data_buf_cache[c] = NULL;
- }
-#endif
-
- kmem_cache_destroy(zio_cache);
-
- zio_inject_fini();
-}
-
-/*
- * ==========================================================================
- * Allocate and free I/O buffers
- * ==========================================================================
- */
-
-/*
- * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
- * crashdump if the kernel panics, so use it judiciously. Obviously, it's
- * useful to inspect ZFS metadata, but if possible, we should avoid keeping
- * excess / transient data in-core during a crashdump.
- */
-void *
-zio_buf_alloc(size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
-#else
- return (kmem_alloc(size, KM_SLEEP));
-#endif
-}
-
-/*
- * Use zio_data_buf_alloc to allocate data. The data will not appear in a
- * crashdump if the kernel panics. This exists so that we will limit the amount
- * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
- * of kernel heap dumped to disk when the kernel panics)
- */
-void *
-zio_data_buf_alloc(size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
-#else
- return (kmem_alloc(size, KM_SLEEP));
-#endif
-}
-
-void
-zio_buf_free(void *buf, size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- kmem_cache_free(zio_buf_cache[c], buf);
-#else
- kmem_free(buf, size);
-#endif
-}
-
-void
-zio_data_buf_free(void *buf, size_t size)
-{
-#ifdef ZIO_USE_UMA
- size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-
- kmem_cache_free(zio_data_buf_cache[c], buf);
-#else
- kmem_free(buf, size);
-#endif
-}
-
-/*
- * ==========================================================================
- * Push and pop I/O transform buffers
- * ==========================================================================
- */
-static void
-zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
-{
- zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
-
- zt->zt_data = data;
- zt->zt_size = size;
- zt->zt_bufsize = bufsize;
-
- zt->zt_next = zio->io_transform_stack;
- zio->io_transform_stack = zt;
-
- zio->io_data = data;
- zio->io_size = size;
-}
-
-static void
-zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
-{
- zio_transform_t *zt = zio->io_transform_stack;
-
- *data = zt->zt_data;
- *size = zt->zt_size;
- *bufsize = zt->zt_bufsize;
-
- zio->io_transform_stack = zt->zt_next;
- kmem_free(zt, sizeof (zio_transform_t));
-
- if ((zt = zio->io_transform_stack) != NULL) {
- zio->io_data = zt->zt_data;
- zio->io_size = zt->zt_size;
- }
-}
-
-static void
-zio_clear_transform_stack(zio_t *zio)
-{
- void *data;
- uint64_t size, bufsize;
-
- ASSERT(zio->io_transform_stack != NULL);
-
- zio_pop_transform(zio, &data, &size, &bufsize);
- while (zio->io_transform_stack != NULL) {
- zio_buf_free(data, bufsize);
- zio_pop_transform(zio, &data, &size, &bufsize);
- }
-}
-
-/*
- * ==========================================================================
- * Create the various types of I/O (read, write, free)
- * ==========================================================================
- */
-static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
-{
- zio_t *zio;
-
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
-
- zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
- bzero(zio, sizeof (zio_t));
- zio->io_parent = pio;
- zio->io_spa = spa;
- zio->io_txg = txg;
- if (bp != NULL) {
- zio->io_bp = bp;
- zio->io_bp_copy = *bp;
- zio->io_bp_orig = *bp;
- }
- zio->io_done = done;
- zio->io_private = private;
- zio->io_type = type;
- zio->io_priority = priority;
- zio->io_stage = stage;
- zio->io_pipeline = pipeline;
- zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
- zio->io_timestamp = lbolt64;
- zio->io_flags = flags;
- mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
- zio_push_transform(zio, data, size, size);
-
- /*
- * Note on config lock:
- *
- * If CONFIG_HELD is set, then the caller already has the config
- * lock, so we don't need it for this io.
- *
- * We set CONFIG_GRABBED to indicate that we have grabbed the
- * config lock on behalf of this io, so it should be released
- * in zio_done.
- *
- * Unless CONFIG_HELD is set, we will grab the config lock for
- * any top-level (parent-less) io, *except* NULL top-level ios.
- * The NULL top-level ios rarely have any children, so we delay
- * grabbing the lock until the first child is added (but it is
- * still grabbed on behalf of the top-level i/o, so additional
- * children don't need to also grab it). This greatly reduces
- * contention on the config lock.
- */
- if (pio == NULL) {
- if (type != ZIO_TYPE_NULL &&
- !(flags & ZIO_FLAG_CONFIG_HELD)) {
- spa_config_enter(zio->io_spa, RW_READER, zio);
- zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
- }
- zio->io_root = zio;
- } else {
- zio->io_root = pio->io_root;
- if (!(flags & ZIO_FLAG_NOBOOKMARK))
- zio->io_logical = pio->io_logical;
- mutex_enter(&pio->io_lock);
- if (pio->io_parent == NULL &&
- pio->io_type == ZIO_TYPE_NULL &&
- !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
- !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
- pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
- spa_config_enter(zio->io_spa, RW_READER, pio);
- }
- if (stage < ZIO_STAGE_READY)
- pio->io_children_notready++;
- pio->io_children_notdone++;
- zio->io_sibling_next = pio->io_child;
- zio->io_sibling_prev = NULL;
- if (pio->io_child != NULL)
- pio->io_child->io_sibling_prev = zio;
- pio->io_child = zio;
- zio->io_ndvas = pio->io_ndvas;
- mutex_exit(&pio->io_lock);
- }
-
- return (zio);
-}
-
-zio_t *
-zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
- int flags)
-{
- zio_t *zio;
-
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
- ZIO_WAIT_FOR_CHILDREN_PIPELINE);
-
- return (zio);
-}
-
-zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
-{
- return (zio_null(NULL, spa, done, private, flags));
-}
-
-zio_t *
-zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb)
-{
- zio_t *zio;
-
- ASSERT3U(size, ==, BP_GET_LSIZE(bp));
-
- zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
- ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
- zio->io_bookmark = *zb;
-
- zio->io_logical = zio;
-
- /*
- * Work off our copy of the bp so the caller can free it.
- */
- zio->io_bp = &zio->io_bp_copy;
-
- if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
- uint64_t csize = BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(csize);
-
- zio_push_transform(zio, cbuf, csize, csize);
- zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
- }
-
- if (BP_IS_GANG(bp)) {
- uint64_t gsize = SPA_GANGBLOCKSIZE;
- void *gbuf = zio_buf_alloc(gsize);
-
- zio_push_transform(zio, gbuf, gsize, gsize);
- zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
- }
-
- return (zio);
-}
-
-zio_t *
-zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
-{
- zio_t *zio;
-
- ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
- checksum < ZIO_CHECKSUM_FUNCTIONS);
-
- ASSERT(compress >= ZIO_COMPRESS_OFF &&
- compress < ZIO_COMPRESS_FUNCTIONS);
-
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
-
- zio->io_ready = ready;
-
- zio->io_bookmark = *zb;
-
- zio->io_logical = zio;
-
- zio->io_checksum = checksum;
- zio->io_compress = compress;
- zio->io_ndvas = ncopies;
-
- if (compress != ZIO_COMPRESS_OFF)
- zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
-
- if (bp->blk_birth != txg) {
- /* XXX the bp usually (always?) gets re-zeroed later */
- BP_ZERO(bp);
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- } else {
- /* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
- spa_max_replication(spa)) == BP_GET_NDVAS(bp));
- }
-
- return (zio);
-}
-
-zio_t *
-zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags,
- zbookmark_t *zb)
-{
- zio_t *zio;
-
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
-
- zio->io_bookmark = *zb;
- zio->io_checksum = checksum;
- zio->io_compress = ZIO_COMPRESS_OFF;
-
- if (pio != NULL)
- ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
-
- return (zio);
-}
-
-static zio_t *
-zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
- uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags)
-{
- zio_t *zio;
-
- BP_ZERO(bp);
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-
- zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags,
- ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
-
- zio->io_checksum = checksum;
- zio->io_compress = ZIO_COMPRESS_OFF;
-
- return (zio);
-}
-
-zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private)
-{
- zio_t *zio;
-
- ASSERT(!BP_IS_HOLE(bp));
-
- if (txg == spa->spa_syncing_txg &&
- spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return (zio_null(pio, spa, NULL, NULL, 0));
- }
-
- zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
- ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
- ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
-
- zio->io_bp = &zio->io_bp_copy;
-
- return (zio);
-}
-
-zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private)
-{
- zio_t *zio;
-
- /*
- * A claim is an allocation of a specific block. Claims are needed
- * to support immediate writes in the intent log. The issue is that
- * immediate writes contain committed data, but in a txg that was
- * *not* committed. Upon opening the pool after an unclean shutdown,
- * the intent log claims all blocks that contain immediate write data
- * so that the SPA knows they're in use.
- *
- * All claims *must* be resolved in the first txg -- before the SPA
- * starts allocating blocks -- so that nothing is allocated twice.
- */
- ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
- ASSERT3U(spa_first_txg(spa), <=, txg);
-
- zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
- ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
- ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
-
- zio->io_bp = &zio->io_bp_copy;
-
- return (zio);
-}
-
-zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags)
-{
- zio_t *zio;
- int c;
-
- if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
- ZIO_TYPE_IOCTL, priority, flags,
- ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-
- zio->io_vd = vd;
- zio->io_cmd = cmd;
- } else {
- zio = zio_null(pio, spa, NULL, NULL, flags);
-
- for (c = 0; c < vd->vdev_children; c++)
- zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- done, private, priority, flags));
- }
-
- return (zio);
-}
-
-static void
-zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
- int checksum)
-{
- ASSERT(vd->vdev_children == 0);
-
- ASSERT(size <= SPA_MAXBLOCKSIZE);
- ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
- ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
-
- ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
- offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
- ASSERT3U(offset + size, <=, vd->vdev_psize);
-
- BP_ZERO(bp);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
-
- BP_SET_CHECKSUM(bp, checksum);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- if (checksum != ZIO_CHECKSUM_OFF)
- ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
-}
-
-zio_t *
-zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags)
-{
- zio_t *zio;
- blkptr_t blk;
-
- zio_phys_bp_init(vd, &blk, offset, size, checksum);
-
- zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
- ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
- ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
-
- zio->io_vd = vd;
- zio->io_offset = offset;
-
- /*
- * Work off our copy of the bp so the caller can free it.
- */
- zio->io_bp = &zio->io_bp_copy;
-
- return (zio);
-}
-
-zio_t *
-zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags)
-{
- zio_block_tail_t *zbt;
- void *wbuf;
- zio_t *zio;
- blkptr_t blk;
-
- zio_phys_bp_init(vd, &blk, offset, size, checksum);
-
- zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
- ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
-
- zio->io_vd = vd;
- zio->io_offset = offset;
-
- zio->io_bp = &zio->io_bp_copy;
- zio->io_checksum = checksum;
-
- if (zio_checksum_table[checksum].ci_zbt) {
- /*
- * zbt checksums are necessarily destructive -- they modify
- * one word of the write buffer to hold the verifier/checksum.
- * Therefore, we must make a local copy in case the data is
- * being written to multiple places.
- */
- wbuf = zio_buf_alloc(size);
- bcopy(data, wbuf, size);
- zio_push_transform(zio, wbuf, size, size);
-
- zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
- zbt->zbt_cksum = blk.blk_cksum;
- }
-
- return (zio);
-}
-
-/*
- * Create a child I/O to do some work for us. It has no associated bp.
- */
-zio_t *
-zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority, int flags,
- zio_done_func_t *done, void *private)
-{
- uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
- zio_t *cio;
-
- if (type == ZIO_TYPE_READ && bp != NULL) {
- /*
- * If we have the bp, then the child should perform the
- * checksum and the parent need not. This pushes error
- * detection as close to the leaves as possible and
- * eliminates redundant checksums in the interior nodes.
- */
- pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
- }
-
- cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
- done, private, type, priority,
- (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
- ZIO_STAGE_VDEV_IO_START - 1, pipeline);
-
- cio->io_vd = vd;
- cio->io_offset = offset;
-
- return (cio);
-}
-
-/*
- * ==========================================================================
- * Initiate I/O, either sync or async
- * ==========================================================================
- */
-int
-zio_wait(zio_t *zio)
-{
- int error;
-
- ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
-
- zio->io_waiter = curthread;
-
- zio_next_stage_async(zio);
-
- mutex_enter(&zio->io_lock);
- while (zio->io_stalled != ZIO_STAGE_DONE)
- cv_wait(&zio->io_cv, &zio->io_lock);
- mutex_exit(&zio->io_lock);
-
- error = zio->io_error;
- cv_destroy(&zio->io_cv);
- mutex_destroy(&zio->io_lock);
- kmem_cache_free(zio_cache, zio);
-
- return (error);
-}
-
-void
-zio_nowait(zio_t *zio)
-{
- zio_next_stage_async(zio);
-}
-
-/*
- * ==========================================================================
- * I/O pipeline interlocks: parent/child dependency scoreboarding
- * ==========================================================================
- */
-static void
-zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
-{
- mutex_enter(&zio->io_lock);
- if (*countp == 0) {
- ASSERT(zio->io_stalled == 0);
- mutex_exit(&zio->io_lock);
- zio_next_stage(zio);
- } else {
- zio->io_stalled = stage;
- mutex_exit(&zio->io_lock);
- }
-}
-
-static void
-zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
-{
- zio_t *pio = zio->io_parent;
-
- mutex_enter(&pio->io_lock);
- if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
- pio->io_error = zio->io_error;
- if (--*countp == 0 && pio->io_stalled == stage) {
- pio->io_stalled = 0;
- mutex_exit(&pio->io_lock);
- zio_next_stage_async(pio);
- } else {
- mutex_exit(&pio->io_lock);
- }
-}
-
-static void
-zio_wait_children_ready(zio_t *zio)
-{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
- &zio->io_children_notready);
-}
-
-void
-zio_wait_children_done(zio_t *zio)
-{
- zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
- &zio->io_children_notdone);
-}
-
-static void
-zio_ready(zio_t *zio)
-{
- zio_t *pio = zio->io_parent;
-
- if (zio->io_ready)
- zio->io_ready(zio);
-
- if (pio != NULL)
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
- &pio->io_children_notready);
-
- if (zio->io_bp)
- zio->io_bp_copy = *zio->io_bp;
-
- zio_next_stage(zio);
-}
-
-static void
-zio_done(zio_t *zio)
-{
- zio_t *pio = zio->io_parent;
- spa_t *spa = zio->io_spa;
- blkptr_t *bp = zio->io_bp;
- vdev_t *vd = zio->io_vd;
-
- ASSERT(zio->io_children_notready == 0);
- ASSERT(zio->io_children_notdone == 0);
-
- if (bp != NULL) {
- ASSERT(bp->blk_pad[0] == 0);
- ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bp->blk_pad[2] == 0);
- ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
- if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
- ASSERT(!BP_SHOULD_BYTESWAP(bp));
- if (zio->io_ndvas != 0)
- ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
- ASSERT(BP_COUNT_GANG(bp) == 0 ||
- (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
- }
- }
-
- if (vd != NULL)
- vdev_stat_update(zio);
-
- if (zio->io_error) {
- /*
- * If this I/O is attached to a particular vdev,
- * generate an error message describing the I/O failure
- * at the block level. We ignore these errors if the
- * device is currently unavailable.
- */
- if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
- zfs_ereport_post(FM_EREPORT_ZFS_IO,
- zio->io_spa, vd, zio, 0, 0);
-
- if ((zio->io_error == EIO ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
- zio->io_logical == zio) {
- /*
- * For root I/O requests, tell the SPA to log the error
- * appropriately. Also, generate a logical data
- * ereport.
- */
- spa_log_error(zio->io_spa, zio);
-
- zfs_ereport_post(FM_EREPORT_ZFS_DATA,
- zio->io_spa, NULL, zio, 0, 0);
- }
-
- /*
- * For I/O requests that cannot fail, panic appropriately.
- */
- if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
- char *blkbuf;
-
- blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
- if (blkbuf) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- }
- panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
- "%d", zio->io_error == ECKSUM ?
- "bad checksum" : "I/O failure",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf ? blkbuf : "", zio->io_error);
- }
- }
- zio_clear_transform_stack(zio);
-
- if (zio->io_done)
- zio->io_done(zio);
-
- ASSERT(zio->io_delegate_list == NULL);
- ASSERT(zio->io_delegate_next == NULL);
-
- if (pio != NULL) {
- zio_t *next, *prev;
-
- mutex_enter(&pio->io_lock);
- next = zio->io_sibling_next;
- prev = zio->io_sibling_prev;
- if (next != NULL)
- next->io_sibling_prev = prev;
- if (prev != NULL)
- prev->io_sibling_next = next;
- if (pio->io_child == zio)
- pio->io_child = next;
- mutex_exit(&pio->io_lock);
-
- zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
- &pio->io_children_notdone);
- }
-
- /*
- * Note: this I/O is now done, and will shortly be freed, so there is no
- * need to clear this (or any other) flag.
- */
- if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
- spa_config_exit(spa, zio);
-
- if (zio->io_waiter != NULL) {
- mutex_enter(&zio->io_lock);
- ASSERT(zio->io_stage == ZIO_STAGE_DONE);
- zio->io_stalled = zio->io_stage;
- cv_broadcast(&zio->io_cv);
- mutex_exit(&zio->io_lock);
- } else {
- cv_destroy(&zio->io_cv);
- mutex_destroy(&zio->io_lock);
- kmem_cache_free(zio_cache, zio);
- }
-}
-
-/*
- * ==========================================================================
- * Compression support
- * ==========================================================================
- */
-static void
-zio_write_compress(zio_t *zio)
-{
- int compress = zio->io_compress;
- blkptr_t *bp = zio->io_bp;
- void *cbuf;
- uint64_t lsize = zio->io_size;
- uint64_t csize = lsize;
- uint64_t cbufsize = 0;
- int pass;
-
- if (bp->blk_birth == zio->io_txg) {
- /*
- * We're rewriting an existing block, which means we're
- * working on behalf of spa_sync(). For spa_sync() to
- * converge, it must eventually be the case that we don't
- * have to allocate new blocks. But compression changes
- * the blocksize, which forces a reallocate, and makes
- * convergence take longer. Therefore, after the first
- * few passes, stop compressing to ensure convergence.
- */
- pass = spa_sync_pass(zio->io_spa);
- if (pass > zio_sync_pass.zp_dontcompress)
- compress = ZIO_COMPRESS_OFF;
- } else {
- ASSERT(BP_IS_HOLE(bp));
- pass = 1;
- }
-
- if (compress != ZIO_COMPRESS_OFF)
- if (!zio_compress_data(compress, zio->io_data, zio->io_size,
- &cbuf, &csize, &cbufsize))
- compress = ZIO_COMPRESS_OFF;
-
- if (compress != ZIO_COMPRESS_OFF && csize != 0)
- zio_push_transform(zio, cbuf, csize, cbufsize);
-
- /*
- * The final pass of spa_sync() must be all rewrites, but the first
- * few passes offer a trade-off: allocating blocks defers convergence,
- * but newly allocated blocks are sequential, so they can be written
- * to disk faster. Therefore, we allow the first few passes of
- * spa_sync() to reallocate new blocks, but force rewrites after that.
- * There should only be a handful of blocks after pass 1 in any case.
- */
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
- pass > zio_sync_pass.zp_rewrite) {
- ASSERT(csize != 0);
- BP_SET_LSIZE(bp, lsize);
- BP_SET_COMPRESS(bp, compress);
- zio->io_pipeline = ZIO_REWRITE_PIPELINE;
- } else {
- if (bp->blk_birth == zio->io_txg)
- BP_ZERO(bp);
- if (csize == 0) {
- BP_ZERO(bp);
- zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
- } else {
- ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- BP_SET_LSIZE(bp, lsize);
- BP_SET_PSIZE(bp, csize);
- BP_SET_COMPRESS(bp, compress);
- zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
- }
- }
-
- zio_next_stage(zio);
-}
-
-static void
-zio_read_decompress(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- void *data;
- uint64_t size;
- uint64_t bufsize;
- int compress = BP_GET_COMPRESS(bp);
-
- ASSERT(compress != ZIO_COMPRESS_OFF);
-
- zio_pop_transform(zio, &data, &size, &bufsize);
-
- if (zio_decompress_data(compress, data, size,
- zio->io_data, zio->io_size))
- zio->io_error = EIO;
-
- zio_buf_free(data, bufsize);
-
- zio_next_stage(zio);
-}
-
-/*
- * ==========================================================================
- * Gang block support
- * ==========================================================================
- */
-static void
-zio_gang_pipeline(zio_t *zio)
-{
- /*
- * By default, the pipeline assumes that we're dealing with a gang
- * block. If we're not, strip out any gang-specific stages.
- */
- if (!BP_IS_GANG(zio->io_bp))
- zio->io_pipeline &= ~ZIO_GANG_STAGES;
-
- zio_next_stage(zio);
-}
-
-static void
-zio_gang_byteswap(zio_t *zio)
-{
- ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
-
- if (BP_SHOULD_BYTESWAP(zio->io_bp))
- byteswap_uint64_array(zio->io_data, zio->io_size);
-}
-
-static void
-zio_get_gang_header(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- uint64_t gsize = SPA_GANGBLOCKSIZE;
- void *gbuf = zio_buf_alloc(gsize);
-
- ASSERT(BP_IS_GANG(bp));
-
- zio_push_transform(zio, gbuf, gsize, gsize);
-
- zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
- NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
- zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE));
-
- zio_wait_children_done(zio);
-}
-
-static void
-zio_read_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize, loff, lsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- lsize = BP_GET_PSIZE(gbp);
-
- ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
- ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
- ASSERT3U(loff + lsize, <=, zio->io_size);
- ASSERT(i < SPA_GBH_NBLKPTRS);
- ASSERT(!BP_IS_HOLE(gbp));
-
- zio_nowait(zio_read(zio, zio->io_spa, gbp,
- (char *)zio->io_data + loff, lsize, NULL, NULL,
- zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
- &zio->io_bookmark));
- }
-
- zio_buf_free(gbh, gbufsize);
- zio_wait_children_done(zio);
-}
-
-static void
-zio_rewrite_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize, loff, lsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
- ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- ASSERT(gsize == gbufsize);
-
- for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- lsize = BP_GET_PSIZE(gbp);
-
- ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
- ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
- ASSERT3U(loff + lsize, <=, zio->io_size);
- ASSERT(i < SPA_GBH_NBLKPTRS);
- ASSERT(!BP_IS_HOLE(gbp));
-
- zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
- zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
- NULL, NULL, zio->io_priority, zio->io_flags,
- &zio->io_bookmark));
- }
-
- zio_push_transform(zio, gbh, gsize, gbufsize);
- zio_wait_children_ready(zio);
-}
-
-static void
-zio_free_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
-
- if (BP_IS_HOLE(gbp))
- continue;
- zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
- gbp, NULL, NULL));
- }
-
- zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
-}
-
-static void
-zio_claim_gang_members(zio_t *zio)
-{
- zio_gbh_phys_t *gbh;
- uint64_t gsize, gbufsize;
- int i;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
-
- zio_gang_byteswap(zio);
- zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
-
- for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- if (BP_IS_HOLE(gbp))
- continue;
- zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
- gbp, NULL, NULL));
- }
-
- zio_buf_free(gbh, gbufsize);
- zio_next_stage(zio);
-}
-
-static void
-zio_write_allocate_gang_member_done(zio_t *zio)
-{
- zio_t *pio = zio->io_parent;
- dva_t *cdva = zio->io_bp->blk_dva;
- dva_t *pdva = pio->io_bp->blk_dva;
- uint64_t asize;
- int d;
-
- ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
- ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
- ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
- ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
-
- mutex_enter(&pio->io_lock);
- for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
- ASSERT(DVA_GET_GANG(&pdva[d]));
- asize = DVA_GET_ASIZE(&pdva[d]);
- asize += DVA_GET_ASIZE(&cdva[d]);
- DVA_SET_ASIZE(&pdva[d], asize);
- }
- mutex_exit(&pio->io_lock);
-}
-
-static void
-zio_write_allocate_gang_members(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- dva_t *dva = bp->blk_dva;
- spa_t *spa = zio->io_spa;
- zio_gbh_phys_t *gbh;
- uint64_t txg = zio->io_txg;
- uint64_t resid = zio->io_size;
- uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
- uint64_t gsize, loff, lsize;
- uint32_t gbps_left;
- int ndvas = zio->io_ndvas;
- int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
- int error;
- int i, d;
-
- gsize = SPA_GANGBLOCKSIZE;
- gbps_left = SPA_GBH_NBLKPTRS;
-
- error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE);
- if (error == ENOSPC)
- panic("can't allocate gang block header");
- ASSERT(error == 0);
-
- for (d = 0; d < gbh_ndvas; d++)
- DVA_SET_GANG(&dva[d], 1);
-
- bp->blk_birth = txg;
-
- gbh = zio_buf_alloc(gsize);
- bzero(gbh, gsize);
-
- /* We need to test multi-level gang blocks */
- if (maxalloc >= zio_gang_bang && (LBOLT & 0x1) == 0)
- maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
-
- for (loff = 0, i = 0; loff != zio->io_size;
- loff += lsize, resid -= lsize, gbps_left--, i++) {
- blkptr_t *gbp = &gbh->zg_blkptr[i];
- dva = gbp->blk_dva;
-
- ASSERT(gbps_left != 0);
- maxalloc = MIN(maxalloc, resid);
-
- while (resid <= maxalloc * gbps_left) {
- error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
- txg, bp, B_FALSE);
- if (error == 0)
- break;
- ASSERT3U(error, ==, ENOSPC);
- if (maxalloc == SPA_MINBLOCKSIZE)
- panic("really out of space");
- maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
- }
-
- if (resid <= maxalloc * gbps_left) {
- lsize = maxalloc;
- BP_SET_LSIZE(gbp, lsize);
- BP_SET_PSIZE(gbp, lsize);
- BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
- gbp->blk_birth = txg;
- zio_nowait(zio_rewrite(zio, spa,
- zio->io_checksum, txg, gbp,
- (char *)zio->io_data + loff, lsize,
- zio_write_allocate_gang_member_done, NULL,
- zio->io_priority, zio->io_flags,
- &zio->io_bookmark));
- } else {
- lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
- ASSERT(lsize != SPA_MINBLOCKSIZE);
- zio_nowait(zio_write_allocate(zio, spa,
- zio->io_checksum, txg, gbp,
- (char *)zio->io_data + loff, lsize,
- zio_write_allocate_gang_member_done, NULL,
- zio->io_priority, zio->io_flags));
- }
- }
-
- ASSERT(resid == 0 && loff == zio->io_size);
-
- zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
-
- zio_push_transform(zio, gbh, gsize, gsize);
- /*
- * As much as we'd like this to be zio_wait_children_ready(),
- * updating our ASIZE doesn't happen until the io_done callback,
- * so we have to wait for that to finish in order for our BP
- * to be stable.
- */
- zio_wait_children_done(zio);
-}
-
-/*
- * ==========================================================================
- * Allocate and free blocks
- * ==========================================================================
- */
-static void
-zio_dva_allocate(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- int error;
-
- ASSERT(BP_IS_HOLE(bp));
- ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- ASSERT3U(zio->io_ndvas, >, 0);
- ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
-
- /* For testing, make some blocks above a certain size be gang blocks */
- if (zio->io_size >= zio_gang_bang && (LBOLT & 0x3) == 0) {
- zio_write_allocate_gang_members(zio);
- return;
- }
-
- ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
-
- error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
- zio->io_txg, NULL, B_FALSE);
-
- if (error == 0) {
- bp->blk_birth = zio->io_txg;
- } else if (error == ENOSPC) {
- if (zio->io_size == SPA_MINBLOCKSIZE)
- panic("really, truly out of space");
- zio_write_allocate_gang_members(zio);
- return;
- } else {
- zio->io_error = error;
- }
- zio_next_stage(zio);
-}
-
-static void
-zio_dva_free(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
-
- BP_ZERO(bp);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_dva_claim(zio_t *zio)
-{
- zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
-
- zio_next_stage(zio);
-}
-
-/*
- * ==========================================================================
- * Read and write to physical devices
- * ==========================================================================
- */
-
-static void
-zio_vdev_io_start(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd ? vd->vdev_top : NULL;
- blkptr_t *bp = zio->io_bp;
- uint64_t align;
-
- if (vd == NULL) {
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_start(zio);
- return;
- }
-
- align = 1ULL << tvd->vdev_ashift;
-
- if (zio->io_retries == 0 && vd == tvd)
- zio->io_flags |= ZIO_FLAG_FAILFAST;
-
- if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
- vd->vdev_children == 0) {
- zio->io_flags |= ZIO_FLAG_PHYSICAL;
- zio->io_offset += VDEV_LABEL_START_SIZE;
- }
-
- if (P2PHASE(zio->io_size, align) != 0) {
- uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio_buf_alloc(asize);
- ASSERT(vd == tvd);
- if (zio->io_type == ZIO_TYPE_WRITE) {
- bcopy(zio->io_data, abuf, zio->io_size);
- bzero(abuf + zio->io_size, asize - zio->io_size);
- }
- zio_push_transform(zio, abuf, asize, asize);
- ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
- zio->io_flags |= ZIO_FLAG_SUBBLOCK;
- }
-
- ASSERT(P2PHASE(zio->io_offset, align) == 0);
- ASSERT(P2PHASE(zio->io_size, align) == 0);
- ASSERT(bp == NULL ||
- P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
- ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
-
- vdev_io_start(zio);
-
- /* zio_next_stage_async() gets called from io completion interrupt */
-}
-
-static void
-zio_vdev_io_done(zio_t *zio)
-{
- if (zio->io_vd == NULL)
- /* The mirror_ops handle multiple DVAs in a single BP */
- vdev_mirror_ops.vdev_op_io_done(zio);
- else
- vdev_io_done(zio);
-}
-
-/* XXPOLICY */
-boolean_t
-zio_should_retry(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
-
- if (zio->io_error == 0)
- return (B_FALSE);
- if (zio->io_delegate_list != NULL)
- return (B_FALSE);
- if (vd && vd != vd->vdev_top)
- return (B_FALSE);
- if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
- return (B_FALSE);
- if (zio->io_retries > 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-static void
-zio_vdev_io_assess(zio_t *zio)
-{
- vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd ? vd->vdev_top : NULL;
-
- ASSERT(zio->io_vsd == NULL);
-
- if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
- void *abuf;
- uint64_t asize;
- ASSERT(vd == tvd);
- zio_pop_transform(zio, &abuf, &asize, &asize);
- if (zio->io_type == ZIO_TYPE_READ)
- bcopy(abuf, zio->io_data, zio->io_size);
- zio_buf_free(abuf, asize);
- zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
- }
-
- if (zio_injection_enabled && !zio->io_error)
- zio->io_error = zio_handle_fault_injection(zio, EIO);
-
- /*
- * If the I/O failed, determine whether we should attempt to retry it.
- */
- /* XXPOLICY */
- if (zio_should_retry(zio)) {
- ASSERT(tvd == vd);
-
- zio->io_retries++;
- zio->io_error = 0;
- zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
- ZIO_FLAG_CONFIG_GRABBED;
- /* XXPOLICY */
- zio->io_flags &= ~ZIO_FLAG_FAILFAST;
- zio->io_flags |= ZIO_FLAG_DONT_CACHE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
-
- dprintf("retry #%d for %s to %s offset %llx\n",
- zio->io_retries, zio_type_name[zio->io_type],
- vdev_description(vd), zio->io_offset);
-
- zio_next_stage_async(zio);
- return;
- }
-
- if (zio->io_error != 0 && zio->io_error != ECKSUM &&
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
- /*
- * Poor man's hotplug support. Even if we're done retrying this
- * I/O, try to reopen the vdev to see if it's still attached.
- * To avoid excessive thrashing, we only try it once a minute.
- * This also has the effect of detecting when missing devices
- * have come back, by polling the device once a minute.
- *
- * We need to do this asynchronously because we can't grab
- * all the necessary locks way down here.
- */
- if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
- vd->vdev_last_try = gethrtime();
- tvd->vdev_reopen_wanted = 1;
- spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
- }
- }
-
- zio_next_stage(zio);
-}
-
-void
-zio_vdev_io_reissue(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
- ASSERT(zio->io_error == 0);
-
- zio->io_stage--;
-}
-
-void
-zio_vdev_io_redone(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
-
- zio->io_stage--;
-}
-
-void
-zio_vdev_io_bypass(zio_t *zio)
-{
- ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
- ASSERT(zio->io_error == 0);
-
- zio->io_flags |= ZIO_FLAG_IO_BYPASS;
- zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
-}
-
-/*
- * ==========================================================================
- * Generate and verify checksums
- * ==========================================================================
- */
-static void
-zio_checksum_generate(zio_t *zio)
-{
- int checksum = zio->io_checksum;
- blkptr_t *bp = zio->io_bp;
-
- ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
-
- BP_SET_CHECKSUM(bp, checksum);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_gang_checksum_generate(zio_t *zio)
-{
- zio_cksum_t zc;
- zio_gbh_phys_t *gbh = zio->io_data;
-
- ASSERT(BP_IS_GANG(zio->io_bp));
- ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
-
- zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
-
- zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
-
- zio_next_stage(zio);
-}
-
-static void
-zio_checksum_verify(zio_t *zio)
-{
- if (zio->io_bp != NULL) {
- zio->io_error = zio_checksum_error(zio);
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
- zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
- zio->io_spa, zio->io_vd, zio, 0, 0);
- }
-
- zio_next_stage(zio);
-}
-
-/*
- * Called by RAID-Z to ensure we don't compute the checksum twice.
- */
-void
-zio_checksum_verified(zio_t *zio)
-{
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
-}
-
-/*
- * Set the external verifier for a gang block based on stuff in the bp
- */
-void
-zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
-{
- blkptr_t *bp = zio->io_bp;
-
- zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
- zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
- zcp->zc_word[2] = bp->blk_birth;
- zcp->zc_word[3] = 0;
-}
-
-/*
- * ==========================================================================
- * Define the pipeline
- * ==========================================================================
- */
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
-{
- panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
-
-zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
- zio_badop,
- zio_wait_children_ready,
- zio_write_compress,
- zio_checksum_generate,
- zio_gang_pipeline,
- zio_get_gang_header,
- zio_rewrite_gang_members,
- zio_free_gang_members,
- zio_claim_gang_members,
- zio_dva_allocate,
- zio_dva_free,
- zio_dva_claim,
- zio_gang_checksum_generate,
- zio_ready,
- zio_vdev_io_start,
- zio_vdev_io_done,
- zio_vdev_io_assess,
- zio_wait_children_done,
- zio_checksum_verify,
- zio_read_gang_members,
- zio_read_decompress,
- zio_done,
- zio_badop
-};
-
-/*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
- */
-void
-zio_next_stage(zio_t *zio)
-{
- uint32_t pipeline = zio->io_pipeline;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
-
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
- pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
-
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
-
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
-
- /*
- * See the comment in zio_next_stage_async() about per-CPU taskqs.
- */
- if (((1U << zio->io_stage) & zio->io_async_stages) &&
- (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
- !(zio->io_flags & ZIO_FLAG_METADATA)) {
- taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
- }
-}
-
-void
-zio_next_stage_async(zio_t *zio)
-{
- taskq_t *tq;
- uint32_t pipeline = zio->io_pipeline;
-
- ASSERT(!MUTEX_HELD(&zio->io_lock));
-
- if (zio->io_error) {
- dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
- zio, vdev_description(zio->io_vd),
- zio->io_offset, zio->io_stage, zio->io_error);
- if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
- pipeline &= ZIO_ERROR_PIPELINE_MASK;
- }
-
- while (((1U << ++zio->io_stage) & pipeline) == 0)
- continue;
-
- ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stalled == 0);
-
- /*
- * For performance, we'll probably want two sets of task queues:
- * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU
- * part is for read performance: since we have to make a pass over
- * the data to checksum it anyway, we want to do this on the same CPU
- * that issued the read, because (assuming CPU scheduling affinity)
- * that thread is probably still there. Getting this optimization
- * right avoids performance-hostile cache-to-cache transfers.
- *
- * Note that having two sets of task queues is also necessary for
- * correctness: if all of the issue threads get bogged down waiting
- * for dependent reads (e.g. metaslab freelist) to complete, then
- * there won't be any threads available to service I/O completion
- * interrupts.
- */
- if ((1U << zio->io_stage) & zio->io_async_stages) {
- if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
- tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
- else
- tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
- (void) taskq_dispatch(tq,
- (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
- } else {
- zio_pipeline[zio->io_stage](zio);
- }
-}
-
-static boolean_t
-zio_alloc_should_fail(void)
-{
- static uint16_t allocs = 0;
-
- return (P2PHASE(allocs++, 1U<<zio_zil_fail_shift) == 0);
-}
-
-/*
- * Try to allocate an intent log block. Return 0 on success, errno on failure.
- */
-int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t txg)
-{
- int error;
-
- spa_config_enter(spa, RW_READER, FTAG);
-
- if (zio_zil_fail_shift && zio_alloc_should_fail()) {
- spa_config_exit(spa, FTAG);
- return (ENOSPC);
- }
-
- /*
- * We were passed the previous log blocks dva_t in bp->blk_dva[0].
- */
- error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE);
-
- if (error == 0) {
- BP_SET_LSIZE(new_bp, size);
- BP_SET_PSIZE(new_bp, size);
- BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
- BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
- BP_SET_LEVEL(new_bp, 0);
- BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
- new_bp->blk_birth = txg;
- }
-
- spa_config_exit(spa, FTAG);
-
- return (error);
-}
-
-/*
- * Free an intent log block. We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
- */
-void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
-{
- ASSERT(!BP_IS_GANG(bp));
-
- spa_config_enter(spa, RW_READER, FTAG);
-
- metaslab_free(spa, bp, txg, B_FALSE);
-
- spa_config_exit(spa, FTAG);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
deleted file mode 100644
index f0d9a14..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-
-/*
- * Checksum vectors.
- *
- * In the SPA, everything is checksummed. We support checksum vectors
- * for three distinct reasons:
- *
- * 1. Different kinds of data need different levels of protection.
- * For SPA metadata, we always want a very strong checksum.
- * For user data, we let users make the trade-off between speed
- * and checksum strength.
- *
- * 2. Cryptographic hash and MAC algorithms are an area of active research.
- * It is likely that in future hash functions will be at least as strong
- * as current best-of-breed, and may be substantially faster as well.
- * We want the ability to take advantage of these new hashes as soon as
- * they become available.
- *
- * 3. If someone develops hardware that can compute a strong hash quickly,
- * we want the ability to take advantage of that hardware.
- *
- * Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
- *
- * When writing a block, we always checksum it with the latest-and-greatest
- * checksum function of the appropriate strength. When reading a block,
- * we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
- */
-
-/*ARGSUSED*/
-static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
-{
- ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
-}
-
-zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
-};
-
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
-{
- ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
-
- if (child == ZIO_CHECKSUM_INHERIT)
- return (parent);
-
- if (child == ZIO_CHECKSUM_ON)
- return (ZIO_CHECKSUM_ON_VALUE);
-
- return (child);
-}
-
-/*
- * Generate the checksum.
- */
-void
-zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
-{
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t zbt_cksum;
-
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
- ASSERT(ci->ci_func[0] != NULL);
-
- if (ci->ci_zbt) {
- *zcp = zbt->zbt_cksum;
- zbt->zbt_magic = ZBT_MAGIC;
- ci->ci_func[0](data, size, &zbt_cksum);
- zbt->zbt_cksum = zbt_cksum;
- } else {
- ci->ci_func[0](data, size, zcp);
- }
-}
-
-int
-zio_checksum_error(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
- zio_cksum_t zc = bp->blk_cksum;
- uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
- BP_GET_CHECKSUM(bp);
- int byteswap = BP_SHOULD_BYTESWAP(bp);
- void *data = zio->io_data;
- uint64_t size = ZIO_GET_IOSIZE(zio);
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t actual_cksum, expected_cksum;
-
- if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
- return (EINVAL);
-
- if (ci->ci_zbt) {
- if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_set_gang_verifier(zio, &zc);
-
- if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
- expected_cksum = zbt->zbt_cksum;
- byteswap_uint64_array(&expected_cksum,
- sizeof (zio_cksum_t));
- zbt->zbt_cksum = zc;
- byteswap_uint64_array(&zbt->zbt_cksum,
- sizeof (zio_cksum_t));
- ci->ci_func[1](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
- byteswap_uint64_array(&zbt->zbt_cksum,
- sizeof (zio_cksum_t));
- } else {
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = zc;
- ci->ci_func[0](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
- }
- zc = expected_cksum;
- } else {
- ASSERT(!BP_IS_GANG(bp));
- ci->ci_func[byteswap](data, size, &actual_cksum);
- }
-
- if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc))
- return (ECKSUM);
-
- if (zio_injection_enabled && !zio->io_error)
- return (zio_handle_fault_injection(zio, ECKSUM));
-
- return (0);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
deleted file mode 100644
index c563be4..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/zfs_context.h>
-#include <sys/compress.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/zio_compress.h>
-
-/*
- * Compression vectors.
- */
-
-zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
- {NULL, NULL, 0, "inherit"},
- {NULL, NULL, 0, "on"},
- {NULL, NULL, 0, "uncompressed"},
- {lzjb_compress, lzjb_decompress, 0, "lzjb"},
- {NULL, NULL, 0, "empty"},
- {gzip_compress, gzip_decompress, 1, "gzip-1"},
- {gzip_compress, gzip_decompress, 2, "gzip-2"},
- {gzip_compress, gzip_decompress, 3, "gzip-3"},
- {gzip_compress, gzip_decompress, 4, "gzip-4"},
- {gzip_compress, gzip_decompress, 5, "gzip-5"},
- {gzip_compress, gzip_decompress, 6, "gzip-6"},
- {gzip_compress, gzip_decompress, 7, "gzip-7"},
- {gzip_compress, gzip_decompress, 8, "gzip-8"},
- {gzip_compress, gzip_decompress, 9, "gzip-9"},
-};
-
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
-{
- ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
-
- if (child == ZIO_COMPRESS_INHERIT)
- return (parent);
-
- if (child == ZIO_COMPRESS_ON)
- return (ZIO_COMPRESS_ON_VALUE);
-
- return (child);
-}
-
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
- uint64_t *destsizep, uint64_t *destbufsizep)
-{
- uint64_t *word, *word_end;
- uint64_t ciosize, gapsize, destbufsize;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- char *dest;
- uint_t allzero;
-
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
- ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
-
- /*
- * If the data is all zeroes, we don't even need to allocate
- * a block for it. We indicate this by setting *destsizep = 0.
- */
- allzero = 1;
- word = src;
- word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
- while (word < word_end) {
- if (*word++ != 0) {
- allzero = 0;
- break;
- }
- }
- if (allzero) {
- *destp = NULL;
- *destsizep = 0;
- *destbufsizep = 0;
- return (1);
- }
-
- if (cpfunc == ZIO_COMPRESS_EMPTY)
- return (0);
-
- /* Compress at least 12.5% */
- destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
- if (destbufsize == 0)
- return (0);
- dest = zio_buf_alloc(destbufsize);
-
- ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
- (size_t)destbufsize, ci->ci_level);
- if (ciosize > destbufsize) {
- zio_buf_free(dest, destbufsize);
- return (0);
- }
-
- /* Cool. We compressed at least as much as we were hoping to. */
-
- /* For security, make sure we don't write random heap crap to disk */
- gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
- if (gapsize != 0) {
- bzero(dest + ciosize, gapsize);
- ciosize += gapsize;
- }
-
- ASSERT3U(ciosize, <=, destbufsize);
- ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
- *destp = dest;
- *destsizep = ciosize;
- *destbufsizep = destbufsize;
-
- return (1);
-}
-
-int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize)
-{
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
-
- return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
deleted file mode 100644
index 4cada09..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZFS fault injection
- *
- * To handle fault injection, we keep track of a series of zinject_record_t
- * structures which describe which logical block(s) should be injected with a
- * fault. These are kept in a global list. Each record corresponds to a given
- * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
- * or exported while the injection record exists.
- *
- * Device level injection is done using the 'zi_guid' field. If this is set, it
- * means that the error is destined for a particular device, not a piece of
- * data.
- *
- * This is a rather poor data structure and algorithm, but we don't expect more
- * than a few faults at any one time, so it should be sufficient for our needs.
- */
-
-#include <sys/arc.h>
-#include <sys/zio_impl.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-
-/*
- * Number of registered injection handlers.  Incremented by
- * zio_inject_fault() and decremented by zio_clear_fault(), both with
- * atomic_add_32().
- */
-uint32_t zio_injection_enabled;
-
-/*
- * One registered fault: the record supplied by the caller plus the
- * bookkeeping needed to find it again and to keep its pool around.
- */
-typedef struct inject_handler {
- int zi_id; /* id handed out from inject_next_id */
- spa_t *zi_spa; /* pool, held via spa_inject_addref() */
- zinject_record_t zi_record; /* private copy of the caller's record */
- list_node_t zi_link; /* linkage on inject_handlers */
-} inject_handler_t;
-
-/*
- * inject_handlers is the global list of handlers; new entries are appended
- * at the tail.  inject_lock is taken as reader by the lookup paths and as
- * writer when a handler is added or removed.  inject_next_id is the id the
- * next handler will receive; it is only advanced with inject_lock held as
- * writer.
- * NOTE(review): no rw_init() for inject_lock is visible in this file --
- * confirm the zero-initialized lock is valid on this platform.
- */
-static list_t inject_handlers;
-static krwlock_t inject_lock;
-static int inject_next_id = 1;
-
-/*
- * Returns true if the given record matches the I/O in progress.
- *
- * zb is the bookmark of the logical I/O, type is the object type supplied
- * by the caller, and error is the errno the caller is asking about.
- *
- * Two kinds of match are tried, in order:
- *
- * 1. MOS match: when the I/O is in objset 0 and the record names objset 0
- *    and object 0, only the object type is compared; a record type of
- *    DMU_OT_NONE matches any type.  The error is not compared here, and
- *    a type mismatch returns B_FALSE without trying the exact match.
- * 2. Exact match: objset, object and level must be equal, the block id
- *    must lie within [zi_start, zi_end], and error must equal zi_error.
- *
- * A record that matches fires unconditionally when zi_freq is 0; otherwise
- * it fires only when spa_get_random(100) < zi_freq.
- */
-static boolean_t
-zio_match_handler(zbookmark_t *zb, uint64_t type,
- zinject_record_t *record, int error)
-{
- /*
- * Check for a match against the MOS, which is based on type
- */
- if (zb->zb_objset == 0 && record->zi_objset == 0 &&
- record->zi_object == 0) {
- if (record->zi_type == DMU_OT_NONE ||
- type == record->zi_type)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
- else
- return (B_FALSE);
- }
-
- /*
- * Check for an exact match.
- */
- if (zb->zb_objset == record->zi_objset &&
- zb->zb_object == record->zi_object &&
- zb->zb_level == record->zi_level &&
- zb->zb_blkid >= record->zi_start &&
- zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
-
- return (B_FALSE);
-}
-
-/*
- * Determine if the I/O in question should return failure. Returns the errno
- * to be returned to the caller.
- *
- * error is the errno the caller proposes to inject.  It is returned as-is
- * when some data-level handler registered for this zio's pool matches the
- * logical I/O's bookmark; otherwise 0 is returned.  Only reads that belong
- * to a logical I/O are considered.  The handler list is walked with
- * inject_lock held as reader and the walk stops at the first match.
- */
-int
-zio_handle_fault_injection(zio_t *zio, int error)
-{
- int ret = 0;
- inject_handler_t *handler;
-
- /*
- * Ignore I/O not associated with any logical data.
- */
- if (zio->io_logical == NULL)
- return (0);
-
- /*
- * Currently, we only support fault injection on reads.
- */
- if (zio->io_type != ZIO_TYPE_READ)
- return (0);
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- /* Ignore errors not destined for this pool */
- if (zio->io_spa != handler->zi_spa)
- continue;
-
- /* Ignore device errors */
- if (handler->zi_record.zi_guid != 0)
- continue;
-
- /*
- * If this handler matches, return the caller's error.  The
- * block type is DMU_OT_NONE when the zio has no block pointer.
- */
- if (zio_match_handler(&zio->io_logical->io_bookmark,
- zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
- ret = error;
- break;
- }
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-/*
- * Determine whether a device-level fault should be injected for vdev vd.
- *
- * error is the errno the caller is asking about.  Handlers are walked with
- * inject_lock held as reader; only handlers whose zi_guid equals the
- * vdev's guid are considered, and the first one that yields a result ends
- * the walk:
- *
- * - handler's zi_error equals error: return error.  When error is ENXIO
- *   the vdev's vs_aux is also set to VDEV_AUX_OPEN_FAILED.
- * - handler's zi_error is ENXIO but the caller asked about a different
- *   error: return EIO.
- *
- * Returns 0 when no handler applies.
- */
-int
-zio_handle_device_injection(vdev_t *vd, int error)
-{
- inject_handler_t *handler;
- int ret = 0;
-
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
- if (vd->vdev_guid == handler->zi_record.zi_guid) {
- if (handler->zi_record.zi_error == error) {
- /*
- * For a failed open, pretend like the device
- * has gone away.
- */
- if (error == ENXIO)
- vd->vdev_stat.vs_aux =
- VDEV_AUX_OPEN_FAILED;
- ret = error;
- break;
- }
- if (handler->zi_record.zi_error == ENXIO) {
- ret = EIO;
- break;
- }
- }
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-/*
- * Create a new handler for the given record. We add it to the list, adding
- * a reference to the spa_t in the process. We increment zio_injection_enabled,
- * which is the switch to trigger all fault injection.
- *
- * name is the pool name, flags is a mask of ZINJECT_* bits, id receives
- * the identifier of the new handler and record is copied into it.
- *
- * Returns 0 on success, the error from spa_reset() when ZINJECT_UNLOAD_SPA
- * was requested and the reset failed, or ENOENT when spa_inject_addref()
- * cannot find the pool.
- *
- * With ZINJECT_NULL set no handler is created and *id is left untouched;
- * only the optional pool reset and ARC flush are performed.
- */
-int
-zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
-{
- inject_handler_t *handler;
- int error;
- spa_t *spa;
-
- /*
- * If this is pool-wide metadata, make sure we unload the corresponding
- * spa_t, so that the next attempt to load it will trigger the fault.
- * We call spa_reset() to unload the pool appropriately.
- */
- if (flags & ZINJECT_UNLOAD_SPA)
- if ((error = spa_reset(name)) != 0)
- return (error);
-
- if (!(flags & ZINJECT_NULL)) {
- /*
- * spa_inject_addref() will add an injection reference, which
- * will prevent the pool from being removed from the namespace
- * while still allowing it to be unloaded.
- */
- if ((spa = spa_inject_addref(name)) == NULL)
- return (ENOENT);
-
- handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
-
- rw_enter(&inject_lock, RW_WRITER);
-
- /* Ids only grow and entries go on the tail: list stays id-ordered. */
- *id = handler->zi_id = inject_next_id++;
- handler->zi_spa = spa;
- handler->zi_record = *record;
- list_insert_tail(&inject_handlers, handler);
- atomic_add_32(&zio_injection_enabled, 1);
-
- rw_exit(&inject_lock);
- }
-
- /*
- * Flush the ARC, so that any attempts to read this data will end up
- * going to the ZIO layer. Note that this is a little overkill, but
- * we don't have the necessary ARC interfaces to do anything else, and
- * fault injection isn't a performance critical path.
- */
- if (flags & ZINJECT_FLUSH_ARC)
- arc_flush();
-
- return (0);
-}
-
-/*
- * Returns the next record with an ID greater than that supplied to the
- * function. Used to iterate over all handlers in the system.
- *
- * On success (return 0) *id is updated to the id of the handler found,
- * *record receives a copy of its record and name receives the pool name,
- * copied with a limit of buflen bytes.  ENOENT is returned when no handler
- * has a larger id.  The search takes the first list entry whose id exceeds
- * *id.
- *
- * spa_namespace_lock is taken before inject_lock (reader) and both are
- * held for the whole lookup.
- *
- * NOTE(review): strncpy() leaves name without a terminating NUL when the
- * pool name is buflen bytes or longer -- confirm callers size the buffer
- * so that this cannot happen.
- */
-int
-zio_inject_list_next(int *id, char *name, size_t buflen,
- zinject_record_t *record)
-{
- inject_handler_t *handler;
- int ret;
-
- mutex_enter(&spa_namespace_lock);
- rw_enter(&inject_lock, RW_READER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler))
- if (handler->zi_id > *id)
- break;
-
- if (handler) {
- *record = handler->zi_record;
- *id = handler->zi_id;
- (void) strncpy(name, spa_name(handler->zi_spa), buflen);
- ret = 0;
- } else {
- ret = ENOENT;
- }
-
- rw_exit(&inject_lock);
- mutex_exit(&spa_namespace_lock);
-
- return (ret);
-}
-
-/*
- * Clear the fault handler with the given identifier, or return ENOENT if none
- * exists.
- *
- * With inject_lock held as writer the handler is unlinked from the list,
- * its pool reference is dropped with spa_inject_delref(), its memory is
- * freed and zio_injection_enabled is decremented.  Returns 0 on success.
- */
-int
-zio_clear_fault(int id)
-{
- inject_handler_t *handler;
- int ret;
-
- rw_enter(&inject_lock, RW_WRITER);
-
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler))
- if (handler->zi_id == id)
- break;
-
- if (handler == NULL) {
- ret = ENOENT;
- } else {
- list_remove(&inject_handlers, handler);
- spa_inject_delref(handler->zi_spa);
- kmem_free(handler, sizeof (inject_handler_t));
- atomic_add_32(&zio_injection_enabled, -1);
- ret = 0;
- }
-
- rw_exit(&inject_lock);
-
- return (ret);
-}
-
-/*
- * Set up the global handler list, linking entries through zi_link.
- * NOTE(review): inject_lock is not initialized here -- confirm that is
- * intentional.
- */
-void
-zio_inject_init(void)
-{
- list_create(&inject_handlers, sizeof (inject_handler_t),
- offsetof(inject_handler_t, zi_link));
-}
-
-/*
- * Tear down the global handler list.
- * NOTE(review): handlers still on the list are not freed here; presumably
- * every fault has been cleared with zio_clear_fault() by now -- confirm.
- */
-void
-zio_inject_fini(void)
-{
- list_destroy(&inject_handlers);
-}
diff --git a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c
deleted file mode 100644
index fedae03..0000000
--- a/sys/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ /dev/null
@@ -1,801 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ZFS volume emulation driver.
- *
- * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
- * Volumes are accessed through the symbolic links named:
- *
- * /dev/zvol/dsk/<pool_name>/<dataset_name>
- * /dev/zvol/rdsk/<pool_name>/<dataset_name>
- *
- * These links are created by the ZFS-specific devfsadm link generator.
- * Volumes are persistent through reboot. No user command needs to be
- * run before opening and using a device.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/bio.h>
-#include <sys/buf.h>
-#include <sys/kmem.h>
-#include <sys/conf.h>
-#include <sys/cmn_err.h>
-#include <sys/stat.h>
-#include <sys/zap.h>
-#include <sys/spa.h>
-#include <sys/zio.h>
-#include <sys/dsl_prop.h>
-#include <sys/dkio.h>
-#include <sys/byteorder.h>
-#include <sys/sunddi.h>
-#include <sys/dirent.h>
-#include <sys/policy.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zil.h>
-#include <sys/refcount.h>
-#include <sys/zfs_znode.h>
-#include <sys/zfs_rlock.h>
-#include <geom/geom.h>
-
-#include "zfs_namecheck.h"
-
-/*
- * GEOM class under which every zvol geom is created (see
- * zvol_create_minor()) and which zvol_minor_lookup() walks.
- */
-struct g_class zfs_zvol_class = {
- .name = "ZFS::ZVOL",
- .version = G_VERSION,
-};
-
-DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
-
-/*
- * Fixed object numbers inside a zvol objset: ZVOL_OBJ is the object that
- * holds the volume data and ZVOL_ZAP_OBJ is the ZAP object that stores the
- * "size" entry.  Both are claimed in zvol_create_cb().
- */
-#define ZVOL_OBJ 1ULL
-#define ZVOL_ZAP_OBJ 2ULL
-
-/*
- * Number of volumes that currently have a provider; bumped in
- * zvol_create_minor(), dropped in zvol_remove_minor() and reported by
- * zvol_busy().
- */
-static uint32_t zvol_minors;
-
-/*
- * The in-core state of each volume.
- *
- * One instance is allocated per volume in zvol_create_minor(), hung off
- * the GEOM provider's private pointer, and freed in zvol_remove_minor().
- */
-typedef struct zvol_state {
- char zv_name[MAXPATHLEN]; /* pool/dd name */
- uint64_t zv_volsize; /* amount of space we advertise */
- uint64_t zv_volblocksize; /* volume block size */
- struct g_provider *zv_provider; /* GEOM provider */
- uint8_t zv_min_bs; /* minimum addressable block shift */
- uint8_t zv_readonly; /* hard readonly; like write-protect */
- objset_t *zv_objset; /* objset handle */
- uint32_t zv_mode; /* DS_MODE_* flags at open time */
- uint32_t zv_total_opens; /* total open count */
- zilog_t *zv_zilog; /* ZIL handle */
- uint64_t zv_txg_assign; /* txg to assign during ZIL replay */
- znode_t zv_znode; /* for range locking */
- int zv_state; /* worker: 0 running, 1 asked to exit, 2 exited */
- struct bio_queue_head zv_queue; /* bios queued by zvol_start() */
- struct mtx zv_queue_mtx; /* zv_queue mutex */
-} zvol_state_t;
-
-/*
- * zvol maximum transfer in one DMU tx.
- *
- * zvol_serve_one() splits every request into pieces no larger than this.
- */
-int zvol_maxphys = DMU_MAX_ACCESS/2;
-
-/* Defined below; declared early because zvol_create_minor() passes it to zil_open(). */
-static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
-
-/*
- * Validate a proposed volume size against the volume block size.
- *
- * Returns EINVAL when volsize is zero or not a multiple of blocksize,
- * EOVERFLOW (only when built with _ILP32) when volsize - 1 exceeds
- * SPEC_MAXOFFSET_T, and 0 otherwise.
- *
- * NOTE(review): blocksize is used as a divisor without a zero check;
- * presumably callers always pass a validated block size -- confirm.
- */
-int
-zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
-{
- if (volsize == 0)
- return (EINVAL);
-
- if (volsize % blocksize != 0)
- return (EINVAL);
-
-#ifdef _ILP32
- if (volsize - 1 > SPEC_MAXOFFSET_T)
- return (EOVERFLOW);
-#endif
- return (0);
-}
-
-/*
- * Validate a proposed volume block size.  Returns EDOM unless the value
- * satisfies ISP2() and lies within [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE];
- * returns 0 when it is acceptable.
- */
-int
-zvol_check_volblocksize(uint64_t volblocksize)
-{
- if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
- !ISP2(volblocksize))
- return (EDOM);
-
- return (0);
-}
-
-/*
- * Property callback registered for "readonly" in zvol_create_minor().
- * arg is the zvol_state_t; newval is stored, truncated to 8 bits, in
- * zv_readonly.
- */
-static void
-zvol_readonly_changed_cb(void *arg, uint64_t newval)
-{
- zvol_state_t *zv = arg;
-
- zv->zv_readonly = (uint8_t)newval;
-}
-
-/*
- * Add the volume-specific properties of objset os to nvlist nv.
- *
- * The "size" entry of ZVOL_ZAP_OBJ is added as ZFS_PROP_VOLSIZE and the
- * data block size of ZVOL_OBJ as ZFS_PROP_VOLBLOCKSIZE.  Returns 0 on
- * success or the error from zap_lookup() / dmu_object_info().  When the
- * second lookup fails the volsize has already been added to nv.
- */
-int
-zvol_get_stats(objset_t *os, nvlist_t *nv)
-{
- int error;
- dmu_object_info_t doi;
- uint64_t val;
-
-
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
- if (error)
- return (error);
-
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
-
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
-
- if (error == 0) {
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
- doi.doi_data_block_size);
- }
-
- return (error);
-}
-
-/*
- * Find the in-core state of the volume called name.
- *
- * Walks every provider of every geom in zfs_zvol_class and returns the
- * private pointer of the first provider whose name matches, or NULL when
- * there is none.  The GEOM topology lock must be held (asserted).
- *
- * Providers are named ZVOL_DEV_DIR "/" name, so the comparison skips
- * sizeof(ZVOL_DEV_DIR) characters: the directory string plus the slash
- * (sizeof counts the string's terminating NUL, which stands in for it).
- */
-static zvol_state_t *
-zvol_minor_lookup(const char *name)
-{
- struct g_provider *pp;
- struct g_geom *gp;
-
- g_topology_assert();
-
- LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
- LIST_FOREACH(pp, &gp->provider, provider) {
- if (strcmp(pp->name + sizeof(ZVOL_DEV_DIR), name) == 0)
- return (pp->private);
- }
- }
-
- return (NULL);
-}
-
-/*
- * GEOM access method for zvol providers (installed as gp->access in
- * zvol_create_minor()).  acr, acw and ace are the requested changes to
- * the read, write and exclusive counts.
- *
- * When the provider no longer has a zvol attached, requests that do not
- * raise any count succeed and anything else fails with pp->error.
- * Raising the write count on a read-only volume fails with EROFS.
- * Otherwise the three deltas are added to zv_total_opens and 0 is
- * returned.  The topology lock must be held (asserted).
- */
-static int
-zvol_access(struct g_provider *pp, int acr, int acw, int ace)
-{
- zvol_state_t *zv;
-
- g_topology_assert();
-
- zv = pp->private;
- if (zv == NULL) {
- if (acr <= 0 && acw <= 0 && ace <= 0)
- return (0);
- return (pp->error);
- }
-
- ASSERT(zv->zv_objset != NULL);
-
- if (acw > 0 && (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)))
- return (EROFS);
-
- zv->zv_total_opens += acr + acw + ace;
-
- return (0);
-}
-
-/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
- *
- * zvol_immediate_write_sz is the threshold for "small enough".
- *
- * The range [off, off + len) is logged as a series of records, none of
- * which crosses a volume block boundary.  Each record is assigned to the
- * volume's ZIL under transaction tx; no data is copied here.
- */
-ssize_t zvol_immediate_write_sz = 32768;
-
-static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
-{
- uint32_t blocksize = zv->zv_volblocksize;
- lr_write_t *lr;
-
- while (len) {
- /* Stop at the end of the block that contains off. */
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
-
- /*
- * The write state is chosen from the length still to be logged,
- * not from the size of this record.
- */
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
- lr = (lr_write_t *)&itx->itx_lr;
- lr->lr_foid = ZVOL_OBJ;
- lr->lr_offset = off;
- lr->lr_length = nbytes;
- lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
- BP_ZERO(&lr->lr_blkptr);
-
- (void) zil_itx_assign(zv->zv_zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
- }
-}
-
-/*
- * GEOM start method for zvol providers (installed as gp->start in
- * zvol_create_minor()).
- *
- * Reads, writes and flushes are appended to the volume's bio queue and
- * the worker is woken; the request is completed later by zvol_worker().
- * Every other command is completed immediately with EOPNOTSUPP.
- */
-static void
-zvol_start(struct bio *bp)
-{
- zvol_state_t *zv;
-
- switch (bp->bio_cmd) {
- case BIO_READ:
- case BIO_WRITE:
- case BIO_FLUSH:
- zv = bp->bio_to->private;
- ASSERT(zv != NULL);
- mtx_lock(&zv->zv_queue_mtx);
- bioq_insert_tail(&zv->zv_queue, bp);
- wakeup_one(&zv->zv_queue);
- mtx_unlock(&zv->zv_queue_mtx);
- break;
- case BIO_DELETE:
- case BIO_GETATTR:
- default:
- g_io_deliver(bp, EOPNOTSUPP);
- break;
- }
-}
-
-/*
- * Carry out one read or write request against the volume's data object.
- *
- * The transfer is done in pieces of at most zvol_maxphys bytes and is
- * clipped at the end of the volume.  Each written piece gets its own DMU
- * transaction and is logged with zvol_log_write() before the transaction
- * commits.  The loop stops at the first error.
- *
- * On return bio_completed holds the number of bytes transferred.
- * bio_error is assigned only when the request was not fully completed:
- * EINVAL when the offset lies beyond the volume size, otherwise the error
- * that stopped the loop (which is 0 when the request was merely clipped).
- * The bio is not delivered here; the caller does that.
- */
-static void
-zvol_serve_one(zvol_state_t *zv, struct bio *bp)
-{
- uint64_t off, volsize;
- size_t size, resid;
- char *addr;
- objset_t *os;
- rl_t *rl;
- int error = 0;
- boolean_t reading;
-
- off = bp->bio_offset;
- volsize = zv->zv_volsize;
-
- os = zv->zv_objset;
- ASSERT(os != NULL);
-
- addr = bp->bio_data;
- resid = bp->bio_length;
-
- error = 0;
-
- /*
- * There must be no buffer changes when doing a dmu_sync() because
- * we can't change the data whilst calculating the checksum.
- * The range of this request is therefore locked for its whole
- * duration: as reader for reads, as writer for writes.
- */
- reading = (bp->bio_cmd == BIO_READ);
- rl = zfs_range_lock(&zv->zv_znode, off, resid,
- reading ? RL_READER : RL_WRITER);
-
- while (resid != 0 && off < volsize) {
-
- size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
-
- if (size > volsize - off) /* don't write past the end */
- size = volsize - off;
-
- if (reading) {
- error = dmu_read(os, ZVOL_OBJ, off, size, addr);
- } else {
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
- dmu_tx_commit(tx);
- }
- }
- if (error)
- break;
- off += size;
- addr += size;
- resid -= size;
- }
- zfs_range_unlock(rl);
-
- bp->bio_completed = bp->bio_length - resid;
- if (bp->bio_completed < bp->bio_length)
- bp->bio_error = (off > volsize ? EINVAL : error);
-}
-
-/*
- * Per-volume worker process, started by zvol_create_minor() with the
- * zvol_state_t as its argument.
- *
- * It takes requests off zv_queue one at a time.  Reads and writes are
- * passed to zvol_serve_one(); a flush needs no work of its own.  For every
- * request other than a read the ZIL is committed (unless zil_disable is
- * set) before the bio is delivered with its bio_error.
- *
- * When the queue is empty the worker either sleeps on zv_queue or, if
- * zvol_remove_minor() has set zv_state to 1, acknowledges by setting it
- * to 2, wakes the waiter and exits.
- */
-static void
-zvol_worker(void *arg)
-{
- zvol_state_t *zv;
- struct bio *bp;
-
- zv = arg;
- for (;;) {
- mtx_lock(&zv->zv_queue_mtx);
- bp = bioq_takefirst(&zv->zv_queue);
- if (bp == NULL) {
- if (zv->zv_state == 1) {
- zv->zv_state = 2;
- wakeup(&zv->zv_state);
- mtx_unlock(&zv->zv_queue_mtx);
- kproc_exit(0);
- }
- /*
- * PDROP is passed, so the loop takes the mutex again at
- * the top instead of relying on msleep() to reacquire it.
- */
- msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
- "zvol:io", 0);
- continue;
- }
- mtx_unlock(&zv->zv_queue_mtx);
- switch (bp->bio_cmd) {
- case BIO_FLUSH:
- break;
- case BIO_READ:
- case BIO_WRITE:
- zvol_serve_one(zv, bp);
- break;
- }
-
- if (bp->bio_cmd != BIO_READ && !zil_disable)
- zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
-
- g_io_deliver(bp, bp->bio_error);
- }
-}
-
-/*
- * Objset creation callback for a new volume.  arg is the
- * zfs_create_data_t describing the request and tx the creating
- * transaction.
- *
- * The volume size must be present in zc_props (enforced with VERIFY); the
- * block size falls back to the property default when it is absent.  The
- * data object and the property ZAP are claimed at their fixed object
- * numbers and the size is recorded in the ZAP under "size".
- *
- * NOTE(review): failures of the three object/ZAP calls are caught only by
- * ASSERT, so they go unnoticed in builds without assertions -- confirm
- * that is acceptable.
- */
-void
-zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
-{
- zfs_create_data_t *zc = arg;
- int error;
- uint64_t volblocksize, volsize;
-
- VERIFY(nvlist_lookup_uint64(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
- if (nvlist_lookup_uint64(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
- volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
-
- /*
- * These properties must be removed from the list so the generic
- * property setting step won't apply to them.
- */
- VERIFY(nvlist_remove_all(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
- (void) nvlist_remove_all(zc->zc_props,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
-
- error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
- DMU_OT_NONE, 0, tx);
- ASSERT(error == 0);
-
- error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
- ASSERT(error == 0);
-}
-
-/*
- * Replay a TX_WRITE ZIL transaction that didn't get committed
- * after a system failure
- *
- * The payload stored directly behind the log record is written to
- * ZVOL_OBJ at the record's offset, in a transaction assigned to
- * zv_txg_assign.  Returns 0 on success or the error from dmu_tx_assign().
- *
- * NOTE(review): off and len are read from lr in the initializers, before
- * the byteswap below is applied, so for a byteswapped record they would
- * hold unswapped values -- looks like a bug; confirm.
- * NOTE(review): byteswap_uint64_array() is given sizeof (*lr), a size in
- * bytes -- confirm that is the unit the helper expects.
- */
-static int
-zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
-{
- objset_t *os = zv->zv_objset;
- char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
- dmu_tx_t *tx;
- int error;
-
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
- error = dmu_tx_assign(tx, zv->zv_txg_assign);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
- dmu_tx_commit(tx);
- }
-
- return (error);
-}
-
-/*
- * Replay handler for every record type a zvol does not support; always
- * returns ENOTSUP and ignores its arguments.
- */
-/* ARGSUSED */
-static int
-zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
-{
- return (ENOTSUP);
-}
-
-/*
- * Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
- *
- * The table is indexed by transaction type; every slot other than
- * TX_WRITE points at zvol_replay_err().  It is handed to zil_replay() in
- * zvol_create_minor().
- */
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
- zvol_replay_err, /* 0 no such transaction type */
- zvol_replay_err, /* TX_CREATE */
- zvol_replay_err, /* TX_MKDIR */
- zvol_replay_err, /* TX_MKXATTR */
- zvol_replay_err, /* TX_SYMLINK */
- zvol_replay_err, /* TX_REMOVE */
- zvol_replay_err, /* TX_RMDIR */
- zvol_replay_err, /* TX_LINK */
- zvol_replay_err, /* TX_RENAME */
- zvol_replay_write, /* TX_WRITE */
- zvol_replay_err, /* TX_TRUNCATE */
- zvol_replay_err, /* TX_SETATTR */
- zvol_replay_err, /* TX_ACL */
-};
-
-/*
- * Create a minor node for the specified volume.
- *
- * name is the dataset name; dev is not used by this implementation.
- *
- * Opens the objset (read-only when name contains '@'), reads the volume
- * size from the ZAP, creates a geom and a provider named
- * ZVOL_DEV_DIR "/" name, allocates and fills the zvol_state_t, replays
- * the intent log, registers the "readonly" property callback and starts
- * the worker process.
- *
- * Returns 0 on success, EEXIST when the volume already has a provider, or
- * the error from dmu_objset_open() / zap_lookup().  Giant is dropped and
- * the GEOM topology lock held for the duration, except around the
- * zap_lookup() call where the two are swapped back.
- *
- * NOTE(review): name is copied into zv_name[MAXPATHLEN] with strcpy(); the
- * length is presumably validated by the caller -- confirm.
- * NOTE(review): the result of kproc_create() is ignored, and a failure of
- * dmu_object_info() is caught only by ASSERT.
- */
-int
-zvol_create_minor(const char *name, dev_t dev)
-{
- struct g_provider *pp;
- struct g_geom *gp;
- zvol_state_t *zv;
- objset_t *os;
- dmu_object_info_t doi;
- uint64_t volsize;
- int ds_mode = DS_MODE_PRIMARY;
- int error;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) != NULL) {
- error = EEXIST;
- goto end;
- }
-
- if (strchr(name, '@') != 0)
- ds_mode |= DS_MODE_READONLY;
-
- error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
- if (error)
- goto end;
-
- g_topology_unlock();
- PICKUP_GIANT();
- error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
- DROP_GIANT();
- g_topology_lock();
- if (error) {
- dmu_objset_close(os);
- goto end;
- }
-
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_start;
- gp->access = zvol_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DEV_DIR, name);
- pp->mediasize = volsize;
- pp->sectorsize = DEV_BSIZE;
-
- zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
- (void) strcpy(zv->zv_name, name);
- zv->zv_min_bs = DEV_BSHIFT;
- zv->zv_provider = pp;
- zv->zv_volsize = pp->mediasize;
- zv->zv_objset = os;
- zv->zv_mode = ds_mode;
- zv->zv_zilog = zil_open(os, zvol_get_data);
- mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
-
-
- /* get and cache the blocksize */
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- ASSERT(error == 0);
- zv->zv_volblocksize = doi.doi_data_block_size;
-
- zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
-
- /* XXX this should handle the possible i/o error */
- VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
-
- pp->private = zv;
- g_error_provider(pp, 0);
-
- /* The worker starts with zv_state 0 and an empty queue. */
- bioq_init(&zv->zv_queue);
- mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
- zv->zv_state = 0;
- kproc_create(zvol_worker, zv, NULL, 0, 0, "zvol:worker %s", pp->name);
-
- zvol_minors++;
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-/*
- * Remove minor node for the specified volume.
- *
- * Returns ENXIO when the volume has no provider and EBUSY while
- * zv_total_opens is non-zero.  Otherwise the "readonly" callback is
- * unregistered, the worker is asked to exit and waited for, the provider
- * is detached and its geom withered with ENXIO, and the ZIL, the objset,
- * the range-lock state and the zvol_state_t itself are released.
- * Returns 0 on success.
- */
-int
-zvol_remove_minor(const char *name)
-{
- struct g_provider *pp;
- zvol_state_t *zv;
- int error = 0;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
- }
-
- if (zv->zv_total_opens != 0) {
- error = EBUSY;
- goto end;
- }
-
- VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
- "readonly", zvol_readonly_changed_cb, zv) == 0);
-
- /*
- * Handshake with zvol_worker(): request exit (1), then sleep until
- * the worker confirms it is gone (2).
- */
- mtx_lock(&zv->zv_queue_mtx);
- zv->zv_state = 1;
- wakeup_one(&zv->zv_queue);
- while (zv->zv_state != 2)
- msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
- mtx_unlock(&zv->zv_queue_mtx);
- mtx_destroy(&zv->zv_queue_mtx);
-
- pp = zv->zv_provider;
- pp->private = NULL;
- g_wither_geom(pp->geom, ENXIO);
-
- zil_close(zv->zv_zilog);
- zv->zv_zilog = NULL;
- dmu_objset_close(zv->zv_objset);
- zv->zv_objset = NULL;
- avl_destroy(&zv->zv_znode.z_range_avl);
- mutex_destroy(&zv->zv_znode.z_range_lock);
-
- kmem_free(zv, sizeof(*zv));
-
- zvol_minors--;
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-/*
- * Change the size of the volume called name to volsize bytes.  dev is not
- * used by this implementation.
- *
- * Returns ENXIO when the volume has no provider, the error from
- * dmu_object_info() or zvol_check_volsize() when the new size is not
- * acceptable, EROFS for a read-only volume, or the error from the
- * transaction, ZAP update or free.  On success the "size" ZAP entry is
- * rewritten, everything from volsize to the end of the data object is
- * freed, and the cached size and the provider's mediasize are updated.
- */
-int
-zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
-{
- zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
- dmu_object_info_t doi;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
- }
-
- if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
- (error = zvol_check_volsize(volsize,
- doi.doi_data_block_size)) != 0) {
- goto end;
- }
-
- if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
- error = EROFS;
- goto end;
- }
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
- dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- goto end;
- }
-
- error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
- &volsize, tx);
- if (error == 0) {
- error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
- DMU_OBJECT_END, tx);
- }
-
- /* The transaction is committed whether or not the updates worked. */
- dmu_tx_commit(tx);
-
- if (error == 0) {
- zv->zv_volsize = volsize;
- zv->zv_provider->mediasize = volsize; /* XXX: Not supported. */
- }
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-/*
- * Change the block size of the volume called name.
- *
- * Returns ENXIO when the volume has no provider, EROFS for a read-only
- * volume, the error from dmu_tx_assign(), or the result of
- * dmu_object_set_blocksize() with ENOTSUP reported as EBUSY.  The
- * provider's sector size is not touched (the code that would do so is
- * compiled out).  volblocksize is not validated here; presumably callers
- * use zvol_check_volblocksize() first -- confirm.
- */
-int
-zvol_set_volblocksize(const char *name, uint64_t volblocksize)
-{
- zvol_state_t *zv;
- dmu_tx_t *tx;
- int error;
-
- DROP_GIANT();
- g_topology_lock();
-
- if ((zv = zvol_minor_lookup(name)) == NULL) {
- error = ENXIO;
- goto end;
- }
-
- if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
- error = EROFS;
- goto end;
- }
-
- tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_bonus(tx, ZVOL_OBJ);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
- volblocksize, 0, tx);
- if (error == ENOTSUP)
- error = EBUSY;
- dmu_tx_commit(tx);
- /* XXX: Not supported. */
-#if 0
- if (error == 0)
- zv->zv_provider->sectorsize = zc->zc_volblocksize;
-#endif
- }
-end:
- g_topology_unlock();
- PICKUP_GIANT();
-
- return (error);
-}
-
-/*
- * Completion callback passed to dmu_sync() by zvol_get_data().  vzgd is
- * the zgd_t set up there; it also serves as the hold tag for db.
- *
- * Releases the buffer hold and the range lock, records the vdev of the
- * block pointer with the ZIL and frees the zgd_t.
- */
-void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
-{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
-
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- *
- * arg is the zvol_state_t (this function is passed to zil_open() in
- * zvol_create_minor()), lr the write record being filled in, buf the
- * destination for an immediate write or NULL for an indirect one, and zio
- * the parent I/O handed on to dmu_sync().
- *
- * Returns the result of dmu_read() for an immediate write.  For an
- * indirect write it returns 0 when dmu_sync() finished or is still in
- * progress (EINPROGRESS), otherwise the dmu_sync() error.
- */
-static int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
-{
- zvol_state_t *zv = arg;
- objset_t *os = zv->zv_objset;
- dmu_buf_t *db;
- rl_t *rl;
- zgd_t *zgd;
- uint64_t boff; /* block starting offset */
- int dlen = lr->lr_length; /* length of user data */
- int error;
-
- ASSERT(zio);
- ASSERT(dlen != 0);
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
-
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_bp = &lr->lr_blkptr;
-
- /*
- * Lock the range of the block to ensure that when the data is
- * written out and its checksum is being calculated that no other
- * thread can change the block.
- */
- boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
- rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
- RL_READER);
- zgd->zgd_rl = rl;
-
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0)
- zil_add_vdev(zv->zv_zilog,
- DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zvol_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- /* Finished or failed synchronously: clean up here, not in the callback. */
- dmu_buf_rele(db, zgd);
- zfs_range_unlock(rl);
- kmem_free(zgd, sizeof (zgd_t));
- return (error);
-}
-
-/*
- * Returns non-zero while at least one volume still has a minor node
- * (zvol_minors is non-zero).
- */
-int
-zvol_busy(void)
-{
- return (zvol_minors != 0);
-}
-
-/*
- * Module initialization hook for the zvol layer; it only logs a message.
- */
-void
-zvol_init(void)
-{
- ZFS_LOG(1, "ZVOL Initialized.");
-}
-
-/*
- * Module teardown hook for the zvol layer; it only logs a message.
- */
-void
-zvol_fini(void)
-{
- ZFS_LOG(1, "ZVOL Deinitialized.");
-}
diff --git a/sys/contrib/opensolaris/uts/common/os/callb.c b/sys/contrib/opensolaris/uts/common/os/callb.c
deleted file mode 100644
index c6e357e..0000000
--- a/sys/contrib/opensolaris/uts/common/os/callb.c
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/mutex.h>
-#include <sys/condvar.h>
-#include <sys/callb.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/kobj.h>
-#include <sys/systm.h> /* for delay() */
-#include <sys/taskq.h> /* For TASKQ_NAMELEN */
-#include <sys/kernel.h>
-
-#define CB_MAXNAME TASKQ_NAMELEN
-
-/*
- * The callb mechanism provides generic event scheduling/echoing.
- * A callb function is registered and called on behalf of the event.
- *
- * Each registration is one callb_t.  An entry is chained through c_next
- * either on the list of its class or on the table's freelist.
- */
-typedef struct callb {
-	struct callb	*c_next;	/* next in class or on freelist */
-	kthread_id_t	c_thread;	/* ptr to caller's thread struct */
-	char		c_flag;		/* CALLB_* state bits */
-	uchar_t		c_class;	/* this callb's class */
-	kcondvar_t	c_done_cv;	/* signal callb completion */
-	/*
-	 * cb function: returns true if ok.  Declared with a full prototype
-	 * (it is stored from a boolean_t (*)(void *, int) and invoked as
-	 * c_func(c_arg, code)) so the compiler checks both the assignment
-	 * and the call instead of accepting any argument list.
-	 */
-	boolean_t	(*c_func)(void *, int);
-	void		*c_arg;		/* arg to c_func */
-	char		c_name[CB_MAXNAME+1]; /* debug:max func name length */
-} callb_t;
-
-/*
- * callb c_flag bitmap definitions
- *
- * CALLB_FREE is the all-clear value assigned by callb_delete().
- * CALLB_TAKEN is OR-ed in by callb_add_common(), and CALLB_EXECUTING is
- * set by callb_execute_class() for the duration of a callback.
- */
-#define CALLB_FREE 0x0
-#define CALLB_TAKEN 0x1
-#define CALLB_EXECUTING 0x2
-
-/*
- * Basic structure for a callb table.
- * All callbs are organized into different class groups; the head of each
- * group's list is held in the ct_first_cb array, indexed by class.
- * The callbs within a class are single-linked and normally run by a
- * serial execution.
- *
- * Entries released by callb_delete() are kept on ct_freelist and reused by
- * callb_add_common().  ct_ncallb counts every allocated entry, whether it
- * is on a class list or on the freelist.
- */
-typedef struct callb_table {
- kmutex_t ct_lock; /* protect all callb states */
- callb_t *ct_freelist; /* free callb structures */
- int ct_busy; /* != 0 prevents additions */
- kcondvar_t ct_busy_cv; /* to wait for not busy */
- int ct_ncallb; /* num of callbs allocated */
- callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
-} callb_table_t;
-
-int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; /* seconds; scaled by hz in callb_generic_cpr() */
-
-static callb_id_t callb_add_common(boolean_t (*)(void *, int),
- void *, int, char *, kthread_id_t);
-
-static callb_table_t callb_table; /* system level callback table */
-static callb_table_t *ct = &callb_table; /* alias used by the functions below */
-static kmutex_t callb_safe_mutex; /* lock referenced by callb_cprinfo_safe */
-callb_cpr_t callb_cprinfo_safe = {
- &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, 0, 0 };
-
-/*
- * Set up the system callb table: create callb_safe_mutex and the table
- * lock, and mark the table open for additions.
- */
-void
-callb_init(void *dummy __unused)
-{
-	mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
-	ct->ct_busy = 0;	/* additions are allowed */
-}
-
-/*
- * Tear down the system callb table: release every entry cached on the
- * freelist, then destroy both locks.  The assertion expects every
- * allocated entry to have been on the freelist.
- */
-void
-callb_fini(void *dummy __unused)
-{
-	callb_t *cp;
-
-	mutex_enter(&ct->ct_lock);
-	for (cp = ct->ct_freelist; cp != NULL; cp = ct->ct_freelist) {
-		/* Unlink the head entry before freeing it. */
-		ct->ct_freelist = cp->c_next;
-		ct->ct_ncallb--;
-		kmem_free(cp, sizeof (callb_t));
-	}
-	ASSERT(ct->ct_ncallb == 0);
-	mutex_exit(&ct->ct_lock);
-
-	mutex_destroy(&callb_safe_mutex);
-	mutex_destroy(&callb_table.ct_lock);
-}
-
-/*
- * Common backend of callb_add() and callb_add_thread(): register
- * func(arg, code) in the given class on behalf of thread t.
- *
- * Sleeps while the table is marked busy.  An entry is taken from the
- * freelist when one is available and allocated otherwise.  The name is
- * copied into the entry, truncated to CB_MAXNAME characters.  The return
- * value is the opaque id accepted by callb_delete().
- */
-static callb_id_t
-callb_add_common(boolean_t (*func)(void *arg, int code),
-    void *arg, int class, char *name, kthread_id_t t)
-{
-	callb_t *entry;
-
-	ASSERT(class < NCBCLASS);
-
-	mutex_enter(&ct->ct_lock);
-
-	/* Additions are held off for as long as the table is busy. */
-	while (ct->ct_busy)
-		cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
-
-	entry = ct->ct_freelist;
-	if (entry == NULL) {
-		ct->ct_ncallb++;
-		entry = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
-	}
-	/* A fresh entry is zeroed, so this leaves the freelist empty. */
-	ct->ct_freelist = entry->c_next;
-
-	entry->c_thread = t;
-	entry->c_func = func;
-	entry->c_arg = arg;
-	entry->c_class = (uchar_t)class;
-	entry->c_flag |= CALLB_TAKEN;
-
-#ifdef DEBUG
-	if (strlen(name) > CB_MAXNAME)
-		cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
-		    "too long -- truncated to %d chars",
-		    name, CB_MAXNAME);
-#endif
-	/* strncpy() does not terminate on truncation; do it explicitly. */
-	(void) strncpy(entry->c_name, name, CB_MAXNAME);
-	entry->c_name[CB_MAXNAME] = '\0';
-
-	/* Push the entry onto the front of its class list. */
-	entry->c_next = ct->ct_first_cb[class];
-	ct->ct_first_cb[class] = entry;
-
-	mutex_exit(&ct->ct_lock);
-	return ((callb_id_t)entry);
-}
-
-/*
- * Register a callback on behalf of the calling thread.  curthread is
- * recorded as the owning thread, so this is the variant for a thread that
- * is adding ITSELF to the table.
- */
-callb_id_t
-callb_add(boolean_t (*func)(void *arg, int code),
-    void *arg, int class, char *name)
-{
-	kthread_id_t self = curthread;
-
-	return (callb_add_common(func, arg, class, name, self));
-}
-
-/*
- * Variant of callb_add() in which the owning thread is passed explicitly.
- * Intended for callers that register an entry on behalf of some other
- * thread (for example, one which is constructed but not yet running).
- */
-callb_id_t
-callb_add_thread(boolean_t (*func)(void *arg, int code),
-    void *arg, int class, char *name, kthread_id_t t)
-{
-	callb_id_t id;
-
-	id = callb_add_common(func, arg, class, name, t);
-	return (id);
-}
-
-/*
- * callb_delete() removes the entry identified by id, which was originally
- * placed in the table by callb_add() or callb_add_thread().
- *
- * If the entry's callback is currently executing, the caller sleeps on
- * c_done_cv until it has finished and then searches the class list again.
- *
- * Returns 0 on success, or -1 if the entry is not on the list of its class.
- */
-int
-callb_delete(callb_id_t id)
-{
-	callb_t **pp;
-	callb_t *me = (callb_t *)id;
-
-	mutex_enter(&ct->ct_lock);
-
-	for (;;) {
-		pp = &ct->ct_first_cb[me->c_class];
-		while (*pp != NULL && *pp != me)
-			pp = &(*pp)->c_next;
-
-		/*
-		 * The entry must be rejected in every build, not only under
-		 * DEBUG: falling through with *pp == NULL would splice
-		 * me->c_next onto the tail of the class list and push an
-		 * entry that was never on it onto the freelist.
-		 */
-		if (*pp != me) {
-#ifdef DEBUG
-			cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
-			    (void *)me);
-#endif /* DEBUG */
-			mutex_exit(&ct->ct_lock);
-			return (-1);
-		}
-
-		/*
-		 * It is not allowed to delete a callb in the middle of
-		 * executing otherwise, the callb_execute() will be confused.
-		 */
-		if (!(me->c_flag & CALLB_EXECUTING))
-			break;
-
-		cv_wait(&me->c_done_cv, &ct->ct_lock);
-	}
-	/* relink the class list */
-	*pp = me->c_next;
-
-	/* clean up myself and return the free callb to the head of freelist */
-	me->c_flag = CALLB_FREE;
-	me->c_next = ct->ct_freelist;
-	ct->ct_freelist = me;
-
-	mutex_exit(&ct->ct_lock);
-	return (0);
-}
-
-/*
- * Run the callbacks registered in one class, in list order, stopping at
- * the first one whose function returns false.
- *
- * class: indicates to execute all callbs in the same class;
- * code: optional argument for the callb functions.
- * return: = 0: success
- * != 0: ptr to the c_name of the entry whose callback failed, i.e. the
- * (possibly truncated) name supplied when the callback was registered
- *
- * ct_lock is dropped while a callback function runs.  CALLB_EXECUTING is
- * set on the entry for that time, which makes callb_delete() wait on
- * c_done_cv instead of unlinking the entry.
- */
-void *
-callb_execute_class(int class, int code)
-{
- callb_t *cp;
- void *ret = NULL;
-
- ASSERT(class < NCBCLASS);
-
- mutex_enter(&ct->ct_lock);
-
- for (cp = ct->ct_first_cb[class];
- cp != NULL && ret == 0; cp = cp->c_next) {
- while (cp->c_flag & CALLB_EXECUTING)
- cv_wait(&cp->c_done_cv, &ct->ct_lock);
- /*
- * Skip the entry if it was deleted while we were sleeping;
- * callb_delete() resets c_flag to CALLB_FREE.
- * NOTE(review): callb_delete() also repoints c_next at the
- * freelist, so after this point the walk appears to follow the
- * freelist rather than the rest of the class list -- confirm
- * whether callers depend on the remaining entries being run.
- */
- if (cp->c_flag == CALLB_FREE)
- continue;
- cp->c_flag |= CALLB_EXECUTING;
-
-#ifdef CALLB_DEBUG
- printf("callb_execute: name=%s func=%p arg=%p\n",
- cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
-#endif /* CALLB_DEBUG */
-
- mutex_exit(&ct->ct_lock);
- /* If callback function fails, pass back client's name */
- if (!(*cp->c_func)(cp->c_arg, code))
- ret = cp->c_name;
- mutex_enter(&ct->ct_lock);
-
- cp->c_flag &= ~CALLB_EXECUTING;
- cv_broadcast(&cp->c_done_cv);
- }
- mutex_exit(&ct->ct_lock);
- return (ret);
-}
-
-/*
- * Generic CPR callback for daemons.  arg is the daemon's callb_cpr_t.
- *
- * Callers make sure there are no recursive entries to this function.
- * cc_lockp is registered by callb_add to protect the callb_cpr_t structure.
- *
- * For code == CB_CODE_CPR_CHKPT the thread is asked to stop: CALLB_CPR_START
- * is raised and we wait for CALLB_CPR_SAFE, using cv_timedwait() in case
- * the kernel thread is blocked.  For code == CB_CODE_CPR_RESUME the start
- * flag is cleared and cc_stop_cv is signalled.  Any other code only takes
- * and drops the lock.
- *
- * Returns false only if the wait timed out.
- *
- * Note that this is a generic callback handler for daemon CPR and
- * should NOT be changed to accommodate any specific requirement in a daemon.
- * Individual daemons that require changes to the handler shall write
- * callback routines in their own daemon modules.
- */
-boolean_t
-callb_generic_cpr(void *arg, int code)
-{
-	callb_cpr_t *cpr = (callb_cpr_t *)arg;
-	clock_t rv = 0;		/* assume success */
-
-	mutex_enter(cpr->cc_lockp);
-
-	if (code == CB_CODE_CPR_CHKPT) {
-		cpr->cc_events |= CALLB_CPR_START;
-		while ((cpr->cc_events & CALLB_CPR_SAFE) == 0) {
-			/* cv_timedwait() returns -1 if it times out. */
-			rv = cv_timedwait(&cpr->cc_callb_cv, cpr->cc_lockp,
-			    callb_timeout_sec * hz);
-			if (rv == -1)
-				break;
-		}
-	} else if (code == CB_CODE_CPR_RESUME) {
-		cpr->cc_events &= ~CALLB_CPR_START;
-		cv_signal(&cpr->cc_stop_cv);
-	}
-
-	mutex_exit(cpr->cc_lockp);
-	return (rv != -1);
-}
-
-/*
- * Generic callback for kernel threads that are always considered safe:
- * both arguments are ignored and B_TRUE is returned unconditionally.
- */
-/* ARGSUSED */
-boolean_t
-callb_generic_cpr_safe(void *arg, int code)
-{
-	return (B_TRUE);
-}
-/*
- * Prevent additions to callback table.
- *
- * Sets ct_busy under ct_lock; callb_add_common() sleeps on ct_busy_cv for
- * as long as the flag is set.  The table must not already be busy.
- */
-void
-callb_lock_table(void)
-{
- mutex_enter(&ct->ct_lock);
- ASSERT(ct->ct_busy == 0);
- ct->ct_busy = 1;
- mutex_exit(&ct->ct_lock);
-}
-
-/*
- * Allow additions to callback table.
- *
- * Clears ct_busy under ct_lock and broadcasts on ct_busy_cv to wake every
- * thread sleeping in callb_add_common().  The table must currently be busy.
- */
-void
-callb_unlock_table(void)
-{
- mutex_enter(&ct->ct_lock);
- ASSERT(ct->ct_busy != 0);
- ct->ct_busy = 0;
- cv_broadcast(&ct->ct_busy_cv);
- mutex_exit(&ct->ct_lock);
-}
-
-SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
-SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/contrib/opensolaris/uts/common/os/list.c b/sys/contrib/opensolaris/uts/common/os/list.c
deleted file mode 100644
index f9b6fcb..0000000
--- a/sys/contrib/opensolaris/uts/common/os/list.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Generic doubly-linked list implementation
- */
-
-#include <sys/list.h>
-#include <sys/list_impl.h>
-#include <sys/types.h>
-#include <sys/sysmacros.h>
-#include <sys/debug.h>
-
-/*
- * list_d2l() maps an object pointer to the list_node_t embedded in it at
- * byte offset (a)->list_offset; list_object() is the inverse mapping.
- * list_empty() is true when the head node links back to itself.
- *
- * Every macro parameter is parenthesized so that an argument which is an
- * expression (e.g. "&list->list_head" or "p + 1") expands correctly.
- */
-#define	list_d2l(a, obj) ((list_node_t *)(((char *)(obj)) + (a)->list_offset))
-#define	list_object(a, node) ((void *)(((char *)(node)) - (a)->list_offset))
-#define	list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
-
-/*
- * Link 'object' into 'list' directly after 'node'.  Expands to a plain
- * brace block, so it may be used as a statement with or without a trailing
- * semicolon.  'node' is evaluated several times; pass a side-effect-free
- * expression.
- */
-#define	list_insert_after_node(list, node, object) {	\
-	list_node_t *lnew = list_d2l(list, object);	\
-	lnew->list_prev = (node);			\
-	lnew->list_next = (node)->list_next;		\
-	(node)->list_next->list_prev = lnew;		\
-	(node)->list_next = lnew;			\
-}
-
-/*
- * Link 'object' into 'list' directly before 'node'.  Same expansion rules
- * as list_insert_after_node().
- */
-#define	list_insert_before_node(list, node, object) {	\
-	list_node_t *lnew = list_d2l(list, object);	\
-	lnew->list_next = (node);			\
-	lnew->list_prev = (node)->list_prev;		\
-	(node)->list_prev->list_next = lnew;		\
-	(node)->list_prev = lnew;			\
-}
-
-/*
- * Initialize an empty list of objects that are 'size' bytes long and embed
- * their list_node_t at byte 'offset'.  An empty list is represented by a
- * head node whose links both point at itself.
- */
-void
-list_create(list_t *list, size_t size, size_t offset)
-{
-	ASSERT(list);
-	ASSERT(size > 0);
-	ASSERT(size >= offset + sizeof (list_node_t));
-
-	list->list_offset = offset;
-	list->list_size = size;
-	list->list_head.list_prev = &list->list_head;
-	list->list_head.list_next = &list->list_head;
-}
-
-/*
- * Retire a list.  The list is asserted to be empty; the head links are then
- * cleared to NULL.
- */
-void
-list_destroy(list_t *list)
-{
-	list_node_t *node = &list->list_head;
-
-	ASSERT(list);
-	ASSERT(list->list_head.list_next == node);
-	ASSERT(list->list_head.list_prev == node);
-
-	node->list_prev = NULL;
-	node->list_next = NULL;
-}
-
-/*
- * Insert nobject into the list directly after object.
- */
-void
-list_insert_after(list_t *list, void *object, void *nobject)
-{
-	list_node_t *anchor = list_d2l(list, object);
-
-	list_insert_after_node(list, anchor, nobject);
-}
-
-/*
- * Insert nobject into the list directly before object.  The node embedded
- * in object is dereferenced through its list_prev link, so object has to be
- * linked into the list already.
- */
-void
-list_insert_before(list_t *list, void *object, void *nobject)
-{
-	list_node_t *lold = list_d2l(list, object);
-	/* Terminated like every other insert helper in this file. */
-	list_insert_before_node(list, lold, nobject);
-}
-
-/*
- * Insert object at the front of the list, i.e. directly after the head node.
- */
-void
-list_insert_head(list_t *list, void *object)
-{
-	list_node_t *head = &list->list_head;
-
-	list_insert_after_node(list, head, object);
-}
-
-/*
- * Insert object at the back of the list, i.e. directly before the head node.
- */
-void
-list_insert_tail(list_t *list, void *object)
-{
-	list_node_t *head = &list->list_head;
-
-	list_insert_before_node(list, head, object);
-}
-
-/*
- * Unlink object from the list and clear both of its links to NULL, which
- * is the state list_link_active() reports as inactive.
- */
-void
-list_remove(list_t *list, void *object)
-{
-	list_node_t *node = list_d2l(list, object);
-	list_node_t *before;
-	list_node_t *after;
-
-	ASSERT(!list_empty(list));
-
-	before = node->list_prev;
-	after = node->list_next;
-	before->list_next = after;
-	after->list_prev = before;
-
-	node->list_prev = NULL;
-	node->list_next = NULL;
-}
-
-/*
- * Return the first object on the list, or NULL if the list is empty.
- */
-void *
-list_head(list_t *list)
-{
-	return (list_empty(list) ? NULL :
-	    list_object(list, list->list_head.list_next));
-}
-
-/*
- * Return the last object on the list, or NULL if the list is empty.
- */
-void *
-list_tail(list_t *list)
-{
-	return (list_empty(list) ? NULL :
-	    list_object(list, list->list_head.list_prev));
-}
-
-/*
- * Return the object that follows 'object' on the list, or NULL if 'object'
- * is the last one (its successor is the head node).
- */
-void *
-list_next(list_t *list, void *object)
-{
-	list_node_t *succ = list_d2l(list, object)->list_next;
-
-	if (succ == &list->list_head)
-		return (NULL);
-
-	return (list_object(list, succ));
-}
-
-/*
- * Return the object that precedes 'object' on the list, or NULL if 'object'
- * is the first one (its predecessor is the head node).
- */
-void *
-list_prev(list_t *list, void *object)
-{
-	list_node_t *pred = list_d2l(list, object)->list_prev;
-
-	if (pred == &list->list_head)
-		return (NULL);
-
-	return (list_object(list, pred));
-}
-
-/*
- * Append every element of src to the tail of dst, preserving their order,
- * and leave src empty.  Both lists must have the same object size and node
- * offset.  Nothing happens when src is already empty.
- */
-void
-list_move_tail(list_t *dst, list_t *src)
-{
-	list_node_t *dstnode = &dst->list_head;
-	list_node_t *srcnode = &src->list_head;
-	list_node_t *first;
-	list_node_t *last;
-	list_node_t *tail;
-
-	ASSERT(dst->list_size == src->list_size);
-	ASSERT(dst->list_offset == src->list_offset);
-
-	if (srcnode->list_next == srcnode)
-		return;
-
-	first = srcnode->list_next;	/* first element of src */
-	last = srcnode->list_prev;	/* last element of src */
-	tail = dstnode->list_prev;	/* current end of dst (may be its head) */
-
-	/* Splice the src chain between dst's old tail and dst's head. */
-	tail->list_next = first;
-	first->list_prev = tail;
-	dstnode->list_prev = last;
-	last->list_next = dstnode;
-
-	/* empty src list */
-	srcnode->list_prev = srcnode;
-	srcnode->list_next = srcnode;
-}
-
-/*
- * Return 1 if the node's list_next link is set and 0 if it is NULL, the
- * value list_remove() leaves behind.
- */
-int
-list_link_active(list_node_t *link)
-{
-	if (link->list_next == NULL)
-		return (0);
-	return (1);
-}
-
-/*
- * Return nonzero if the list has no elements, i.e. the head node links
- * back to itself.
- */
-int
-list_is_empty(list_t *list)
-{
-	return (list->list_head.list_next == &list->list_head);
-}
diff --git a/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c b/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
deleted file mode 100644
index 3682853..0000000
--- a/sys/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/nvpair.h>
-
-/*
- * Allocation hook: obtain 'size' bytes from kmem_alloc(), using the value
- * stored in the allocator's nva_arg as the kmem flags.
- */
-static void *
-nv_alloc_sys(nv_alloc_t *nva, size_t size)
-{
-	int kmflags = (int)(uintptr_t)nva->nva_arg;
-
-	return (kmem_alloc(size, kmflags));
-}
-
-/*
- * Free hook: hand the buffer back to kmem_free().  The allocator handle
- * itself is not consulted.
- */
-/*ARGSUSED*/
-static void
-nv_free_sys(nv_alloc_t *nva, void *buf, size_t size)
-{
-	kmem_free(buf, size);
-}
-
-static const nv_alloc_ops_t system_ops = { /* kmem-backed operations shared by both allocators below */
- NULL, /* nv_ao_init(): not provided */
- NULL, /* nv_ao_fini(): not provided */
- nv_alloc_sys, /* nv_ao_alloc() */
- nv_free_sys, /* nv_ao_free() */
- NULL /* nv_ao_reset(): not provided */
-};
-
-nv_alloc_t nv_alloc_sleep_def = {
- &system_ops,
- (void *)KM_SLEEP /* presumably nva_arg, read back as the kmem flag in nv_alloc_sys() -- confirm field order */
-};
-
-nv_alloc_t nv_alloc_nosleep_def = {
- &system_ops,
- (void *)KM_NOSLEEP /* as above, but with the KM_NOSLEEP flag */
-};
-
-nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; /* exported handle for the KM_SLEEP allocator */
-nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; /* exported handle for the KM_NOSLEEP allocator */
diff --git a/sys/contrib/opensolaris/uts/common/os/taskq.c b/sys/contrib/opensolaris/uts/common/os/taskq.c
deleted file mode 100644
index 1558c1f..0000000
--- a/sys/contrib/opensolaris/uts/common/os/taskq.c
+++ /dev/null
@@ -1,1020 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Kernel task queues: general-purpose asynchronous task scheduling.
- *
- * A common problem in kernel programming is the need to schedule tasks
- * to be performed later, by another thread. There are several reasons
- * you may want or need to do this:
- *
- * (1) The task isn't time-critical, but your current code path is.
- *
- * (2) The task may require grabbing locks that you already hold.
- *
- * (3) The task may need to block (e.g. to wait for memory), but you
- * cannot block in your current context.
- *
- * (4) Your code path can't complete because of some condition, but you can't
- * sleep or fail, so you queue the task for later execution when condition
- * disappears.
- *
- * (5) You just want a simple way to launch multiple tasks in parallel.
- *
- * Task queues provide such a facility. In its simplest form (used when
- * performance is not a critical consideration) a task queue consists of a
- * single list of tasks, together with one or more threads to service the
- * list. There are some cases when this simple queue is not sufficient:
- *
- * (1) The task queues are very hot and there is a need to avoid data and lock
- * contention over global resources.
- *
- * (2) Some tasks may depend on other tasks to complete, so they can't be put in
- * the same list managed by the same thread.
- *
- * (3) Some tasks may block for a long time, and this should not block other
- * tasks in the queue.
- *
- * To provide useful service in such cases we define a "dynamic task queue"
- * which has an individual thread for each of the tasks. These threads are
- * dynamically created as they are needed and destroyed when they are not in
- * use. The API for managing task pools is the same as for managing task queues
- * with the exception of a taskq creation flag TASKQ_DYNAMIC which tells that
- * dynamic task pool behavior is desired.
- *
- * Dynamic task queues may also place tasks in the normal queue (called "backing
- * queue") when task pool runs out of resources. Users of task queues may
- * disallow such queued scheduling by specifying TQ_NOQUEUE in the dispatch
- * flags.
- *
- * The backing task queue is also used for scheduling internal tasks needed for
- * dynamic task queue maintenance.
- *
- * INTERFACES:
- *
- * taskq_t *taskq_create(name, nthreads, pri_t pri, minalloc, maxalloc, flags);
- *
- * Create a taskq with specified properties.
- * Possible 'flags':
- *
- * TASKQ_DYNAMIC: Create task pool for task management. If this flag is
- * specified, 'nthreads' specifies the maximum number of threads in
- * the task queue. Task execution order for dynamic task queues is
- * not predictable.
- *
- * If this flag is not specified (default case) a
- * single-list task queue is created with 'nthreads' threads
- * servicing it. Entries in this queue are managed by
- * taskq_ent_alloc() and taskq_ent_free() which try to keep the
- * task population between 'minalloc' and 'maxalloc', but the
- * latter limit is only advisory for TQ_SLEEP dispatches and the
- * former limit is only advisory for TQ_NOALLOC dispatches. If
- * TASKQ_PREPOPULATE is set in 'flags', the taskq will be
- * prepopulated with 'minalloc' task structures.
- *
- * Since non-DYNAMIC taskqs are queues, tasks are guaranteed to be
- * executed in the order they are scheduled if nthreads == 1.
- * If nthreads > 1, task execution order is not predictable.
- *
- * TASKQ_PREPOPULATE: Prepopulate task queue with threads.
- * Also prepopulate the task queue with 'minalloc' task structures.
- *
- * TASKQ_CPR_SAFE: This flag specifies that users of the task queue will
- * use their own protocol for handling CPR issues. This flag is not
- * supported for DYNAMIC task queues.
- *
- * The 'pri' field specifies the default priority for the threads that
- * service all scheduled tasks.
- *
- * void taskq_destroy(tq):
- *
- * Waits for any scheduled tasks to complete, then destroys the taskq.
- * Caller should guarantee that no new tasks are scheduled in the closing
- * taskq.
- *
- * taskqid_t taskq_dispatch(tq, func, arg, flags):
- *
- * Dispatches the task "func(arg)" to taskq. The 'flags' indicates whether
- * the caller is willing to block for memory. The function returns an
- * opaque value which is zero iff dispatch fails. If flags is TQ_NOSLEEP
- * or TQ_NOALLOC and the task can't be dispatched, taskq_dispatch() fails
- * and returns (taskqid_t)0.
- *
- * ASSUMES: func != NULL.
- *
- * Possible flags:
- * TQ_NOSLEEP: Do not wait for resources; may fail.
- *
- * TQ_NOALLOC: Do not allocate memory; may fail. May only be used with
- * non-dynamic task queues.
- *
- * TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to
- * lack of available resources and fail. If this flag is not
- * set, and the task pool is exhausted, the task may be scheduled
- * in the backing queue. This flag may ONLY be used with dynamic
- * task queues.
- *
- * NOTE: This flag should always be used when a task queue is used
- * for tasks that may depend on each other for completion.
- * Enqueueing dependent tasks may create deadlocks.
- *
- * TQ_SLEEP: May block waiting for resources. May still fail for
- * dynamic task queues if TQ_NOQUEUE is also specified, otherwise
- * always succeed.
- *
- * NOTE: Dynamic task queues are much more likely to fail in
- * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it
- * is important to have backup strategies handling such failures.
- *
- * void taskq_wait(tq):
- *
- * Waits for all previously scheduled tasks to complete.
- *
- * NOTE: It does not stop any new task dispatches.
- * Do NOT call taskq_wait() from a task: it will cause deadlock.
- *
- * void taskq_suspend(tq)
- *
- * Suspend all task execution. Tasks already scheduled for a dynamic task
- * queue will still be executed, but all new scheduled tasks will be
- * suspended until taskq_resume() is called.
- *
- * int taskq_suspended(tq)
- *
- * Returns 1 if taskq is suspended and 0 otherwise. It is intended to
- * ASSERT that the task queue is suspended.
- *
- * void taskq_resume(tq)
- *
- * Resume task queue execution.
- *
- * int taskq_member(tq, thread)
- *
- * Returns 1 if 'thread' belongs to taskq 'tq' and 0 otherwise. The
- * intended use is to ASSERT that a given function is called in taskq
- * context only.
- *
- * system_taskq
- *
- * Global system-wide dynamic task queue for common uses. It may be used by
- * any subsystem that needs to schedule tasks and does not need to manage
- * its own task queues. It is initialized quite early during system boot.
- *
- * IMPLEMENTATION.
- *
- * This is schematic representation of the task queue structures.
- *
- * taskq:
- * +-------------+
- * |tq_lock | +---< taskq_ent_free()
- * +-------------+ |
- * |... | | tqent: tqent:
- * +-------------+ | +------------+ +------------+
- * | tq_freelist |-->| tqent_next |--> ... ->| tqent_next |
- * +-------------+ +------------+ +------------+
- * |... | | ... | | ... |
- * +-------------+ +------------+ +------------+
- * | tq_task | |
- * | | +-------------->taskq_ent_alloc()
- * +--------------------------------------------------------------------------+
- * | | | tqent tqent |
- * | +---------------------+ +--> +------------+ +--> +------------+ |
- * | | ... | | | func, arg | | | func, arg | |
- * +>+---------------------+ <---|-+ +------------+ <---|-+ +------------+ |
- * | tq_taskq.tqent_next | ----+ | | tqent_next | --->+ | | tqent_next |--+
- * +---------------------+ | +------------+ ^ | +------------+
- * +-| tq_task.tqent_prev | +--| tqent_prev | | +--| tqent_prev | ^
- * | +---------------------+ +------------+ | +------------+ |
- * | |... | | ... | | | ... | |
- * | +---------------------+ +------------+ | +------------+ |
- * | ^ | |
- * | | | |
- * +--------------------------------------+--------------+ TQ_APPEND() -+
- * | | |
- * |... | taskq_thread()-----+
- * +-------------+
- * | tq_buckets |--+-------> [ NULL ] (for regular task queues)
- * +-------------+ |
- * | DYNAMIC TASK QUEUES:
- * |
- * +-> taskq_bucket[nCPU] taskq_bucket_dispatch()
- * +-------------------+ ^
- * +--->| tqbucket_lock | |
- * | +-------------------+ +--------+ +--------+
- * | | tqbucket_freelist |-->| tqent |-->...| tqent | ^
- * | +-------------------+<--+--------+<--...+--------+ |
- * | | ... | | thread | | thread | |
- * | +-------------------+ +--------+ +--------+ |
- * | +-------------------+ |
- * taskq_dispatch()--+--->| tqbucket_lock | TQ_APPEND()------+
- * TQ_HASH() | +-------------------+ +--------+ +--------+
- * | | tqbucket_freelist |-->| tqent |-->...| tqent |
- * | +-------------------+<--+--------+<--...+--------+
- * | | ... | | thread | | thread |
- * | +-------------------+ +--------+ +--------+
- * +---> ...
- *
- *
- * Task queues use tq_task field to link new entry in the queue. The queue is a
- * circular doubly-linked list. Entries are put in the end of the list with
- * TQ_APPEND() and processed from the front of the list by taskq_thread() in
- * FIFO order. Task queue entries are cached in the free list managed by
- * taskq_ent_alloc() and taskq_ent_free() functions.
- *
- * All threads used by task queues mark t_taskq field of the thread to
- * point to the task queue.
- *
- * Dynamic Task Queues Implementation.
- *
- * For dynamic task queues there is a 1-to-1 mapping between a thread and a
- * taskq_ent structure. Each entry is serviced by its own thread and each thread
- * is controlled by a single entry.
- *
- * Entries are distributed over a set of buckets. To avoid using modulo
- * arithmetic the number of buckets is 2^n and is determined by rounding the
- * number of CPUs in the system down to the nearest power of two. Tunable
- * variable 'taskq_maxbuckets' limits the maximum number of buckets. Each entry
- * is attached to a bucket for its lifetime and can't migrate to other buckets.
- *
- * Entries that have scheduled tasks are not placed in any list. The dispatch
- * function sets their "func" and "arg" fields and signals the corresponding
- * thread to execute the task. Once the thread executes the task it clears the
- * "func" field and places an entry on the bucket cache of free entries pointed
- * by "tqbucket_freelist" field. ALL entries on the free list should have "func"
- * field equal to NULL. The free list is a circular doubly-linked list identical
- * in structure to the tq_task list above, but entries are taken from it in LIFO
- * order - the last freed entry is the first to be allocated. The
- * taskq_bucket_dispatch() function gets the most recently used entry from the
- * free list, sets its "func" and "arg" fields and signals a worker thread.
- *
- * After executing each task a per-entry thread taskq_d_thread() places its
- * entry on the bucket free list and goes to a timed sleep. If it wakes up
- * without getting new task it removes the entry from the free list and destroys
- * itself. The thread sleep time is controlled by a tunable variable
- * `taskq_thread_timeout'.
- *
- * Various statistics are kept in the bucket, allowing for later
- * analysis of taskq usage patterns. Also, a global copy of taskq creation and
- * death statistics is kept in the global taskq data structure. Since thread
- * creation and death happen rarely, updating such global data does not present
- * a performance problem.
- *
- * NOTE: Threads are not bound to any CPU and there is absolutely no association
- * between the bucket and actual thread CPU, so buckets are used only to
- * split resources and reduce resource contention. Having threads attached
- * to the CPU denoted by a bucket may reduce number of times the job
- * switches between CPUs.
- *
- * Current algorithm creates a thread whenever a bucket has no free
- * entries. It would be nice to know how many threads are in the running
- * state and don't create threads if all CPUs are busy with existing
- * tasks, but it is unclear how such strategy can be implemented.
- *
- * Currently buckets are created statically as an array attached to task
- * queue. On some system with nCPUs < max_ncpus it may waste system
- * memory. One solution may be allocation of buckets when they are first
- * touched, but it is not clear how useful it is.
- *
- * SUSPEND/RESUME implementation.
- *
- * Before executing a task taskq_thread() (executing non-dynamic task
- * queues) obtains taskq's thread lock as a reader. The taskq_suspend()
- * function gets the same lock as a writer blocking all non-dynamic task
- * execution. The taskq_resume() function releases the lock allowing
- * taskq_thread to continue execution.
- *
- * For dynamic task queues, each bucket is marked as TQBUCKET_SUSPEND by
- * taskq_suspend() function. After that taskq_bucket_dispatch() always
- * fails, so that taskq_dispatch() will either enqueue tasks for a
- * suspended backing queue or fail if TQ_NOQUEUE is specified in dispatch
- * flags.
- *
- * NOTE: taskq_suspend() does not immediately block any tasks already
- * scheduled for dynamic task queues. It only suspends new tasks
- * scheduled after taskq_suspend() was called.
- *
- * taskq_member() function works by comparing a thread t_taskq pointer with
- * the passed thread pointer.
- *
- * LOCKS and LOCK Hierarchy:
- *
- * There are two locks used in task queues.
- *
- * 1) Task queue structure has a lock, protecting global task queue state.
- *
- * 2) Each per-CPU bucket has a lock for bucket management.
- *
- * If both locks are needed, task queue lock should be taken only after bucket
- * lock.
- *
- * DEBUG FACILITIES.
- *
- * For DEBUG kernels it is possible to induce random failures to
- * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of
- * taskq_dmtbf and taskq_smtbf tunables control the mean time between induced
- * failures for dynamic and static task queues respectively.
- *
- * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics.
- *
- * TUNABLES
- *
- * system_taskq_size - Size of the global system_taskq.
- * This value is multiplied by nCPUs to determine
- * actual size.
- * Default value: 64
- *
- * taskq_thread_timeout - Maximum idle time for taskq_d_thread()
- * Default value: 5 minutes
- *
- * taskq_maxbuckets - Maximum number of buckets in any task queue
- * Default value: 128
- *
- * taskq_search_depth - Maximum # of buckets searched for a free entry
- * Default value: 4
- *
- * taskq_dmtbf - Mean time between induced dispatch failures
- * for dynamic task queues.
- * Default value: UINT_MAX (no induced failures)
- *
- * taskq_smtbf - Mean time between induced dispatch failures
- * for static task queues.
- * Default value: UINT_MAX (no induced failures)
- *
- * CONDITIONAL compilation.
- *
- * TASKQ_STATISTIC - If set will enable bucket statistic (default).
- *
- */
-
-#include <sys/taskq_impl.h>
-#include <sys/proc.h>
-#include <sys/kmem.h>
-#include <sys/callb.h>
-#include <sys/systm.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/sysmacros.h>
-#include <sys/sdt.h>
-#include <sys/mutex.h>
-#include <sys/kernel.h>
-#include <sys/limits.h>
-
-static kmem_cache_t *taskq_ent_cache, *taskq_cache;
-
-/* Global system task queue for common use */
-taskq_t *system_taskq;
-
-/*
- * Maxmimum number of entries in global system taskq is
- * system_taskq_size * max_ncpus
- */
-#define SYSTEM_TASKQ_SIZE 1
-int system_taskq_size = SYSTEM_TASKQ_SIZE;
-
-/*
- * Dynamic task queue threads that don't get any work within
- * taskq_thread_timeout destroy themselves
- */
-#define TASKQ_THREAD_TIMEOUT (60 * 5)
-int taskq_thread_timeout = TASKQ_THREAD_TIMEOUT;
-
-#define TASKQ_MAXBUCKETS 128
-int taskq_maxbuckets = TASKQ_MAXBUCKETS;
-
-/*
- * When a bucket has no available entries another buckets are tried.
- * taskq_search_depth parameter limits the amount of buckets that we search
- * before failing. This is mostly useful in systems with many CPUs where we may
- * spend too much time scanning busy buckets.
- */
-#define TASKQ_SEARCH_DEPTH 4
-int taskq_search_depth = TASKQ_SEARCH_DEPTH;
-
-/*
- * Hashing function: mix various bits of x. May be pretty much anything.
- */
-#define TQ_HASH(x) ((x) ^ ((x) >> 11) ^ ((x) >> 17) ^ ((x) ^ 27))
-
-/*
- * We do not create any new threads when the system is low on memory and start
- * throttling memory allocations. The following macro tries to estimate such
- * condition.
- */
-#define ENOUGH_MEMORY() (freemem > throttlefree)
-
-/*
- * Static functions.
- */
-static taskq_t *taskq_create_common(const char *, int, int, pri_t, int,
- int, uint_t);
-static void taskq_thread(void *);
-static int taskq_constructor(void *, void *, int);
-static void taskq_destructor(void *, void *);
-static int taskq_ent_constructor(void *, void *, int);
-static void taskq_ent_destructor(void *, void *);
-static taskq_ent_t *taskq_ent_alloc(taskq_t *, int);
-static void taskq_ent_free(taskq_t *, taskq_ent_t *);
-
-/*
- * Collect per-bucket statistic when TASKQ_STATISTIC is defined.
- */
-#define TASKQ_STATISTIC 1
-
-#if TASKQ_STATISTIC
-#define TQ_STAT(b, x) b->tqbucket_stat.x++
-#else
-#define TQ_STAT(b, x)
-#endif
-
-/*
- * Random fault injection.
- */
-uint_t taskq_random;
-uint_t taskq_dmtbf = UINT_MAX; /* mean time between injected failures */
-uint_t taskq_smtbf = UINT_MAX; /* mean time between injected failures */
-
-/*
- * TQ_NOSLEEP dispatches on dynamic task queues are always allowed to fail.
- *
- * TQ_NOSLEEP dispatches on static task queues can't arbitrarily fail because
- * they could prepopulate the cache and make sure that they do not use more
- * then minalloc entries. So, fault injection in this case insures that
- * either TASKQ_PREPOPULATE is not set or there are more entries allocated
- * than is specified by minalloc. TQ_NOALLOC dispatches are always allowed
- * to fail, but for simplicity we treat them identically to TQ_NOSLEEP
- * dispatches.
- */
-#ifdef DEBUG
-#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) \
- taskq_random = (taskq_random * 2416 + 374441) % 1771875;\
- if ((flag & TQ_NOSLEEP) && \
- taskq_random < 1771875 / taskq_dmtbf) { \
- return (NULL); \
- }
-
-#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) \
- taskq_random = (taskq_random * 2416 + 374441) % 1771875;\
- if ((flag & (TQ_NOSLEEP | TQ_NOALLOC)) && \
- (!(tq->tq_flags & TASKQ_PREPOPULATE) || \
- (tq->tq_nalloc > tq->tq_minalloc)) && \
- (taskq_random < (1771875 / taskq_smtbf))) { \
- mutex_exit(&tq->tq_lock); \
- return ((taskqid_t)0); \
- }
-#else
-#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag)
-#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag)
-#endif
-
-#define IS_EMPTY(l) (((l).tqent_prev == (l).tqent_next) && \
- ((l).tqent_prev == &(l)))
-
-/*
- * Append `tqe' in the end of the doubly-linked list denoted by l.
- */
-#define TQ_APPEND(l, tqe) { \
- tqe->tqent_next = &l; \
- tqe->tqent_prev = l.tqent_prev; \
- tqe->tqent_next->tqent_prev = tqe; \
- tqe->tqent_prev->tqent_next = tqe; \
-}
-
-/*
- * Schedule a task specified by func and arg into the task queue entry tqe.
- */
-#define TQ_ENQUEUE(tq, tqe, func, arg) { \
- ASSERT(MUTEX_HELD(&tq->tq_lock)); \
- TQ_APPEND(tq->tq_task, tqe); \
- tqe->tqent_func = (func); \
- tqe->tqent_arg = (arg); \
- tq->tq_tasks++; \
- if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \
- tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed; \
- cv_signal(&tq->tq_dispatch_cv); \
- DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \
-}
-
-/*
- * Do-nothing task which may be used to prepopulate thread caches.
- */
-/*ARGSUSED*/
-void
-nulltask(void *unused)
-{
-}
-
-
-/*ARGSUSED*/
-static int
-taskq_constructor(void *buf, void *cdrarg, int kmflags)
-{
- taskq_t *tq = buf;
-
- bzero(tq, sizeof (taskq_t));
-
- mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
- cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
-
- tq->tq_task.tqent_next = &tq->tq_task;
- tq->tq_task.tqent_prev = &tq->tq_task;
-
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-taskq_destructor(void *buf, void *cdrarg)
-{
- taskq_t *tq = buf;
-
- mutex_destroy(&tq->tq_lock);
- rw_destroy(&tq->tq_threadlock);
- cv_destroy(&tq->tq_dispatch_cv);
- cv_destroy(&tq->tq_wait_cv);
-}
-
-/*ARGSUSED*/
-static int
-taskq_ent_constructor(void *buf, void *cdrarg, int kmflags)
-{
- taskq_ent_t *tqe = buf;
-
- tqe->tqent_thread = NULL;
- cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL);
-
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-taskq_ent_destructor(void *buf, void *cdrarg)
-{
- taskq_ent_t *tqe = buf;
-
- ASSERT(tqe->tqent_thread == NULL);
- cv_destroy(&tqe->tqent_cv);
-}
-
-/*
- * Create global system dynamic task queue.
- */
-void
-system_taskq_init(void)
-{
- system_taskq = taskq_create_common("system_taskq", 0,
- system_taskq_size * max_ncpus, minclsyspri, 4, 512,
- TASKQ_PREPOPULATE);
-}
-
-void
-system_taskq_fini(void)
-{
- taskq_destroy(system_taskq);
-}
-
-static void
-taskq_init(void *dummy __unused)
-{
- taskq_ent_cache = kmem_cache_create("taskq_ent_cache",
- sizeof (taskq_ent_t), 0, taskq_ent_constructor,
- taskq_ent_destructor, NULL, NULL, NULL, 0);
- taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t),
- 0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0);
- system_taskq_init();
-}
-
-static void
-taskq_fini(void *dummy __unused)
-{
- system_taskq_fini();
- kmem_cache_destroy(taskq_cache);
- kmem_cache_destroy(taskq_ent_cache);
-}
-
-/*
- * taskq_ent_alloc()
- *
- * Allocates a new taskq_ent_t structure either from the free list or from the
- * cache. Returns NULL if it can't be allocated.
- *
- * Assumes: tq->tq_lock is held.
- */
-static taskq_ent_t *
-taskq_ent_alloc(taskq_t *tq, int flags)
-{
- int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
-
- taskq_ent_t *tqe;
-
- ASSERT(MUTEX_HELD(&tq->tq_lock));
-
- /*
- * TQ_NOALLOC allocations are allowed to use the freelist, even if
- * we are below tq_minalloc.
- */
- if ((tqe = tq->tq_freelist) != NULL &&
- ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) {
- tq->tq_freelist = tqe->tqent_next;
- } else {
- if (flags & TQ_NOALLOC)
- return (NULL);
-
- mutex_exit(&tq->tq_lock);
- if (tq->tq_nalloc >= tq->tq_maxalloc) {
- if (kmflags & KM_NOSLEEP) {
- mutex_enter(&tq->tq_lock);
- return (NULL);
- }
- /*
- * We don't want to exceed tq_maxalloc, but we can't
- * wait for other tasks to complete (and thus free up
- * task structures) without risking deadlock with
- * the caller. So, we just delay for one second
- * to throttle the allocation rate.
- */
- delay(hz);
- }
- tqe = kmem_cache_alloc(taskq_ent_cache, kmflags);
- mutex_enter(&tq->tq_lock);
- if (tqe != NULL)
- tq->tq_nalloc++;
- }
- return (tqe);
-}
-
-/*
- * taskq_ent_free()
- *
- * Free taskq_ent_t structure by either putting it on the free list or freeing
- * it to the cache.
- *
- * Assumes: tq->tq_lock is held.
- */
-static void
-taskq_ent_free(taskq_t *tq, taskq_ent_t *tqe)
-{
- ASSERT(MUTEX_HELD(&tq->tq_lock));
-
- if (tq->tq_nalloc <= tq->tq_minalloc) {
- tqe->tqent_next = tq->tq_freelist;
- tq->tq_freelist = tqe;
- } else {
- tq->tq_nalloc--;
- mutex_exit(&tq->tq_lock);
- kmem_cache_free(taskq_ent_cache, tqe);
- mutex_enter(&tq->tq_lock);
- }
-}
-
-/*
- * Dispatch a task.
- *
- * Assumes: func != NULL
- *
- * Returns: NULL if dispatch failed.
- * non-NULL if task dispatched successfully.
- * Actual return value is the pointer to taskq entry that was used to
- * dispatch a task. This is useful for debugging.
- */
-/* ARGSUSED */
-taskqid_t
-taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
-{
- taskq_ent_t *tqe = NULL;
-
- ASSERT(tq != NULL);
- ASSERT(func != NULL);
- ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
-
- /*
- * TQ_NOQUEUE flag can't be used with non-dynamic task queues.
- */
- ASSERT(! (flags & TQ_NOQUEUE));
-
- /*
- * Enqueue the task to the underlying queue.
- */
- mutex_enter(&tq->tq_lock);
-
- TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags);
-
- if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) {
- mutex_exit(&tq->tq_lock);
- return ((taskqid_t)NULL);
- }
- TQ_ENQUEUE(tq, tqe, func, arg);
- mutex_exit(&tq->tq_lock);
- return ((taskqid_t)tqe);
-}
-
-/*
- * Wait for all pending tasks to complete.
- * Calling taskq_wait from a task will cause deadlock.
- */
-void
-taskq_wait(taskq_t *tq)
-{
-
- mutex_enter(&tq->tq_lock);
- while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
- cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
- mutex_exit(&tq->tq_lock);
-}
-
-/*
- * Suspend execution of tasks.
- *
- * Tasks in the queue part will be suspended immediately upon return from this
- * function. Pending tasks in the dynamic part will continue to execute, but all
- * new tasks will be suspended.
- */
-void
-taskq_suspend(taskq_t *tq)
-{
- rw_enter(&tq->tq_threadlock, RW_WRITER);
-
- /*
- * Mark task queue as being suspended. Needed for taskq_suspended().
- */
- mutex_enter(&tq->tq_lock);
- ASSERT(!(tq->tq_flags & TASKQ_SUSPENDED));
- tq->tq_flags |= TASKQ_SUSPENDED;
- mutex_exit(&tq->tq_lock);
-}
-
-/*
- * returns: 1 if tq is suspended, 0 otherwise.
- */
-int
-taskq_suspended(taskq_t *tq)
-{
- return ((tq->tq_flags & TASKQ_SUSPENDED) != 0);
-}
-
-/*
- * Resume taskq execution.
- */
-void
-taskq_resume(taskq_t *tq)
-{
- ASSERT(RW_WRITE_HELD(&tq->tq_threadlock));
-
- mutex_enter(&tq->tq_lock);
- ASSERT(tq->tq_flags & TASKQ_SUSPENDED);
- tq->tq_flags &= ~TASKQ_SUSPENDED;
- mutex_exit(&tq->tq_lock);
-
- rw_exit(&tq->tq_threadlock);
-}
-
-/*
- * Worker thread for processing task queue.
- */
-static void
-taskq_thread(void *arg)
-{
- taskq_t *tq = arg;
- taskq_ent_t *tqe;
- callb_cpr_t cprinfo;
- hrtime_t start, end;
-
- CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, tq->tq_name);
-
- mutex_enter(&tq->tq_lock);
- while (tq->tq_flags & TASKQ_ACTIVE) {
- if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) {
- if (--tq->tq_active == 0)
- cv_broadcast(&tq->tq_wait_cv);
- if (tq->tq_flags & TASKQ_CPR_SAFE) {
- cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
- } else {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
- CALLB_CPR_SAFE_END(&cprinfo, &tq->tq_lock);
- }
- tq->tq_active++;
- continue;
- }
- tqe->tqent_prev->tqent_next = tqe->tqent_next;
- tqe->tqent_next->tqent_prev = tqe->tqent_prev;
- mutex_exit(&tq->tq_lock);
-
- rw_enter(&tq->tq_threadlock, RW_READER);
- start = gethrtime();
- DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq,
- taskq_ent_t *, tqe);
- tqe->tqent_func(tqe->tqent_arg);
- DTRACE_PROBE2(taskq__exec__end, taskq_t *, tq,
- taskq_ent_t *, tqe);
- end = gethrtime();
- rw_exit(&tq->tq_threadlock);
-
- mutex_enter(&tq->tq_lock);
- tq->tq_totaltime += end - start;
- tq->tq_executed++;
-
- taskq_ent_free(tq, tqe);
- }
- tq->tq_nthreads--;
- cv_broadcast(&tq->tq_wait_cv);
- ASSERT(!(tq->tq_flags & TASKQ_CPR_SAFE));
- CALLB_CPR_EXIT(&cprinfo);
- thread_exit();
-}
-
-/*
- * Taskq creation. May sleep for memory.
- * Always use automatically generated instances to avoid kstat name space
- * collisions.
- */
-
-taskq_t *
-taskq_create(const char *name, int nthreads, pri_t pri, int minalloc,
- int maxalloc, uint_t flags)
-{
- return taskq_create_common(name, 0, nthreads, pri, minalloc,
- maxalloc, flags | TASKQ_NOINSTANCE);
-}
-
-static taskq_t *
-taskq_create_common(const char *name, int instance, int nthreads, pri_t pri,
- int minalloc, int maxalloc, uint_t flags)
-{
- taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP);
- uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus);
- uint_t bsize; /* # of buckets - always power of 2 */
-
- ASSERT(instance == 0);
- ASSERT(flags == TASKQ_PREPOPULATE | TASKQ_NOINSTANCE);
-
- /*
- * TASKQ_CPR_SAFE and TASKQ_DYNAMIC flags are mutually exclusive.
- */
- ASSERT((flags & (TASKQ_DYNAMIC | TASKQ_CPR_SAFE)) !=
- ((TASKQ_DYNAMIC | TASKQ_CPR_SAFE)));
-
- ASSERT(tq->tq_buckets == NULL);
-
- bsize = 1 << (highbit(ncpus) - 1);
- ASSERT(bsize >= 1);
- bsize = MIN(bsize, taskq_maxbuckets);
-
- tq->tq_maxsize = nthreads;
-
- (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
- tq->tq_name[TASKQ_NAMELEN] = '\0';
- /* Make sure the name conforms to the rules for C indentifiers */
- strident_canon(tq->tq_name, TASKQ_NAMELEN);
-
- tq->tq_flags = flags | TASKQ_ACTIVE;
- tq->tq_active = nthreads;
- tq->tq_nthreads = nthreads;
- tq->tq_minalloc = minalloc;
- tq->tq_maxalloc = maxalloc;
- tq->tq_nbuckets = bsize;
- tq->tq_pri = pri;
-
- if (flags & TASKQ_PREPOPULATE) {
- mutex_enter(&tq->tq_lock);
- while (minalloc-- > 0)
- taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
- mutex_exit(&tq->tq_lock);
- }
-
- if (nthreads == 1) {
- tq->tq_thread = thread_create(NULL, 0, taskq_thread, tq,
- 0, NULL, TS_RUN, pri);
- } else {
- kthread_t **tpp = kmem_alloc(sizeof (kthread_t *) * nthreads,
- KM_SLEEP);
-
- tq->tq_threadlist = tpp;
-
- mutex_enter(&tq->tq_lock);
- while (nthreads-- > 0) {
- *tpp = thread_create(NULL, 0, taskq_thread, tq,
- 0, NULL, TS_RUN, pri);
- tpp++;
- }
- mutex_exit(&tq->tq_lock);
- }
-
- return (tq);
-}
-
-/*
- * taskq_destroy().
- *
- * Assumes: by the time taskq_destroy is called no one will use this task queue
- * in any way and no one will try to dispatch entries in it.
- */
-void
-taskq_destroy(taskq_t *tq)
-{
- taskq_bucket_t *b = tq->tq_buckets;
- int bid = 0;
-
- ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE));
-
- /*
- * Wait for any pending entries to complete.
- */
- taskq_wait(tq);
-
- mutex_enter(&tq->tq_lock);
- ASSERT((tq->tq_task.tqent_next == &tq->tq_task) &&
- (tq->tq_active == 0));
-
- if ((tq->tq_nthreads > 1) && (tq->tq_threadlist != NULL))
- kmem_free(tq->tq_threadlist, sizeof (kthread_t *) *
- tq->tq_nthreads);
-
- tq->tq_flags &= ~TASKQ_ACTIVE;
- cv_broadcast(&tq->tq_dispatch_cv);
- while (tq->tq_nthreads != 0)
- cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
-
- tq->tq_minalloc = 0;
- while (tq->tq_nalloc != 0)
- taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
-
- mutex_exit(&tq->tq_lock);
-
- /*
- * Mark each bucket as closing and wakeup all sleeping threads.
- */
- for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) {
- taskq_ent_t *tqe;
-
- mutex_enter(&b->tqbucket_lock);
-
- b->tqbucket_flags |= TQBUCKET_CLOSE;
- /* Wakeup all sleeping threads */
-
- for (tqe = b->tqbucket_freelist.tqent_next;
- tqe != &b->tqbucket_freelist; tqe = tqe->tqent_next)
- cv_signal(&tqe->tqent_cv);
-
- ASSERT(b->tqbucket_nalloc == 0);
-
- /*
- * At this point we waited for all pending jobs to complete (in
- * both the task queue and the bucket and no new jobs should
- * arrive. Wait for all threads to die.
- */
- while (b->tqbucket_nfree > 0)
- cv_wait(&b->tqbucket_cv, &b->tqbucket_lock);
- mutex_exit(&b->tqbucket_lock);
- mutex_destroy(&b->tqbucket_lock);
- cv_destroy(&b->tqbucket_cv);
- }
-
- if (tq->tq_buckets != NULL) {
- ASSERT(tq->tq_flags & TASKQ_DYNAMIC);
- kmem_free(tq->tq_buckets,
- sizeof (taskq_bucket_t) * tq->tq_nbuckets);
-
- /* Cleanup fields before returning tq to the cache */
- tq->tq_buckets = NULL;
- tq->tq_tcreates = 0;
- tq->tq_tdeaths = 0;
- } else {
- ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
- }
-
- tq->tq_totaltime = 0;
- tq->tq_tasks = 0;
- tq->tq_maxtasks = 0;
- tq->tq_executed = 0;
- kmem_cache_free(taskq_cache, tq);
-}
-
-SYSINIT(sol_taskq, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, taskq_init, NULL);
-SYSUNINIT(sol_taskq, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, taskq_fini, NULL);
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr.c b/sys/contrib/opensolaris/uts/common/rpc/xdr.c
deleted file mode 100644
index e934668..0000000
--- a/sys/contrib/opensolaris/uts/common/rpc/xdr.c
+++ /dev/null
@@ -1,673 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * Portions of this source code were derived from Berkeley 4.3 BSD
- * under license from the Regents of the University of California.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * xdr.c, generic XDR routines implementation.
- * These are the "generic" xdr routines used to serialize and de-serialize
- * most common data items. See xdr.h for more info on the interface to
- * xdr.
- */
-
-#include <sys/param.h>
-#include <sys/cmn_err.h>
-#include <sys/types.h>
-#include <sys/systm.h>
-
-#include <rpc/types.h>
-#include <rpc/xdr.h>
-
-#pragma weak xdr_int32_t = xdr_int
-#pragma weak xdr_uint32_t = xdr_u_int
-#pragma weak xdr_int64_t = xdr_longlong_t
-#pragma weak xdr_uint64_t = xdr_u_longlong_t
-
-#if defined(sun)
-#if !defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)
-#error "Exactly one of _BIG_ENDIAN or _LITTLE_ENDIAN must be defined"
-#elif defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN)
-#error "Only one of _BIG_ENDIAN or _LITTLE_ENDIAN may be defined"
-#endif
-#endif
-
-/*
- * constants specific to the xdr "protocol"
- */
-#define XDR_FALSE ((int32_t)0)
-#define XDR_TRUE ((int32_t)1)
-#define LASTUNSIGNED ((uint_t)0-1)
-
-/*
- * for unit alignment
- */
-static char xdr_zero[BYTES_PER_XDR_UNIT] = { 0, 0, 0, 0 };
-
-/*
- * Free a data structure using XDR
- * Not a filter, but a convenient utility nonetheless
- */
-void
-xdr_free(xdrproc_t proc, char *objp)
-{
- XDR x;
-
- x.x_op = XDR_FREE;
- (*proc)(&x, objp);
-}
-
-/*
- * XDR nothing
- */
-bool_t
-xdr_void(void)
-{
- return (TRUE);
-}
-
-/*
- * XDR integers
- *
- * PSARC 2003/523 Contract Private Interface
- * xdr_int
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-bool_t
-xdr_int(XDR *xdrs, int *ip)
-{
- if (xdrs->x_op == XDR_ENCODE)
- return (XDR_PUTINT32(xdrs, ip));
-
- if (xdrs->x_op == XDR_DECODE)
- return (XDR_GETINT32(xdrs, ip));
-
- if (xdrs->x_op == XDR_FREE)
- return (TRUE);
-
-#ifdef DEBUG
- printf("xdr_int: FAILED\n");
-#endif
- return (FALSE);
-}
-
-/*
- * XDR unsigned integers
- *
- * PSARC 2003/523 Contract Private Interface
- * xdr_u_int
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-bool_t
-xdr_u_int(XDR *xdrs, uint_t *up)
-{
- if (xdrs->x_op == XDR_ENCODE)
- return (XDR_PUTINT32(xdrs, (int32_t *)up));
-
- if (xdrs->x_op == XDR_DECODE)
- return (XDR_GETINT32(xdrs, (int32_t *)up));
-
- if (xdrs->x_op == XDR_FREE)
- return (TRUE);
-
-#ifdef DEBUG
- printf("xdr_int: FAILED\n");
-#endif
- return (FALSE);
-}
-
-
-#if defined(_ILP32)
-/*
- * xdr_long and xdr_u_long for binary compatability on ILP32 kernels.
- *
- * No prototypes since new code should not be using these interfaces.
- */
-bool_t
-xdr_long(XDR *xdrs, long *ip)
-{
- return (xdr_int(xdrs, (int *)ip));
-}
-
-bool_t
-xdr_u_long(XDR *xdrs, unsigned long *up)
-{
- return (xdr_u_int(xdrs, (uint_t *)up));
-}
-#endif /* _ILP32 */
-
-
-/*
- * XDR long long integers
- */
-bool_t
-xdr_longlong_t(XDR *xdrs, longlong_t *hp)
-{
- if (xdrs->x_op == XDR_ENCODE) {
-#if BYTE_ORDER == _LITTLE_ENDIAN
- if (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)) == TRUE) {
- return (XDR_PUTINT32(xdrs, (int32_t *)hp));
- }
-#else
- if (XDR_PUTINT32(xdrs, (int32_t *)hp) == TRUE) {
- return (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)));
- }
-#endif
- return (FALSE);
-
- }
- if (xdrs->x_op == XDR_DECODE) {
-#if BYTE_ORDER == _LITTLE_ENDIAN
- if (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)) == TRUE) {
- return (XDR_GETINT32(xdrs, (int32_t *)hp));
- }
-#else
- if (XDR_GETINT32(xdrs, (int32_t *)hp) == TRUE) {
- return (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)));
- }
-#endif
- return (FALSE);
- }
- return (TRUE);
-}
-
-/*
- * XDR unsigned long long integers
- */
-bool_t
-xdr_u_longlong_t(XDR *xdrs, u_longlong_t *hp)
-{
-
- if (xdrs->x_op == XDR_ENCODE) {
-#if BYTE_ORDER == _LITTLE_ENDIAN
- if (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)) == TRUE) {
- return (XDR_PUTINT32(xdrs, (int32_t *)hp));
- }
-#else
- if (XDR_PUTINT32(xdrs, (int32_t *)hp) == TRUE) {
- return (XDR_PUTINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)));
- }
-#endif
- return (FALSE);
-
- }
- if (xdrs->x_op == XDR_DECODE) {
-#if BYTE_ORDER == _LITTLE_ENDIAN
- if (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)) == TRUE) {
- return (XDR_GETINT32(xdrs, (int32_t *)hp));
- }
-#else
- if (XDR_GETINT32(xdrs, (int32_t *)hp) == TRUE) {
- return (XDR_GETINT32(xdrs, (int32_t *)((char *)hp +
- BYTES_PER_XDR_UNIT)));
- }
-#endif
- return (FALSE);
- }
- return (TRUE);
-}
-
-/*
- * XDR short integers
- */
-bool_t
-xdr_short(XDR *xdrs, short *sp)
-{
- int32_t l;
-
- switch (xdrs->x_op) {
-
- case XDR_ENCODE:
- l = (int32_t)*sp;
- return (XDR_PUTINT32(xdrs, &l));
-
- case XDR_DECODE:
- if (!XDR_GETINT32(xdrs, &l))
- return (FALSE);
- *sp = (short)l;
- return (TRUE);
-
- case XDR_FREE:
- return (TRUE);
- }
- return (FALSE);
-}
-
-/*
- * XDR unsigned short integers
- */
-bool_t
-xdr_u_short(XDR *xdrs, ushort_t *usp)
-{
- uint32_t l;
-
- switch (xdrs->x_op) {
-
- case XDR_ENCODE:
- l = (uint32_t)*usp;
- return (XDR_PUTINT32(xdrs, (int32_t *)&l));
-
- case XDR_DECODE:
- if (!XDR_GETINT32(xdrs, (int32_t *)&l)) {
-#ifdef DEBUG
- printf("xdr_u_short: decode FAILED\n");
-#endif
- return (FALSE);
- }
- *usp = (ushort_t)l;
- return (TRUE);
-
- case XDR_FREE:
- return (TRUE);
- }
-#ifdef DEBUG
- printf("xdr_u_short: bad op FAILED\n");
-#endif
- return (FALSE);
-}
-
-
-/*
- * XDR a char
- */
-bool_t
-xdr_char(XDR *xdrs, char *cp)
-{
- int i;
-
- i = (*cp);
- if (!xdr_int(xdrs, &i)) {
- return (FALSE);
- }
- *cp = (char)i;
- return (TRUE);
-}
-
-/*
- * XDR booleans
- *
- * PSARC 2003/523 Contract Private Interface
- * xdr_bool
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-bool_t
-xdr_bool(XDR *xdrs, bool_t *bp)
-{
- int32_t i32b;
-
- switch (xdrs->x_op) {
-
- case XDR_ENCODE:
- i32b = *bp ? XDR_TRUE : XDR_FALSE;
- return (XDR_PUTINT32(xdrs, &i32b));
-
- case XDR_DECODE:
- if (!XDR_GETINT32(xdrs, &i32b)) {
-#ifdef DEBUG
- printf("xdr_bool: decode FAILED\n");
-#endif
- return (FALSE);
- }
- *bp = (i32b == XDR_FALSE) ? FALSE : TRUE;
- return (TRUE);
-
- case XDR_FREE:
- return (TRUE);
- }
-#ifdef DEBUG
- printf("xdr_bool: bad op FAILED\n");
-#endif
- return (FALSE);
-}
-
-/*
- * XDR enumerations
- *
- * PSARC 2003/523 Contract Private Interface
- * xdr_enum
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-#ifndef lint
-enum sizecheck { SIZEVAL } sizecheckvar; /* used to find the size of */
- /* an enum */
-#endif
-bool_t
-xdr_enum(XDR *xdrs, enum_t *ep)
-{
-#ifndef lint
- /*
- * enums are treated as ints
- */
- if (sizeof (sizecheckvar) == sizeof (int32_t)) {
- return (xdr_int(xdrs, (int32_t *)ep));
- } else if (sizeof (sizecheckvar) == sizeof (short)) {
- return (xdr_short(xdrs, (short *)ep));
- } else {
- return (FALSE);
- }
-#else
- (void) (xdr_short(xdrs, (short *)ep));
- return (xdr_int(xdrs, (int32_t *)ep));
-#endif
-}
-
-/*
- * XDR opaque data
- * Allows the specification of a fixed size sequence of opaque bytes.
- * cp points to the opaque object and cnt gives the byte length.
- *
- * PSARC 2003/523 Contract Private Interface
- * xdr_opaque
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-bool_t
-xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt)
-{
- uint_t rndup;
- static char crud[BYTES_PER_XDR_UNIT];
-
- /*
- * if no data we are done
- */
- if (cnt == 0)
- return (TRUE);
-
- /*
- * round byte count to full xdr units
- */
- rndup = cnt % BYTES_PER_XDR_UNIT;
- if (rndup != 0)
- rndup = BYTES_PER_XDR_UNIT - rndup;
-
- if (xdrs->x_op == XDR_DECODE) {
- if (!XDR_GETBYTES(xdrs, cp, cnt)) {
-#ifdef DEBUG
- printf("xdr_opaque: decode FAILED\n");
-#endif
- return (FALSE);
- }
- if (rndup == 0)
- return (TRUE);
- return (XDR_GETBYTES(xdrs, (caddr_t)crud, rndup));
- }
-
- if (xdrs->x_op == XDR_ENCODE) {
- if (!XDR_PUTBYTES(xdrs, cp, cnt)) {
-#ifdef DEBUG
- printf("xdr_opaque: encode FAILED\n");
-#endif
- return (FALSE);
- }
- if (rndup == 0)
- return (TRUE);
- return (XDR_PUTBYTES(xdrs, xdr_zero, rndup));
- }
-
- if (xdrs->x_op == XDR_FREE)
- return (TRUE);
-
-#ifdef DEBUG
- printf("xdr_opaque: bad op FAILED\n");
-#endif
- return (FALSE);
-}
-
-/*
- * XDR counted bytes
- * *cpp is a pointer to the bytes, *sizep is the count.
- * If *cpp is NULL maxsize bytes are allocated
- *
- * PSARC 2003/523 Contract Private Interface
- * xdr_bytes
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-bool_t
-xdr_bytes(XDR *xdrs, char **cpp, uint_t *sizep, const uint_t maxsize)
-{
- char *sp = *cpp; /* sp is the actual string pointer */
- uint_t nodesize;
-
- /*
- * first deal with the length since xdr bytes are counted
- */
- if (!xdr_u_int(xdrs, sizep)) {
-#ifdef DEBUG
- printf("xdr_bytes: size FAILED\n");
-#endif
- return (FALSE);
- }
- nodesize = *sizep;
- if ((nodesize > maxsize) && (xdrs->x_op != XDR_FREE)) {
-#ifdef DEBUG
- printf("xdr_bytes: bad size (%d) FAILED (%d max)\n",
- nodesize, maxsize);
-#endif
- return (FALSE);
- }
-
- /*
- * now deal with the actual bytes
- */
- switch (xdrs->x_op) {
- case XDR_DECODE:
- if (nodesize == 0)
- return (TRUE);
- if (sp == NULL)
- *cpp = sp = (char *)mem_alloc(nodesize);
- /* FALLTHROUGH */
-
- case XDR_ENCODE:
- return (xdr_opaque(xdrs, sp, nodesize));
-
- case XDR_FREE:
- if (sp != NULL) {
- mem_free(sp, nodesize);
- *cpp = NULL;
- }
- return (TRUE);
- }
-#ifdef DEBUG
- printf("xdr_bytes: bad op FAILED\n");
-#endif
- return (FALSE);
-}
-
-/*
- * Implemented here due to commonality of the object.
- */
-bool_t
-xdr_netobj(XDR *xdrs, struct netobj *np)
-{
- return (xdr_bytes(xdrs, &np->n_bytes, &np->n_len, MAX_NETOBJ_SZ));
-}
-
-/*
- * XDR a descriminated union
- * Support routine for discriminated unions.
- * You create an array of xdrdiscrim structures, terminated with
- * an entry with a null procedure pointer. The routine gets
- * the discriminant value and then searches the array of xdrdiscrims
- * looking for that value. It calls the procedure given in the xdrdiscrim
- * to handle the discriminant. If there is no specific routine a default
- * routine may be called.
- * If there is no specific or default routine an error is returned.
- */
-bool_t
-xdr_union(XDR *xdrs, enum_t *dscmp, char *unp,
- const struct xdr_discrim *choices, const xdrproc_t dfault)
-{
- enum_t dscm;
-
- /*
- * we deal with the discriminator; it's an enum
- */
- if (!xdr_enum(xdrs, dscmp)) {
-#ifdef DEBUG
- printf("xdr_enum: dscmp FAILED\n");
-#endif
- return (FALSE);
- }
- dscm = *dscmp;
-
- /*
- * search choices for a value that matches the discriminator.
- * if we find one, execute the xdr routine for that value.
- */
- for (; choices->proc != NULL_xdrproc_t; choices++) {
- if (choices->value == dscm)
- return ((*(choices->proc))(xdrs, unp, LASTUNSIGNED));
- }
-
- /*
- * no match - execute the default xdr routine if there is one
- */
- return ((dfault == NULL_xdrproc_t) ? FALSE :
- (*dfault)(xdrs, unp, LASTUNSIGNED));
-}
-
-
-/*
- * Non-portable xdr primitives.
- * Care should be taken when moving these routines to new architectures.
- */
-
-
-/*
- * XDR null terminated ASCII strings
- * xdr_string deals with "C strings" - arrays of bytes that are
- * terminated by a NULL character. The parameter cpp references a
- * pointer to storage; If the pointer is null, then the necessary
- * storage is allocated. The last parameter is the max allowed length
- * of the string as specified by a protocol.
- */
-bool_t
-xdr_string(XDR *xdrs, char **cpp, const uint_t maxsize)
-{
- char *sp = *cpp; /* sp is the actual string pointer */
- uint_t size;
- uint_t nodesize;
-
- /*
- * first deal with the length since xdr strings are counted-strings
- */
- switch (xdrs->x_op) {
- case XDR_FREE:
- if (sp == NULL)
- return (TRUE); /* already free */
- /* FALLTHROUGH */
- case XDR_ENCODE:
- size = (sp != NULL) ? (uint_t)strlen(sp) : 0;
- break;
- case XDR_DECODE:
- break;
- }
- if (!xdr_u_int(xdrs, &size)) {
-#ifdef DEBUG
- printf("xdr_string: size FAILED\n");
-#endif
- return (FALSE);
- }
- if (size > maxsize) {
-#ifdef DEBUG
- printf("xdr_string: bad size FAILED\n");
-#endif
- return (FALSE);
- }
- nodesize = size + 1;
-
- /*
- * now deal with the actual bytes
- */
- switch (xdrs->x_op) {
- case XDR_DECODE:
- if (nodesize == 0)
- return (TRUE);
- if (sp == NULL)
- sp = (char *)mem_alloc(nodesize);
- sp[size] = 0;
- if (!xdr_opaque(xdrs, sp, size)) {
- /*
- * free up memory if allocated here
- */
- if (*cpp == NULL) {
- mem_free(sp, nodesize);
- }
- return (FALSE);
- }
- if (strlen(sp) != size) {
- if (*cpp == NULL) {
- mem_free(sp, nodesize);
- }
- return (FALSE);
- }
- *cpp = sp;
- return (TRUE);
-
- case XDR_ENCODE:
- return (xdr_opaque(xdrs, sp, size));
-
- case XDR_FREE:
- mem_free(sp, nodesize);
- *cpp = NULL;
- return (TRUE);
- }
-#ifdef DEBUG
- printf("xdr_string: bad op FAILED\n");
-#endif
- return (FALSE);
-}
-
-/*
- * Wrapper for xdr_string that can be called directly from
- * routines like clnt_call
- */
-bool_t
-xdr_wrapstring(XDR *xdrs, char **cpp)
-{
- if (xdr_string(xdrs, cpp, LASTUNSIGNED))
- return (TRUE);
- return (FALSE);
-}
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr.h b/sys/contrib/opensolaris/uts/common/rpc/xdr.h
deleted file mode 100644
index d60809e..0000000
--- a/sys/contrib/opensolaris/uts/common/rpc/xdr.h
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-/*
- * Portions of this source code were derived from Berkeley
- * 4.3 BSD under license from the Regents of the University of
- * California.
- */
-
-/*
- * xdr.h, External Data Representation Serialization Routines.
- *
- */
-
-#ifndef _RPC_XDR_H
-#define _RPC_XDR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/byteorder.h> /* For all ntoh* and hton*() kind of macros */
-#include <rpc/types.h> /* For all ntoh* and hton*() kind of macros */
-#ifndef _KERNEL
-#include <stdio.h> /* defines FILE *, used in ANSI C function prototypes */
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * XDR provides a conventional way for converting between C data
- * types and an external bit-string representation. Library supplied
- * routines provide for the conversion on built-in C data types. These
- * routines and utility routines defined here are used to help implement
- * a type encode/decode routine for each user-defined type.
- *
- * Each data type provides a single procedure which takes two arguments:
- *
- * bool_t
- * xdrproc(xdrs, argresp)
- * XDR *xdrs;
- * <type> *argresp;
- *
- * xdrs is an instance of a XDR handle, to which or from which the data
- * type is to be converted. argresp is a pointer to the structure to be
- * converted. The XDR handle contains an operation field which indicates
- * which of the operations (ENCODE, DECODE * or FREE) is to be performed.
- *
- * XDR_DECODE may allocate space if the pointer argresp is null. This
- * data can be freed with the XDR_FREE operation.
- *
- * We write only one procedure per data type to make it easy
- * to keep the encode and decode procedures for a data type consistent.
- * In many cases the same code performs all operations on a user defined type,
- * because all the hard work is done in the component type routines.
- * decode as a series of calls on the nested data types.
- */
-
-/*
- * Xdr operations. XDR_ENCODE causes the type to be encoded into the
- * stream. XDR_DECODE causes the type to be extracted from the stream.
- * XDR_FREE can be used to release the space allocated by an XDR_DECODE
- * request.
- */
-enum xdr_op {
- XDR_ENCODE = 0,
- XDR_DECODE = 1,
- XDR_FREE = 2
-};
-
-/*
- * This is the number of bytes per unit of external data.
- */
-#define BYTES_PER_XDR_UNIT (4)
-#define RNDUP(x) ((((x) + BYTES_PER_XDR_UNIT - 1) / BYTES_PER_XDR_UNIT) \
- * BYTES_PER_XDR_UNIT)
-
-/*
- * The XDR handle.
- * Contains operation which is being applied to the stream,
- * an operations vector for the paticular implementation (e.g. see xdr_mem.c),
- * and two private fields for the use of the particular impelementation.
- *
- * PSARC 2003/523 Contract Private Interface
- * XDR
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-typedef struct XDR {
- enum xdr_op x_op; /* operation; fast additional param */
- struct xdr_ops *x_ops;
- caddr_t x_public; /* users' data */
- caddr_t x_private; /* pointer to private data */
- caddr_t x_base; /* private used for position info */
- int x_handy; /* extra private word */
-} XDR;
-
-/*
- * PSARC 2003/523 Contract Private Interface
- * xdr_ops
- * Changes must be reviewed by Solaris File Sharing
- * Changes must be communicated to contract-2003-523@sun.com
- */
-struct xdr_ops {
-#ifdef __STDC__
-#if !defined(_KERNEL)
- bool_t (*x_getlong)(struct XDR *, long *);
- /* get a long from underlying stream */
- bool_t (*x_putlong)(struct XDR *, long *);
- /* put a long to " */
-#endif /* KERNEL */
- bool_t (*x_getbytes)(struct XDR *, caddr_t, int);
- /* get some bytes from " */
- bool_t (*x_putbytes)(struct XDR *, caddr_t, int);
- /* put some bytes to " */
- uint_t (*x_getpostn)(struct XDR *);
- /* returns bytes off from beginning */
- bool_t (*x_setpostn)(struct XDR *, uint_t);
- /* lets you reposition the stream */
- rpc_inline_t *(*x_inline)(struct XDR *, int);
- /* buf quick ptr to buffered data */
- void (*x_destroy)(struct XDR *);
- /* free privates of this xdr_stream */
- bool_t (*x_control)(struct XDR *, int, void *);
-#if defined(_LP64) || defined(_KERNEL)
- bool_t (*x_getint32)(struct XDR *, int32_t *);
- /* get a int from underlying stream */
- bool_t (*x_putint32)(struct XDR *, int32_t *);
- /* put an int to " */
-#endif /* _LP64 || _KERNEL */
-#else
-#if !defined(_KERNEL)
- bool_t (*x_getlong)(); /* get a long from underlying stream */
- bool_t (*x_putlong)(); /* put a long to " */
-#endif /* KERNEL */
- bool_t (*x_getbytes)(); /* get some bytes from " */
- bool_t (*x_putbytes)(); /* put some bytes to " */
- uint_t (*x_getpostn)(); /* returns bytes off from beginning */
- bool_t (*x_setpostn)(); /* lets you reposition the stream */
- rpc_inline_t *(*x_inline)();
- /* buf quick ptr to buffered data */
- void (*x_destroy)(); /* free privates of this xdr_stream */
- bool_t (*x_control)();
-#if defined(_LP64) || defined(_KERNEL)
- bool_t (*x_getint32)();
- bool_t (*x_putint32)();
-#endif /* _LP64 || defined(_KERNEL) */
-#endif
-};
-
-/*
- * Operations defined on a XDR handle
- *
- * XDR *xdrs;
- * long *longp;
- * caddr_t addr;
- * uint_t len;
- * uint_t pos;
- */
-#if !defined(_KERNEL)
-#define XDR_GETLONG(xdrs, longp) \
- (*(xdrs)->x_ops->x_getlong)(xdrs, longp)
-#define xdr_getlong(xdrs, longp) \
- (*(xdrs)->x_ops->x_getlong)(xdrs, longp)
-
-#define XDR_PUTLONG(xdrs, longp) \
- (*(xdrs)->x_ops->x_putlong)(xdrs, longp)
-#define xdr_putlong(xdrs, longp) \
- (*(xdrs)->x_ops->x_putlong)(xdrs, longp)
-#endif /* KERNEL */
-
-
-#if !defined(_LP64) && !defined(_KERNEL)
-
-/*
- * For binary compatability on ILP32 we do not change the shape
- * of the XDR structure and the GET/PUTINT32 functions just use
- * the get/putlong vectors which operate on identically-sized
- * units of data.
- */
-
-#define XDR_GETINT32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_getlong)(xdrs, (long *)int32p)
-#define xdr_getint32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_getlong)(xdrs, (long *)int32p)
-
-#define XDR_PUTINT32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_putlong)(xdrs, (long *)int32p)
-#define xdr_putint32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_putlong)(xdrs, (long *)int32p)
-
-#else /* !_LP64 && !_KERNEL */
-
-#define XDR_GETINT32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_getint32)(xdrs, int32p)
-#define xdr_getint32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_getint32)(xdrs, int32p)
-
-#define XDR_PUTINT32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_putint32)(xdrs, int32p)
-#define xdr_putint32(xdrs, int32p) \
- (*(xdrs)->x_ops->x_putint32)(xdrs, int32p)
-
-#endif /* !_LP64 && !_KERNEL */
-
-#define XDR_GETBYTES(xdrs, addr, len) \
- (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len)
-#define xdr_getbytes(xdrs, addr, len) \
- (*(xdrs)->x_ops->x_getbytes)(xdrs, addr, len)
-
-#define XDR_PUTBYTES(xdrs, addr, len) \
- (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len)
-#define xdr_putbytes(xdrs, addr, len) \
- (*(xdrs)->x_ops->x_putbytes)(xdrs, addr, len)
-
-#define XDR_GETPOS(xdrs) \
- (*(xdrs)->x_ops->x_getpostn)(xdrs)
-#define xdr_getpos(xdrs) \
- (*(xdrs)->x_ops->x_getpostn)(xdrs)
-
-#define XDR_SETPOS(xdrs, pos) \
- (*(xdrs)->x_ops->x_setpostn)(xdrs, pos)
-#define xdr_setpos(xdrs, pos) \
- (*(xdrs)->x_ops->x_setpostn)(xdrs, pos)
-
-#define XDR_INLINE(xdrs, len) \
- (*(xdrs)->x_ops->x_inline)(xdrs, len)
-#define xdr_inline(xdrs, len) \
- (*(xdrs)->x_ops->x_inline)(xdrs, len)
-
-#define XDR_DESTROY(xdrs) \
- (*(xdrs)->x_ops->x_destroy)(xdrs)
-#define xdr_destroy(xdrs) \
- (*(xdrs)->x_ops->x_destroy)(xdrs)
-
-#define XDR_CONTROL(xdrs, req, op) \
- (*(xdrs)->x_ops->x_control)(xdrs, req, op)
-#define xdr_control(xdrs, req, op) \
- (*(xdrs)->x_ops->x_control)(xdrs, req, op)
-
-/*
- * Support struct for discriminated unions.
- * You create an array of xdrdiscrim structures, terminated with
- * a entry with a null procedure pointer. The xdr_union routine gets
- * the discriminant value and then searches the array of structures
- * for a matching value. If a match is found the associated xdr routine
- * is called to handle that part of the union. If there is
- * no match, then a default routine may be called.
- * If there is no match and no default routine it is an error.
- */
-
-
-/*
- * A xdrproc_t exists for each data type which is to be encoded or decoded.
- *
- * The second argument to the xdrproc_t is a pointer to an opaque pointer.
- * The opaque pointer generally points to a structure of the data type
- * to be decoded. If this pointer is 0, then the type routines should
- * allocate dynamic storage of the appropriate size and return it.
- * bool_t (*xdrproc_t)(XDR *, void *);
- */
-#ifdef __cplusplus
-typedef bool_t (*xdrproc_t)(XDR *, void *);
-#else
-#ifdef __STDC__
-typedef bool_t (*xdrproc_t)(); /* For Backward compatibility */
-#else
-typedef bool_t (*xdrproc_t)();
-#endif
-#endif
-
-#define NULL_xdrproc_t ((xdrproc_t)0)
-
-#if defined(_LP64) || defined(_I32LPx)
-#define xdr_rpcvers(xdrs, versp) xdr_u_int(xdrs, versp)
-#define xdr_rpcprog(xdrs, progp) xdr_u_int(xdrs, progp)
-#define xdr_rpcproc(xdrs, procp) xdr_u_int(xdrs, procp)
-#define xdr_rpcprot(xdrs, protp) xdr_u_int(xdrs, protp)
-#define xdr_rpcport(xdrs, portp) xdr_u_int(xdrs, portp)
-#else
-#define xdr_rpcvers(xdrs, versp) xdr_u_long(xdrs, versp)
-#define xdr_rpcprog(xdrs, progp) xdr_u_long(xdrs, progp)
-#define xdr_rpcproc(xdrs, procp) xdr_u_long(xdrs, procp)
-#define xdr_rpcprot(xdrs, protp) xdr_u_long(xdrs, protp)
-#define xdr_rpcport(xdrs, portp) xdr_u_long(xdrs, portp)
-#endif
-
-struct xdr_discrim {
- int value;
- xdrproc_t proc;
-};
-
-/*
- * In-line routines for fast encode/decode of primitve data types.
- * Caveat emptor: these use single memory cycles to get the
- * data from the underlying buffer, and will fail to operate
- * properly if the data is not aligned. The standard way to use these
- * is to say:
- * if ((buf = XDR_INLINE(xdrs, count)) == NULL)
- * return (FALSE);
- * <<< macro calls >>>
- * where ``count'' is the number of bytes of data occupied
- * by the primitive data types.
- *
- * N.B. and frozen for all time: each data type here uses 4 bytes
- * of external representation.
- */
-
-#define IXDR_GET_INT32(buf) ((int32_t)ntohl((uint32_t)*(buf)++))
-#define IXDR_PUT_INT32(buf, v) (*(buf)++ = (int32_t)htonl((uint32_t)v))
-#define IXDR_GET_U_INT32(buf) ((uint32_t)IXDR_GET_INT32(buf))
-#define IXDR_PUT_U_INT32(buf, v) IXDR_PUT_INT32((buf), ((int32_t)(v)))
-
-#if !defined(_KERNEL) && !defined(_LP64)
-
-#define IXDR_GET_LONG(buf) ((long)ntohl((ulong_t)*(buf)++))
-#define IXDR_PUT_LONG(buf, v) (*(buf)++ = (long)htonl((ulong_t)v))
-#define IXDR_GET_U_LONG(buf) ((ulong_t)IXDR_GET_LONG(buf))
-#define IXDR_PUT_U_LONG(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
-
-#define IXDR_GET_BOOL(buf) ((bool_t)IXDR_GET_LONG(buf))
-#define IXDR_GET_ENUM(buf, t) ((t)IXDR_GET_LONG(buf))
-#define IXDR_GET_SHORT(buf) ((short)IXDR_GET_LONG(buf))
-#define IXDR_GET_U_SHORT(buf) ((ushort_t)IXDR_GET_LONG(buf))
-
-#define IXDR_PUT_BOOL(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
-#define IXDR_PUT_ENUM(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
-#define IXDR_PUT_SHORT(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
-#define IXDR_PUT_U_SHORT(buf, v) IXDR_PUT_LONG((buf), ((long)(v)))
-
-#else
-
-#define IXDR_GET_BOOL(buf) ((bool_t)IXDR_GET_INT32(buf))
-#define IXDR_GET_ENUM(buf, t) ((t)IXDR_GET_INT32(buf))
-#define IXDR_GET_SHORT(buf) ((short)IXDR_GET_INT32(buf))
-#define IXDR_GET_U_SHORT(buf) ((ushort_t)IXDR_GET_INT32(buf))
-
-#define IXDR_PUT_BOOL(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
-#define IXDR_PUT_ENUM(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
-#define IXDR_PUT_SHORT(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
-#define IXDR_PUT_U_SHORT(buf, v) IXDR_PUT_INT32((buf), ((int)(v)))
-
-#endif
-
-#if BYTE_ORDER == _BIG_ENDIAN
-#define IXDR_GET_HYPER(buf, v) { \
- *((int32_t *)(&v)) = ntohl(*(uint32_t *)buf++); \
- *((int32_t *)(((char *)&v) + BYTES_PER_XDR_UNIT)) \
- = ntohl(*(uint32_t *)buf++); \
- }
-#define IXDR_PUT_HYPER(buf, v) { \
- *(buf)++ = (int32_t)htonl(*(uint32_t *) \
- ((char *)&v)); \
- *(buf)++ = \
- (int32_t)htonl(*(uint32_t *)(((char *)&v) \
- + BYTES_PER_XDR_UNIT)); \
- }
-#else
-
-#define IXDR_GET_HYPER(buf, v) { \
- *((int32_t *)(((char *)&v) + \
- BYTES_PER_XDR_UNIT)) \
- = ntohl(*(uint32_t *)buf++); \
- *((int32_t *)(&v)) = \
- ntohl(*(uint32_t *)buf++); \
- }
-
-#define IXDR_PUT_HYPER(buf, v) { \
- *(buf)++ = \
- (int32_t)htonl(*(uint32_t *)(((char *)&v) + \
- BYTES_PER_XDR_UNIT)); \
- *(buf)++ = \
- (int32_t)htonl(*(uint32_t *)((char *)&v)); \
- }
-#endif
-#define IXDR_GET_U_HYPER(buf, v) IXDR_GET_HYPER(buf, v)
-#define IXDR_PUT_U_HYPER(buf, v) IXDR_PUT_HYPER(buf, v)
-
-
-/*
- * These are the "generic" xdr routines.
- */
-#ifdef __STDC__
-extern bool_t xdr_void(void);
-extern bool_t xdr_int(XDR *, int *);
-extern bool_t xdr_u_int(XDR *, uint_t *);
-extern bool_t xdr_long(XDR *, long *);
-extern bool_t xdr_u_long(XDR *, ulong_t *);
-extern bool_t xdr_short(XDR *, short *);
-extern bool_t xdr_u_short(XDR *, ushort_t *);
-extern bool_t xdr_bool(XDR *, bool_t *);
-extern bool_t xdr_enum(XDR *, enum_t *);
-extern bool_t xdr_array(XDR *, caddr_t *, uint_t *, const uint_t,
- const uint_t, const xdrproc_t);
-extern bool_t xdr_bytes(XDR *, char **, uint_t *, const uint_t);
-extern bool_t xdr_opaque(XDR *, caddr_t, const uint_t);
-extern bool_t xdr_string(XDR *, char **, const uint_t);
-extern bool_t xdr_union(XDR *, enum_t *, char *,
- const struct xdr_discrim *, const xdrproc_t);
-extern unsigned int xdr_sizeof(xdrproc_t, void *);
-
-extern bool_t xdr_hyper(XDR *, longlong_t *);
-extern bool_t xdr_longlong_t(XDR *, longlong_t *);
-extern bool_t xdr_u_hyper(XDR *, u_longlong_t *);
-extern bool_t xdr_u_longlong_t(XDR *, u_longlong_t *);
-
-extern bool_t xdr_char(XDR *, char *);
-extern bool_t xdr_wrapstring(XDR *, char **);
-extern bool_t xdr_reference(XDR *, caddr_t *, uint_t, const xdrproc_t);
-extern bool_t xdr_pointer(XDR *, char **, uint_t, const xdrproc_t);
-extern void xdr_free(xdrproc_t, char *);
-extern bool_t xdr_time_t(XDR *, time_t *);
-
-extern bool_t xdr_int8_t(XDR *, int8_t *);
-extern bool_t xdr_uint8_t(XDR *, uint8_t *);
-extern bool_t xdr_int16_t(XDR *, int16_t *);
-extern bool_t xdr_uint16_t(XDR *, uint16_t *);
-extern bool_t xdr_int32_t(XDR *, int32_t *);
-extern bool_t xdr_uint32_t(XDR *, uint32_t *);
-#if defined(_INT64_TYPE)
-extern bool_t xdr_int64_t(XDR *, int64_t *);
-extern bool_t xdr_uint64_t(XDR *, uint64_t *);
-#endif
-
-#ifndef _KERNEL
-extern bool_t xdr_u_char(XDR *, uchar_t *);
-extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, const
-xdrproc_t);
-extern bool_t xdr_float(XDR *, float *);
-extern bool_t xdr_double(XDR *, double *);
-extern bool_t xdr_quadruple(XDR *, long double *);
-#endif /* !_KERNEL */
-#else
-extern bool_t xdr_void();
-extern bool_t xdr_int();
-extern bool_t xdr_u_int();
-extern bool_t xdr_long();
-extern bool_t xdr_u_long();
-extern bool_t xdr_short();
-extern bool_t xdr_u_short();
-extern bool_t xdr_bool();
-extern bool_t xdr_enum();
-extern bool_t xdr_array();
-extern bool_t xdr_bytes();
-extern bool_t xdr_opaque();
-extern bool_t xdr_string();
-extern bool_t xdr_union();
-
-extern bool_t xdr_hyper();
-extern bool_t xdr_longlong_t();
-extern bool_t xdr_u_hyper();
-extern bool_t xdr_u_longlong_t();
-extern bool_t xdr_char();
-extern bool_t xdr_reference();
-extern bool_t xdr_pointer();
-extern void xdr_free();
-extern bool_t xdr_wrapstring();
-extern bool_t xdr_time_t();
-
-extern bool_t xdr_int8_t();
-extern bool_t xdr_uint8_t();
-extern bool_t xdr_int16_t();
-extern bool_t xdr_uint16_t();
-extern bool_t xdr_int32_t();
-extern bool_t xdr_uint32_t();
-#if defined(_INT64_TYPE)
-extern bool_t xdr_int64_t();
-extern bool_t xdr_uint64_t();
-#endif
-
-#ifndef _KERNEL
-extern bool_t xdr_u_char();
-extern bool_t xdr_vector();
-extern bool_t xdr_float();
-extern bool_t xdr_double();
-extern bool_t xdr_quadruple();
-#endif /* !_KERNEL */
-#endif
-
-/*
- * Common opaque bytes objects used by many rpc protocols;
- * declared here due to commonality.
- */
-#define MAX_NETOBJ_SZ 1024
-struct netobj {
- uint_t n_len;
- char *n_bytes;
-};
-typedef struct netobj netobj;
-
-#ifdef __STDC__
-extern bool_t xdr_netobj(XDR *, netobj *);
-#else
-extern bool_t xdr_netobj();
-#endif
-
-/*
- * These are XDR control operators
- */
-
-#define XDR_GET_BYTES_AVAIL 1
-
-struct xdr_bytesrec {
- bool_t xc_is_last_record;
- size_t xc_num_avail;
-};
-
-typedef struct xdr_bytesrec xdr_bytesrec;
-
-/*
- * These are the request arguments to XDR_CONTROL.
- *
- * XDR_PEEK - returns the contents of the next XDR unit on the XDR stream.
- * XDR_SKIPBYTES - skips the next N bytes in the XDR stream.
- * XDR_RDMAGET - for xdr implementation over RDMA, gets private flags from
- * the XDR stream being moved over RDMA
- * XDR_RDMANOCHUNK - for xdr implementaion over RDMA, sets private flags in
- * the XDR stream moving over RDMA.
- */
-#ifdef _KERNEL
-#define XDR_PEEK 2
-#define XDR_SKIPBYTES 3
-#define XDR_RDMAGET 4
-#define XDR_RDMASET 5
-#endif
-
-/*
- * These are the public routines for the various implementations of
- * xdr streams.
- */
-#ifndef _KERNEL
-#ifdef __STDC__
-extern void xdrmem_create(XDR *, const caddr_t, const uint_t, const enum
-xdr_op);
- /* XDR using memory buffers */
-extern void xdrrec_create(XDR *, const uint_t, const uint_t, const caddr_t,
-int (*) (void *, caddr_t, int), int (*) (void *, caddr_t, int));
-/* XDR pseudo records for tcp */
-extern bool_t xdrrec_endofrecord(XDR *, bool_t);
-/* make end of xdr record */
-extern bool_t xdrrec_skiprecord(XDR *);
-/* move to beginning of next record */
-extern bool_t xdrrec_eof(XDR *);
-extern uint_t xdrrec_readbytes(XDR *, caddr_t, uint_t);
-/* true if no more input */
-#else
-extern void xdrmem_create();
-extern void xdrstdio_create();
-extern void xdrrec_create();
-extern bool_t xdrrec_endofrecord();
-extern bool_t xdrrec_skiprecord();
-extern bool_t xdrrec_eof();
-extern uint_t xdrrec_readbytes();
-#endif
-#else
-
-extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op);
-
-extern struct xdr_ops xdrmblk_ops;
-
-struct rpc_msg;
-extern bool_t xdr_callmsg(XDR *, struct rpc_msg *);
-extern bool_t xdr_replymsg_body(XDR *, struct rpc_msg *);
-extern bool_t xdr_replymsg_hdr(XDR *, struct rpc_msg *);
-
-#include <sys/malloc.h>
-#ifdef mem_alloc
-#undef mem_alloc
-#define mem_alloc(size) malloc((size), M_TEMP, M_WAITOK | M_ZERO)
-#endif
-#ifdef mem_free
-#undef mem_free
-#define mem_free(ptr, size) free((ptr), M_TEMP)
-#endif
-
-#endif /* !_KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* !_RPC_XDR_H */
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c b/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c
deleted file mode 100644
index 3711e53..0000000
--- a/sys/contrib/opensolaris/uts/common/rpc/xdr_array.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * Portions of this source code were derived from Berkeley 4.3 BSD
- * under license from the Regents of the University of California.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * xdr_array.c, Generic XDR routines impelmentation.
- * These are the "non-trivial" xdr primitives used to serialize and de-serialize
- * arrays. See xdr.h for more info on the interface to xdr.
- */
-
-#include <sys/param.h>
-#include <sys/cmn_err.h>
-#include <sys/types.h>
-#include <sys/systm.h>
-
-#include <rpc/types.h>
-#include <rpc/xdr.h>
-
-#define LASTUNSIGNED ((uint_t)0-1)
-
-/*
- * XDR an array of arbitrary elements
- * *addrp is a pointer to the array, *sizep is the number of elements.
- * If addrp is NULL (*sizep * elsize) bytes are allocated.
- * elsize is the size (in bytes) of each element, and elproc is the
- * xdr procedure to call to handle each element of the array.
- */
-bool_t
-xdr_array(XDR *xdrs, caddr_t *addrp, uint_t *sizep, const uint_t maxsize,
- const uint_t elsize, const xdrproc_t elproc)
-{
- uint_t i;
- caddr_t target = *addrp;
- uint_t c; /* the actual element count */
- bool_t stat = TRUE;
- uint_t nodesize;
-
- /* like strings, arrays are really counted arrays */
- if (!xdr_u_int(xdrs, sizep)) {
-#ifdef DEBUG
- printf("xdr_array: size FAILED\n");
-#endif
- return (FALSE);
- }
- c = *sizep;
- if ((c > maxsize || LASTUNSIGNED / elsize < c) &&
- xdrs->x_op != XDR_FREE) {
-#ifdef DEBUG
- printf("xdr_array: bad size FAILED\n");
-#endif
- return (FALSE);
- }
- nodesize = c * elsize;
-
- /*
- * if we are deserializing, we may need to allocate an array.
- * We also save time by checking for a null array if we are freeing.
- */
- if (target == NULL)
- switch (xdrs->x_op) {
- case XDR_DECODE:
- if (c == 0)
- return (TRUE);
- *addrp = target = (char *)mem_alloc(nodesize);
- bzero(target, nodesize);
- break;
-
- case XDR_FREE:
- return (TRUE);
-
- case XDR_ENCODE:
- break;
- }
-
- /*
- * now we xdr each element of array
- */
- for (i = 0; (i < c) && stat; i++) {
- stat = (*elproc)(xdrs, target, LASTUNSIGNED);
- target += elsize;
- }
-
- /*
- * the array may need freeing
- */
- if (xdrs->x_op == XDR_FREE) {
- mem_free(*addrp, nodesize);
- *addrp = NULL;
- }
- return (stat);
-}
diff --git a/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c b/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c
deleted file mode 100644
index 32ff32d..0000000
--- a/sys/contrib/opensolaris/uts/common/rpc/xdr_mem.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * Portions of this source code were derived from Berkeley 4.3 BSD
- * under license from the Regents of the University of California.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * xdr_mem.c, XDR implementation using memory buffers.
- *
- * If you have some data to be interpreted as external data representation
- * or to be converted to external data representation in a memory buffer,
- * then this is the package for you.
- */
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/systm.h>
-
-#include <rpc/types.h>
-#include <rpc/xdr.h>
-
-static struct xdr_ops *xdrmem_ops(void);
-
-/*
- * The procedure xdrmem_create initializes a stream descriptor for a
- * memory buffer.
- */
-void
-xdrmem_create(XDR *xdrs, caddr_t addr, uint_t size, enum xdr_op op)
-{
- xdrs->x_op = op;
- xdrs->x_ops = xdrmem_ops();
- xdrs->x_private = xdrs->x_base = addr;
- xdrs->x_handy = size;
- xdrs->x_public = NULL;
-}
-
-/* ARGSUSED */
-static void
-xdrmem_destroy(XDR *xdrs)
-{
-}
-
-static bool_t
-xdrmem_getint32(XDR *xdrs, int32_t *int32p)
-{
- if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
- return (FALSE);
- /* LINTED pointer alignment */
- *int32p = (int32_t)ntohl((uint32_t)(*((int32_t *)(xdrs->x_private))));
- xdrs->x_private += sizeof (int32_t);
- return (TRUE);
-}
-
-static bool_t
-xdrmem_putint32(XDR *xdrs, int32_t *int32p)
-{
- if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
- return (FALSE);
- /* LINTED pointer alignment */
- *(int32_t *)xdrs->x_private = (int32_t)htonl((uint32_t)(*int32p));
- xdrs->x_private += sizeof (int32_t);
- return (TRUE);
-}
-
-static bool_t
-xdrmem_getbytes(XDR *xdrs, caddr_t addr, int len)
-{
- if ((xdrs->x_handy -= len) < 0)
- return (FALSE);
- bcopy(xdrs->x_private, addr, len);
- xdrs->x_private += len;
- return (TRUE);
-}
-
-static bool_t
-xdrmem_putbytes(XDR *xdrs, caddr_t addr, int len)
-{
- if ((xdrs->x_handy -= len) < 0)
- return (FALSE);
- bcopy(addr, xdrs->x_private, len);
- xdrs->x_private += len;
- return (TRUE);
-}
-
-static uint_t
-xdrmem_getpos(XDR *xdrs)
-{
- return ((uint_t)((uintptr_t)xdrs->x_private - (uintptr_t)xdrs->x_base));
-}
-
-static bool_t
-xdrmem_setpos(XDR *xdrs, uint_t pos)
-{
- caddr_t newaddr = xdrs->x_base + pos;
- caddr_t lastaddr = xdrs->x_private + xdrs->x_handy;
- ptrdiff_t diff;
-
- if (newaddr > lastaddr)
- return (FALSE);
- xdrs->x_private = newaddr;
- diff = lastaddr - newaddr;
- xdrs->x_handy = (int)diff;
- return (TRUE);
-}
-
-static rpc_inline_t *
-xdrmem_inline(XDR *xdrs, int len)
-{
- rpc_inline_t *buf = NULL;
-
- if (xdrs->x_handy >= len) {
- xdrs->x_handy -= len;
- /* LINTED pointer alignment */
- buf = (rpc_inline_t *)xdrs->x_private;
- xdrs->x_private += len;
- }
- return (buf);
-}
-
-static bool_t
-xdrmem_control(XDR *xdrs, int request, void *info)
-{
- xdr_bytesrec *xptr;
- int32_t *int32p;
- int len;
-
- switch (request) {
-
- case XDR_GET_BYTES_AVAIL:
- xptr = (xdr_bytesrec *)info;
- xptr->xc_is_last_record = TRUE;
- xptr->xc_num_avail = xdrs->x_handy;
- return (TRUE);
-
- case XDR_PEEK:
- /*
- * Return the next 4 byte unit in the XDR stream.
- */
- if (xdrs->x_handy < sizeof (int32_t))
- return (FALSE);
- int32p = (int32_t *)info;
- *int32p = (int32_t)ntohl((uint32_t)
- (*((int32_t *)(xdrs->x_private))));
- return (TRUE);
-
- case XDR_SKIPBYTES:
- /*
- * Skip the next N bytes in the XDR stream.
- */
- int32p = (int32_t *)info;
- len = RNDUP((int)(*int32p));
- if ((xdrs->x_handy -= len) < 0)
- return (FALSE);
- xdrs->x_private += len;
- return (TRUE);
-
- }
- return (FALSE);
-}
-
-static struct xdr_ops *
-xdrmem_ops(void)
-{
- static struct xdr_ops ops;
-
- if (ops.x_getint32 == NULL) {
- ops.x_getbytes = xdrmem_getbytes;
- ops.x_putbytes = xdrmem_putbytes;
- ops.x_getpostn = xdrmem_getpos;
- ops.x_setpostn = xdrmem_setpos;
- ops.x_inline = xdrmem_inline;
- ops.x_destroy = xdrmem_destroy;
- ops.x_control = xdrmem_control;
- ops.x_getint32 = xdrmem_getint32;
- ops.x_putint32 = xdrmem_putint32;
- }
- return (&ops);
-}
diff --git a/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h b/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h
deleted file mode 100644
index b2a3c16..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/asm_linkage.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _IA32_SYS_ASM_LINKAGE_H
-#define _IA32_SYS_ASM_LINKAGE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _ASM /* The remainder of this file is only for assembly files */
-
-/*
- * make annoying differences in assembler syntax go away
- */
-
-#if defined(__i386__) || defined(__amd64__)
-
-#define ASM_ENTRY_ALIGN 16
-
-/*
- * ENTRY provides the standard procedure entry code and an easy way to
- * insert the calls to mcount for profiling. ENTRY_NP is identical, but
- * never calls mcount.
- */
-#define ENTRY(x) \
- .text; \
- .align ASM_ENTRY_ALIGN; \
- .globl x; \
- .type x, @function; \
-x:
-
-/*
- * ALTENTRY provides for additional entry points.
- */
-#define ALTENTRY(x) \
- .globl x; \
- .type x, @function; \
-x:
-
-/*
- * SET_SIZE trails a function and set the size for the ELF symbol table.
- */
-#define SET_SIZE(x) \
- .size x, [.-x]
-
-#elif defined(__sparc64__)
-
-/*
- * ENTRY provides the standard procedure entry code and an easy way to
- * insert the calls to mcount for profiling. ENTRY_NP is identical, but
- * never calls mcount.
- */
-#define ENTRY(x) \
- .section ".text"; \
- .align 4; \
- .global x; \
- .type x, @function; \
-x:
-
-/*
- * ALTENTRY provides for additional entry points.
- */
-#define ALTENTRY(x) \
- .global x; \
- .type x, @function; \
-x:
-
-/*
- * SET_SIZE trails a function and set the size for the ELF symbol table.
- */
-#define SET_SIZE(x) \
- .size x, (.-x)
-
-#else
-
-#error Unsupported architecture.
-
-#endif
-
-#endif /* _ASM */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _IA32_SYS_ASM_LINKAGE_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/avl.h b/sys/contrib/opensolaris/uts/common/sys/avl.h
deleted file mode 100644
index bf9af89..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/avl.h
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _AVL_H
-#define _AVL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This is a private header file. Applications should not directly include
- * this file.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/avl_impl.h>
-
-/*
- * This is a generic implementation of AVL trees for use in the Solaris kernel.
- * The interfaces provide an efficient way of implementing an ordered set of
- * data structures.
- *
- * AVL trees provide an alternative to using an ordered linked list. Using AVL
- * trees will usually be faster, however they require more storage. An ordered
- * linked list in general requires 2 pointers in each data structure. The
- * AVL tree implementation uses 3 pointers. The following chart gives the
- * approximate performance of operations with the different approaches:
- *
- * Operation Link List AVL tree
- * --------- -------- --------
- * lookup O(n) O(log(n))
- *
- * insert 1 node constant constant
- *
- * delete 1 node constant between constant and O(log(n))
- *
- * delete all nodes O(n) O(n)
- *
- * visit the next
- * or prev node constant between constant and O(log(n))
- *
- *
- * The data structure nodes are anchored at an "avl_tree_t" (the equivalent
- * of a list header) and the individual nodes will have a field of
- * type "avl_node_t" (corresponding to list pointers).
- *
- * The type "avl_index_t" is used to indicate a position in the list for
- * certain calls.
- *
- * The usage scenario is generally:
- *
- * 1. Create the list/tree with: avl_create()
- *
- * followed by any mixture of:
- *
- * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
- *
- * 2b. Visit elements with:
- * avl_first() - returns the lowest valued node
- * avl_last() - returns the highest valued node
- * AVL_NEXT() - given a node go to next higher one
- * AVL_PREV() - given a node go to previous lower one
- *
- * 2c. Find the node with the closest value either less than or greater
- * than a given value with avl_nearest().
- *
- * 2d. Remove individual nodes from the list/tree with avl_remove().
- *
- * and finally when the list is being destroyed
- *
- * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes.
- * Note that once you use avl_destroy_nodes(), you can no longer
- * use any routine except avl_destroy_nodes() and avl_destroy().
- *
- * 4. Use avl_destroy() to destroy the AVL tree itself.
- *
- * Any locking for multiple thread access is up to the user to provide, just
- * as is needed for any linked list implementation.
- */
-
-
-/*
- * Type used for the root of the AVL tree.
- */
-typedef struct avl_tree avl_tree_t;
-
-/*
- * The data nodes in the AVL tree must have a field of this type.
- */
-typedef struct avl_node avl_node_t;
-
-/*
- * An opaque type used to locate a position in the tree where a node
- * would be inserted.
- */
-typedef uintptr_t avl_index_t;
-
-
-/*
- * Direction constants used for avl_nearest().
- */
-#define AVL_BEFORE (0)
-#define AVL_AFTER (1)
-
-
-
-/*
- * Prototypes
- *
- * Where not otherwise mentioned, "void *" arguments are a pointer to the
- * user data structure which must contain a field of type avl_node_t.
- *
- * Also assume the user data structure looks like:
- * struct my_type {
- * ...
- * avl_node_t my_link;
- * ...
- * };
- */
-
-/*
- * Initialize an AVL tree. Arguments are:
- *
- * tree - the tree to be initialized
- * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
- * -1 for <, 0 for ==, and +1 for >
- * size - the value of sizeof(struct my_type)
- * offset - the value of OFFSETOF(struct my_type, my_link)
- */
-extern void avl_create(avl_tree_t *tree,
- int (*compar) (const void *, const void *), size_t size, size_t offset);
-
-
-/*
- * Find a node with a matching value in the tree. Returns the matching node
- * found. If not found, it returns NULL and then if "where" is not NULL it sets
- * "where" for use with avl_insert() or avl_nearest().
- *
- * node - node that has the value being looked for
- * where - position for use with avl_nearest() or avl_insert(), may be NULL
- */
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
-
-/*
- * Insert a node into the tree.
- *
- * node - the node to insert
- * where - position as returned from avl_find()
- */
-extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where);
-
-/*
- * Insert "new_data" in "tree" in the given "direction" either after
- * or before the data "here".
- *
- * This might be useful for avl clients caching recently accessed
- * data to avoid doing avl_find() again for insertion.
- *
- * new_data - new data to insert
- * here - existing node in "tree"
- * direction - either AVL_AFTER or AVL_BEFORE the data "here".
- */
-extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here,
- int direction);
-
-
-/*
- * Return the first or last valued node in the tree. Will return NULL
- * if the tree is empty.
- *
- */
-extern void *avl_first(avl_tree_t *tree);
-extern void *avl_last(avl_tree_t *tree);
-
-
-/*
- * Return the next or previous valued node in the tree.
- * AVL_NEXT() will return NULL if at the last node.
- * AVL_PREV() will return NULL if at the first node.
- *
- * node - the node from which the next or previous node is found
- */
-#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER)
-#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE)
-
-
-/*
- * Find the node with the nearest value either greater or less than
- * the value from a previous avl_find(). Returns the node or NULL if
- * there isn't a matching one.
- *
- * where - position as returned from avl_find()
- * direction - either AVL_BEFORE or AVL_AFTER
- *
- * EXAMPLE get the greatest node that is less than a given value:
- *
- * avl_tree_t *tree;
- * struct my_data look_for_value = {....};
- * struct my_data *node;
- * struct my_data *less;
- * avl_index_t where;
- *
- * node = avl_find(tree, &look_for_value, &where);
- * if (node != NULL)
- * less = AVL_PREV(tree, node);
- * else
- * less = avl_nearest(tree, where, AVL_BEFORE);
- */
-extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
-
-
-/*
- * Add a single node to the tree.
- * The node must not be in the tree, and it must not
- * compare equal to any other node already in the tree.
- *
- * node - the node to add
- */
-extern void avl_add(avl_tree_t *tree, void *node);
-
-
-/*
- * Remove a single node from the tree. The node must be in the tree.
- *
- * node - the node to remove
- */
-extern void avl_remove(avl_tree_t *tree, void *node);
-
-
-/*
- * Return the number of nodes in the tree
- */
-extern ulong_t avl_numnodes(avl_tree_t *tree);
-
-
-/*
- * Used to destroy any remaining nodes in a tree. The cookie argument should
- * be initialized to NULL before the first call. Returns a node that has been
- * removed from the tree and may be free()'d. Returns NULL when the tree is
- * empty.
- *
- * Once you call avl_destroy_nodes(), you can only continue calling it and
- * finally avl_destroy(). No other AVL routines will be valid.
- *
- * cookie - a "void *" used to save state between calls to avl_destroy_nodes()
- *
- * EXAMPLE:
- * avl_tree_t *tree;
- * struct my_data *node;
- * void *cookie;
- *
- * cookie = NULL;
- * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
- * free(node);
- * avl_destroy(tree);
- */
-extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie);
-
-
-/*
- * Final destroy of an AVL tree. Arguments are:
- *
- * tree - the empty tree to destroy
- */
-extern void avl_destroy(avl_tree_t *tree);
-
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _AVL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/avl_impl.h b/sys/contrib/opensolaris/uts/common/sys/avl_impl.h
deleted file mode 100644
index 620685f..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/avl_impl.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _AVL_IMPL_H
-#define _AVL_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This is a private header file. Applications should not directly include
- * this file.
- */
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/*
- * generic AVL tree implementation for kernel use
- *
- * There are 5 pieces of information stored for each node in an AVL tree
- *
- * pointer to less than child
- * pointer to greater than child
- * a pointer to the parent of this node
- * an indication [0/1] of which child I am of my parent
- * a "balance" (-1, 0, +1) indicating which child tree is taller
- *
- * Since they only need 3 bits, the last two fields are packed into the
- * bottom bits of the parent pointer on 64 bit machines to save on space.
- */
-
-#ifndef _LP64
-
-struct avl_node {
- struct avl_node *avl_child[2]; /* left/right children */
- struct avl_node *avl_parent; /* this node's parent */
- unsigned short avl_child_index; /* my index in parent's avl_child[] */
- short avl_balance; /* balance value: -1, 0, +1 */
-};
-
-#define AVL_XPARENT(n) ((n)->avl_parent)
-#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p))
-
-#define AVL_XCHILD(n) ((n)->avl_child_index)
-#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c))
-
-#define AVL_XBALANCE(n) ((n)->avl_balance)
-#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b))
-
-#else /* _LP64 */
-
-/*
- * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index
- * values packed in the following manner:
- *
- * |63 3| 2 |1 0 |
- * |-------------------------------------|-----------------|-------------|
- * | avl_parent hi order bits | avl_child_index | avl_balance |
- * | | | + 1 |
- * |-------------------------------------|-----------------|-------------|
- *
- */
-struct avl_node {
- struct avl_node *avl_child[2]; /* left/right children nodes */
- uintptr_t avl_pcb; /* parent, child_index, balance */
-};
-
-/*
- * macros to extract/set fields in avl_pcb
- *
- * pointer to the parent of the current node is the high order bits
- */
-#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7))
-#define AVL_SETPARENT(n, p) \
- ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p)))
-
-/*
- * index of this node in its parent's avl_child[]: bit #2
- */
-#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1)
-#define AVL_SETCHILD(n, c) \
- ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2)))
-
-/*
- * balance indication for a node, lowest 2 bits. A valid balance is
- * -1, 0, or +1, and is encoded by adding 1 to the value to get the
- * unsigned values of 0, 1, 2.
- */
-#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1))
-#define AVL_SETBALANCE(n, b) \
- ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1))))
-
-#endif /* _LP64 */
-
-
-
-/*
- * switch between a node and data pointer for a given tree
- * the value of "o" is tree->avl_offset
- */
-#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o)))
-#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o)))
-
-
-
-/*
- * macros used to create/access an avl_index_t
- */
-#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1))
-#define AVL_INDEX2CHILD(x) ((x) & 1)
-#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c))
-
-
-/*
- * The tree structure. The fields avl_root, avl_compar, and avl_offset come
- * first since they are needed for avl_find(). We want them to fit into
- * a single 64 byte cache line to make avl_find() as fast as possible.
- */
-struct avl_tree {
- struct avl_node *avl_root; /* root node in tree */
- int (*avl_compar)(const void *, const void *); /* node comparison callback */
- size_t avl_offset; /* offsetof(type, avl_node_t field) */
- ulong_t avl_numnodes; /* number of nodes in the tree */
- size_t avl_size; /* sizeof user type struct */
-};
-
-
-/*
- * This will only be used via AVL_NEXT() or AVL_PREV()
- */
-extern void *avl_walk(struct avl_tree *, void *, int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _AVL_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/bitmap.h b/sys/contrib/opensolaris/uts/common/sys/bitmap.h
deleted file mode 100644
index d0dd12b..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/bitmap.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-
-#ifndef _SYS_BITMAP_H
-#define _SYS_BITMAP_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/feature_tests.h>
-#if defined(__GNUC__) && defined(_ASM_INLINES) && \
- (defined(__i386) || defined(__amd64))
-#include <asm/bitmap.h>
-#endif
-
-/*
- * Operations on bitmaps of arbitrary size
- * A bitmap is a vector of 1 or more ulong_t's.
- * The user of the package is responsible for range checks and keeping
- * track of sizes.
- */
-
-#ifdef _LP64
-#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */
-#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL32, to extract word index */
-#else
-#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */
-#endif
-
-#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */
-#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */
-
-#ifdef _LP64
-#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per 32-bit word */
-#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */
-#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */
-#else
-#define BT_ULMAXMASK 0xffffffff
-#endif
-
-/*
- * bitmap is a ulong_t *, bitindex an index_t
- *
- * The macros BT_WIM and BT_BIW are internal; there is no need
- * for users of this package to use them.
- */
-
-/*
- * word in map
- */
-#define BT_WIM(bitmap, bitindex) \
- ((bitmap)[(bitindex) >> BT_ULSHIFT])
-/*
- * bit in word
- */
-#define BT_BIW(bitindex) \
- (1UL << ((bitindex) & BT_ULMASK))
-
-#ifdef _LP64
-#define BT_WIM32(bitmap, bitindex) \
- ((bitmap)[(bitindex) >> BT_ULSHIFT32])
-
-#define BT_BIW32(bitindex) \
- (1UL << ((bitindex) & BT_ULMASK32))
-#endif
-
-/*
- * These are public macros
- *
- * BT_BITOUL == n bits to n ulong_t's
- */
-#define BT_BITOUL(nbits) \
- (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
-#define BT_SIZEOFMAP(nbits) \
- (BT_BITOUL(nbits) * sizeof (ulong_t))
-#define BT_TEST(bitmap, bitindex) \
- ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
-#define BT_SET(bitmap, bitindex) \
- { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
-#define BT_CLEAR(bitmap, bitindex) \
- { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }
-
-#ifdef _LP64
-#define BT_BITOUL32(nbits) \
- (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32)
-#define BT_SIZEOFMAP32(nbits) \
- (BT_BITOUL32(nbits) * sizeof (uint_t))
-#define BT_TEST32(bitmap, bitindex) \
- ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0)
-#define BT_SET32(bitmap, bitindex) \
- { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); }
-#define BT_CLEAR32(bitmap, bitindex) \
- { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); }
-#endif /* _LP64 */
-
-
-/*
- * BIT_ONLYONESET is a private macro not designed for bitmaps of
- * arbitrary size. u must be an unsigned integer/long. It returns
- * true if one and only one bit is set in u.
- */
-#define BIT_ONLYONESET(u) \
- ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0))
-
-#if defined(_KERNEL) && !defined(_ASM)
-#include <sys/atomic.h>
-
-/*
- * return next available bit index from map with specified number of bits
- */
-extern index_t bt_availbit(ulong_t *bitmap, size_t nbits);
-/*
- * find the highest order bit that is on, and is within or below
- * the word specified by wx
- */
-extern int bt_gethighbit(ulong_t *mapp, int wx);
-extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2,
- size_t end_pos);
-/*
- * Find highest and lowest one bit set.
- * Returns bit number + 1 of bit that is set, otherwise returns 0.
- * Low order bit is 0, high order bit is 31.
- */
-extern int highbit(ulong_t);
-extern int lowbit(ulong_t);
-extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
-extern void bt_copy(ulong_t *, ulong_t *, ulong_t);
-
-/*
- * find the parity
- */
-extern int odd_parity(ulong_t);
-
-/*
- * Atomically set/clear bits
- * Atomic exclusive operations will set "result" to "-1"
- * if the bit is already set/cleared. "result" will be set
- * to 0 otherwise.
- */
-#define BT_ATOMIC_SET(bitmap, bitindex) \
- { atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
-#define BT_ATOMIC_CLEAR(bitmap, bitindex) \
- { atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
-
-#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
- { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \
- (bitindex) % BT_NBIPUL); }
-#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \
- { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \
- (bitindex) % BT_NBIPUL); }
-
-/*
- * Extracts bits between index h (high, inclusive) and l (low, inclusive) from
- * u, which must be an unsigned integer.
- */
-#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU))
-
-#endif /* _KERNEL && !_ASM */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BITMAP_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/byteorder.h b/sys/contrib/opensolaris/uts/common/sys/byteorder.h
deleted file mode 100644
index 00afdd5..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/byteorder.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * University Copyright- Copyright (c) 1982, 1986, 1988
- * The Regents of the University of California
- * All Rights Reserved
- *
- * University Acknowledgment- Portions of this document are derived from
- * software developed by the University of California, Berkeley, and its
- * contributors.
- */
-
-#ifndef _SYS_BYTEORDER_H
-#define _SYS_BYTEORDER_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/isa_defs.h>
-#include <sys/int_types.h>
-
-#if defined(__GNUC__) && defined(_ASM_INLINES) && \
- (defined(__i386) || defined(__amd64))
-#include <asm/byteorder.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * macros for conversion between host and (internet) network byte order
- */
-
-#if BYTE_ORDER == _BIG_ENDIAN && !defined(ntohl) && !defined(__lint)
-/* big-endian */
-#define ntohl(x) (x)
-#define ntohs(x) (x)
-#define htonl(x) (x)
-#define htons(x) (x)
-
-#elif !defined(ntohl) /* little-endian */
-
-#ifndef _IN_PORT_T
-#define _IN_PORT_T
-typedef uint16_t in_port_t;
-#endif
-
-#ifndef _IN_ADDR_T
-#define _IN_ADDR_T
-typedef uint32_t in_addr_t;
-#endif
-
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5)
-extern uint32_t htonl(uint32_t);
-extern uint16_t htons(uint16_t);
-extern uint32_t ntohl(uint32_t);
-extern uint16_t ntohs(uint16_t);
-#else
-extern in_addr_t htonl(in_addr_t);
-extern in_port_t htons(in_port_t);
-extern in_addr_t ntohl(in_addr_t);
-extern in_port_t ntohs(in_port_t);
-#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */
-#endif
-
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-
-/*
- * Macros to reverse byte order
- */
-#define BSWAP_8(x) ((x) & 0xff)
-#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
-#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
-#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
-
-#define BMASK_8(x) ((x) & 0xff)
-#define BMASK_16(x) ((x) & 0xffff)
-#define BMASK_32(x) ((x) & 0xffffffff)
-#define BMASK_64(x) (x)
-
-/*
- * Macros to convert from a specific byte order to/from native byte order
- */
-#if BYTE_ORDER == _BIG_ENDIAN
-#define BE_8(x) BMASK_8(x)
-#define BE_16(x) BMASK_16(x)
-#define BE_32(x) BMASK_32(x)
-#define BE_64(x) BMASK_64(x)
-#define LE_8(x) BSWAP_8(x)
-#define LE_16(x) BSWAP_16(x)
-#define LE_32(x) BSWAP_32(x)
-#define LE_64(x) BSWAP_64(x)
-#else
-#define LE_8(x) BMASK_8(x)
-#define LE_16(x) BMASK_16(x)
-#define LE_32(x) BMASK_32(x)
-#define LE_64(x) BMASK_64(x)
-#define BE_8(x) BSWAP_8(x)
-#define BE_16(x) BSWAP_16(x)
-#define BE_32(x) BSWAP_32(x)
-#define BE_64(x) BSWAP_64(x)
-#endif
-
-#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_BYTEORDER_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/callb.h b/sys/contrib/opensolaris/uts/common/sys/callb.h
deleted file mode 100644
index b12b2e2..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/callb.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_CALLB_H
-#define _SYS_CALLB_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/t_lock.h>
-#include <sys/thread.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * definitions of callback classes (c_class)
- *
- * Callbacks belong in the same class if (1) their callback routines
- * do the same kind of processing (ideally, using the same callback function)
- * and (2) they can/should be executed at the same time in a cpr
- * suspend/resume operation.
- *
- * Note: The DAEMON class, in particular, is for stopping kernel threads
- * and nothing else. The CALLB_* macros below should be used to deal
- * with kernel threads, and the callback function should be callb_generic_cpr.
- * Another idiosyncrasy of the DAEMON class is that if a suspend operation
- * fails, some of the callback functions may be called with the RESUME
- * code which were never called with SUSPEND. Not a problem currently,
- * but see bug 4201851.
- */
-#define CB_CL_CPR_DAEMON 0
-#define CB_CL_CPR_VM 1
-#define CB_CL_CPR_CALLOUT 2
-#define CB_CL_CPR_OBP 3
-#define CB_CL_CPR_FB 4
-#define CB_CL_PANIC 5
-#define CB_CL_CPR_RPC 6
-#define CB_CL_CPR_PROMPRINTF 7
-#define CB_CL_UADMIN 8
-#define CB_CL_CPR_PM 9
-#define CB_CL_HALT 10
-#define CB_CL_CPR_DMA 11
-#define CB_CL_CPR_POST_USER 12
-#define CB_CL_UADMIN_PRE_VFS 13
-#define CB_CL_MDBOOT CB_CL_UADMIN
-#define CB_CL_ENTER_DEBUGGER 14
-#define CB_CL_CPR_POST_KERNEL 15
-#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */
-
-/*
- * CB_CL_CPR_DAEMON class specific definitions are given below:
- */
-
-/*
- * code for CPR callb_execute_class
- */
-#define CB_CODE_CPR_CHKPT 0
-#define CB_CODE_CPR_RESUME 1
-
-typedef void * callb_id_t;
-/*
- * Per kernel thread structure for CPR daemon callbacks.
- * Must be protected by either an existing lock in the daemon or
- * a new lock created for such a purpose.
- */
-typedef struct callb_cpr {
- kmutex_t *cc_lockp; /* lock to protect this struct */
- char cc_events; /* various events for CPR */
- callb_id_t cc_id; /* callb id address */
- kcondvar_t cc_callb_cv; /* cv for callback waiting */
- kcondvar_t cc_stop_cv; /* cv to checkpoint block */
-} callb_cpr_t;
-
-/*
- * cc_events definitions
- */
-#define CALLB_CPR_START 1 /* a checkpoint request's started */
-#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */
-#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */
-
-/*
- * Used when checking that all kernel threads are stopped.
- */
-#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */
-#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */
-#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */
- /* due to pwr mgmt of disks, make -- */
- /* big enough for worst spinup time */
-
-#ifdef _KERNEL
-/*
- *
- * CALLB_CPR_INIT macro is used by kernel threads to add their entry to
- * the callback table and perform other initialization. It automatically
- * adds the thread as being in the callback class CB_CL_CPR_DAEMON.
- *
- * cp - ptr to the callb_cpr_t structure for this kernel thread
- *
- * lockp - pointer to mutex protecting the callb_cpr_t struct
- *
- * func - pointer to the callback function for this kernel thread.
- * It has the prototype boolean_t <func>(void *arg, int code)
- * where: arg - ptr to the callb_cpr_t structure
- * code - not used for this type of callback
- * returns: B_TRUE if successful; B_FALSE if unsuccessful.
- *
- * name - a string giving the name of the kernel thread
- *
- * Note: lockp is the lock to protect the callb_cpr_t (cp) structure
- * later on. No lock needs to be held for this initialization.
- */
-#define CALLB_CPR_INIT(cp, lockp, func, name) { \
- bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \
- (cp)->cc_lockp = lockp; \
- (cp)->cc_id = callb_add(func, (void *)(cp), \
- CB_CL_CPR_DAEMON, name); \
- }
-
-#ifndef __lock_lint
-#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp));
-#else
-#define CALLB_CPR_ASSERT(cp)
-#endif
-/*
- * Some threads (like the idle threads) do not adhere to the callback
- * protocol and are always considered safe. Such threads must never exit.
- * They register their presence by calling this macro during their
- * initialization.
- *
- * Args:
- * t - thread pointer of the client kernel thread
- * name - a string giving the name of the kernel thread
- */
-#define CALLB_CPR_INIT_SAFE(t, name) { \
- (void) callb_add_thread(callb_generic_cpr_safe, \
- (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \
- name, t); \
- }
-/*
- * The lock to protect cp's content must be held before
- * calling the following two macros.
- *
- * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END
- * is safe for checkpoint/resume.
- */
-#define CALLB_CPR_SAFE_BEGIN(cp) { \
- CALLB_CPR_ASSERT(cp) \
- (cp)->cc_events |= CALLB_CPR_SAFE; \
- if ((cp)->cc_events & CALLB_CPR_START) \
- cv_signal(&(cp)->cc_callb_cv); \
- }
-#define CALLB_CPR_SAFE_END(cp, lockp) { \
- CALLB_CPR_ASSERT(cp) \
- while ((cp)->cc_events & CALLB_CPR_START) \
- cv_wait(&(cp)->cc_stop_cv, lockp); \
- (cp)->cc_events &= ~CALLB_CPR_SAFE; \
- }
-/*
- * cv_destroy is nop right now but may be needed in the future.
- */
-#define CALLB_CPR_EXIT(cp) { \
- CALLB_CPR_ASSERT(cp) \
- (cp)->cc_events |= CALLB_CPR_SAFE; \
- if ((cp)->cc_events & CALLB_CPR_START) \
- cv_signal(&(cp)->cc_callb_cv); \
- mutex_exit((cp)->cc_lockp); \
- (void) callb_delete((cp)->cc_id); \
- cv_destroy(&(cp)->cc_callb_cv); \
- cv_destroy(&(cp)->cc_stop_cv); \
- }
-
-extern callb_cpr_t callb_cprinfo_safe;
-extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *);
-extern callb_id_t callb_add_thread(boolean_t (*)(void *, int),
- void *, int, char *, kthread_id_t);
-extern int callb_delete(callb_id_t);
-extern void callb_execute(callb_id_t, int);
-extern void *callb_execute_class(int, int);
-extern boolean_t callb_generic_cpr(void *, int);
-extern boolean_t callb_generic_cpr_safe(void *, int);
-extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *);
-extern void callb_lock_table(void);
-extern void callb_unlock_table(void);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_CALLB_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/ccompile.h b/sys/contrib/opensolaris/uts/common/sys/ccompile.h
deleted file mode 100644
index c9857b08..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/ccompile.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_CCOMPILE_H
-#define _SYS_CCOMPILE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This file contains definitions designed to enable different compilers
- * to be used harmoniously on Solaris systems.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Allow for version tests for compiler bugs and features.
- */
-#if defined(__GNUC__)
-#define __GNUC_VERSION \
- (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#else
-#define __GNUC_VERSION 0
-#endif
-
-#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__)
-
-/*
- * analogous to lint's PRINTFLIKEn
- */
-#define __sun_attr___PRINTFLIKE__(__n) \
- __attribute__((__format__(printf, __n, (__n)+1)))
-#define __sun_attr___VPRINTFLIKE__(__n) \
- __attribute__((__format__(printf, __n, 0)))
-
-/*
- * Handle the kernel printf routines that can take '%b' too
- */
-#if __GNUC_VERSION < 30402
-/*
- * XX64 at least this doesn't work correctly yet with 3.4.1 anyway!
- */
-#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__
-#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__
-#else
-#define __sun_attr___KPRINTFLIKE__(__n) \
- __attribute__((__format__(cmn_err, __n, (__n)+1)))
-#define __sun_attr___KVPRINTFLIKE__(__n) \
- __attribute__((__format__(cmn_err, __n, 0)))
-#endif
-
-/*
- * This one's pretty obvious -- the function never returns
- */
-#define __sun_attr___noreturn__ __attribute__((__noreturn__))
-
-
-/*
- * This is an appropriate label for functions that do not
- * modify their arguments, e.g. strlen()
- */
-#define __sun_attr___pure__ __attribute__((__pure__))
-
-/*
- * This is a stronger form of __pure__. Can be used for functions
- * that do not modify their arguments and don't depend on global
- * memory.
- */
-#define __sun_attr___const__ __attribute__((__const__))
-
-/*
- * structure packing like #pragma pack(1)
- */
-#define __sun_attr___packed__ __attribute__((__packed__))
-
-#define ___sun_attr_inner(__a) __sun_attr_##__a
-#define __sun_attr__(__a) ___sun_attr_inner __a
-
-#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
-
-#define __sun_attr__(__a)
-
-#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */
-
-/*
- * Shorthand versions for readability
- */
-
-#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n)))
-#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n)))
-#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n)))
-#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n)))
-#define __NORETURN __sun_attr__((__noreturn__))
-#define __CONST __sun_attr__((__const__))
-#define __PURE __sun_attr__((__pure__))
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_CCOMPILE_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/compress.h b/sys/contrib/opensolaris/uts/common/sys/compress.h
deleted file mode 100644
index 3d79d95..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/compress.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 1998 by Sun Microsystems, Inc.
- * All rights reserved.
- */
-
-#ifndef _SYS_COMPRESS_H
-#define _SYS_COMPRESS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern size_t compress(void *, void *, size_t);
-extern size_t decompress(void *, void *, size_t, size_t);
-extern uint32_t checksum32(void *, size_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_COMPRESS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/cred.h b/sys/contrib/opensolaris/uts/common/sys/cred.h
deleted file mode 100644
index c1400b8..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/cred.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-/*
- * Portions of this source code were derived from Berkeley 4.3 BSD
- * under license from the Regents of the University of California.
- */
-
-#ifndef _SYS_CRED_H
-#define _SYS_CRED_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The credential is an opaque kernel private data structure defined in
- * <sys/cred_impl.h>.
- */
-
-typedef struct cred cred_t;
-
-#ifdef _KERNEL
-
-#define CRED() curthread->t_cred
-
-struct proc; /* cred.h is included in proc.h */
-struct prcred;
-
-struct auditinfo_addr; /* cred.h is included in audit.h */
-
-extern int ngroups_max;
-/*
- * kcred is used when you need all privileges.
- */
-extern struct cred *kcred;
-
-extern void cred_init(void);
-extern void crhold(cred_t *);
-extern void crfree(cred_t *);
-extern cred_t *cralloc(void); /* all but ref uninitialized */
-extern cred_t *crget(void); /* initialized */
-extern cred_t *crcopy(cred_t *);
-extern void crcopy_to(cred_t *, cred_t *);
-extern cred_t *crdup(cred_t *);
-extern void crdup_to(cred_t *, cred_t *);
-extern cred_t *crgetcred(void);
-extern void crset(struct proc *, cred_t *);
-extern int groupmember(gid_t, const cred_t *);
-extern int supgroupmember(gid_t, const cred_t *);
-extern int hasprocperm(const cred_t *, const cred_t *);
-extern int prochasprocperm(struct proc *, struct proc *, const cred_t *);
-extern int crcmp(const cred_t *, const cred_t *);
-extern cred_t *zone_kcred(void);
-
-extern uid_t crgetuid(const cred_t *);
-extern uid_t crgetruid(const cred_t *);
-extern uid_t crgetsuid(const cred_t *);
-extern gid_t crgetgid(const cred_t *);
-extern gid_t crgetrgid(const cred_t *);
-extern gid_t crgetsgid(const cred_t *);
-extern zoneid_t crgetzoneid(const cred_t *);
-extern projid_t crgetprojid(const cred_t *);
-
-
-extern const struct auditinfo_addr *crgetauinfo(const cred_t *);
-extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *);
-
-extern uint_t crgetref(const cred_t *);
-
-extern const gid_t *crgetgroups(const cred_t *);
-
-extern int crgetngroups(const cred_t *);
-
-/*
- * Sets real, effective and/or saved uid/gid;
- * -1 argument accepted as "no change".
- */
-extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t);
-extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t);
-
-/*
- * Sets real, effective and saved uids/gids all to the same
- * values. Both values must be non-negative and <= MAXUID
- */
-extern int crsetugid(cred_t *, uid_t, gid_t);
-
-extern int crsetgroups(cred_t *, int, gid_t *);
-
-/*
- * Private interface for setting zone association of credential.
- */
-struct zone;
-extern void crsetzone(cred_t *, struct zone *);
-extern struct zone *crgetzone(const cred_t *);
-
-/*
- * Private interface for setting project id in credential.
- */
-extern void crsetprojid(cred_t *, projid_t);
-
-/*
- * Private interface for nfs.
- */
-extern cred_t *crnetadjust(cred_t *);
-
-/*
- * Private interface for procfs.
- */
-extern void cred2prcred(const cred_t *, struct prcred *);
-
-/*
- * Private interfaces for Rampart Trusted Solaris.
- */
-struct ts_label_s;
-extern struct ts_label_s *crgetlabel(const cred_t *);
-extern boolean_t crisremote(const cred_t *);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_CRED_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/debug.h b/sys/contrib/opensolaris/uts/common/sys/debug.h
deleted file mode 100644
index c87c884..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/debug.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-
-#ifndef _SYS_DEBUG_H
-#define _SYS_DEBUG_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * ASSERT(ex) causes a panic or debugger entry if expression ex is not
- * true. ASSERT() is included only for debugging, and is a no-op in
- * production kernels. VERIFY(ex), on the other hand, behaves like
- * ASSERT and is evaluated on both debug and non-debug kernels.
- */
-
-#if defined(__STDC__)
-extern int assfail(const char *, const char *, int);
-#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
-#if DEBUG
-#define ASSERT(EX) VERIFY(EX)
-#else
-#define ASSERT(x) ((void)0)
-#endif
-#else /* defined(__STDC__) */
-extern int assfail();
-#define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#if DEBUG
-#define ASSERT(EX) VERIFY(EX)
-#else
-#define ASSERT(x) ((void)0)
-#endif
-#endif /* defined(__STDC__) */
-
-/*
- * Assertion variants sensitive to the compilation data model
- */
-#if defined(_LP64)
-#define ASSERT64(x) ASSERT(x)
-#define ASSERT32(x)
-#else
-#define ASSERT64(x)
-#define ASSERT32(x) ASSERT(x)
-#endif
-
-/*
- * ASSERT3() behaves like ASSERT() except that it is an explicit conditional,
- * and prints out the values of the left and right hand expressions as part of
- * the panic message to ease debugging. The three variants imply the type
- * of their arguments. ASSERT3S() is for signed data types, ASSERT3U() is
- * for unsigned, and ASSERT3P() is for pointers. The VERIFY3*() macros
- * have the same relationship as above.
- */
-extern void assfail3(const char *, uintmax_t, const char *, uintmax_t,
- const char *, int);
-#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
- const TYPE __left = (TYPE)(LEFT); \
- const TYPE __right = (TYPE)(RIGHT); \
- if (!(__left OP __right)) \
- assfail3(#LEFT " " #OP " " #RIGHT, \
- (uintmax_t)__left, #OP, (uintmax_t)__right, \
- __FILE__, __LINE__); \
-_NOTE(CONSTCOND) } while (0)
-
-#define VERIFY3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t)
-#define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t)
-#define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t)
-#if DEBUG
-#define ASSERT3S(x, y, z) VERIFY3S(x, y, z)
-#define ASSERT3U(x, y, z) VERIFY3U(x, y, z)
-#define ASSERT3P(x, y, z) VERIFY3P(x, y, z)
-#else
-#define ASSERT3S(x, y, z) ((void)0)
-#define ASSERT3U(x, y, z) ((void)0)
-#define ASSERT3P(x, y, z) ((void)0)
-#endif
-
-#ifdef _KERNEL
-
-extern void abort_sequence_enter(char *);
-extern void debug_enter(char *);
-
-#endif /* _KERNEL */
-
-#if defined(DEBUG) && !defined(__sun)
-/* CSTYLED */
-#define STATIC
-#else
-/* CSTYLED */
-#define STATIC static
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DEBUG_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/dkio.h b/sys/contrib/opensolaris/uts/common/sys/dkio.h
deleted file mode 100644
index b0ddd07..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/dkio.h
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DKIO_H
-#define _SYS_DKIO_H
-
-#pragma ident "%Z%%M% %I% %E% SMI" /* SunOS-4.0 5.19 */
-
-#include <sys/dklabel.h> /* Needed for NDKMAP define */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Structures and definitions for disk io control commands
- */
-
-/*
- * Structures used as data by ioctl calls.
- */
-
-#define DK_DEVLEN 16 /* device name max length, including */
- /* unit # & NULL (ie - "xyc1") */
-
-/*
- * Used for controller info
- */
-struct dk_cinfo {
- char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */
- ushort_t dki_ctype; /* controller type */
- ushort_t dki_flags; /* flags */
- ushort_t dki_cnum; /* controller number */
- uint_t dki_addr; /* controller address */
- uint_t dki_space; /* controller bus type */
- uint_t dki_prio; /* interrupt priority */
- uint_t dki_vec; /* interrupt vector */
- char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */
- uint_t dki_unit; /* unit number */
- uint_t dki_slave; /* slave number */
- ushort_t dki_partition; /* partition number */
- ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */
-};
-
-/*
- * Controller types
- */
-#define DKC_UNKNOWN 0
-#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */
-#define DKC_WDC2880 2
-#define DKC_XXX_0 3 /* unassigned */
-#define DKC_XXX_1 4 /* unassigned */
-#define DKC_DSD5215 5
-#define DKC_ACB4000 7
-#define DKC_MD21 8
-#define DKC_XXX_2 9 /* unassigned */
-#define DKC_NCRFLOPPY 10
-#define DKC_SMSFLOPPY 12
-#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */
-#define DKC_INTEL82072 14 /* native floppy chip */
-#define DKC_MD 16 /* meta-disk (virtual-disk) driver */
-#define DKC_INTEL82077 19 /* 82077 floppy disk controller */
-#define DKC_DIRECT 20 /* Intel direct attached device i.e. IDE */
-#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */
-#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */
-
-/*
- * Sun reserves up through 1023
- */
-
-#define DKC_CUSTOMER_BASE 1024
-
-/*
- * Flags
- */
-#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */
-#define DKI_MAPTRK 0x02 /* controller does track mapping */
-#define DKI_FMTTRK 0x04 /* formats only full track at a time */
-#define DKI_FMTVOL 0x08 /* formats only full volume at a time */
-#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */
-#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */
-#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */
-
-/*
- * Used for all partitions
- */
-struct dk_allmap {
- struct dk_map dka_map[NDKMAP];
-};
-
-#if defined(_SYSCALL32)
-struct dk_allmap32 {
- struct dk_map32 dka_map[NDKMAP];
-};
-#endif /* _SYSCALL32 */
-
-/*
- * Definition of a disk's geometry
- */
-struct dk_geom {
- unsigned short dkg_ncyl; /* # of data cylinders */
- unsigned short dkg_acyl; /* # of alternate cylinders */
- unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */
- unsigned short dkg_nhead; /* # of heads */
- unsigned short dkg_obs1; /* obsolete */
- unsigned short dkg_nsect; /* # of data sectors per track */
- unsigned short dkg_intrlv; /* interleave factor */
- unsigned short dkg_obs2; /* obsolete */
- unsigned short dkg_obs3; /* obsolete */
- unsigned short dkg_apc; /* alternates per cyl (SCSI only) */
- unsigned short dkg_rpm; /* revolutions per minute */
- unsigned short dkg_pcyl; /* # of physical cylinders */
- unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */
- unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */
- unsigned short dkg_extra[7]; /* for compatible expansion */
-};
-
-/*
- * These defines are for historic compatibility with old drivers.
- */
-#define dkg_bhead dkg_obs1 /* used to be head offset */
-#define dkg_gap1 dkg_obs2 /* used to be gap1 */
-#define dkg_gap2 dkg_obs3 /* used to be gap2 */
-
-/*
- * Disk io control commands
- * Warning: some other ioctls with the DIOC prefix exist elsewhere.
- * The Generic DKIOC numbers are from 0 - 50.
- * The Floppy Driver uses 51 - 100.
- * The Hard Disk (except SCSI) 101 - 106. (these are obsolete)
- * The CDROM Driver 151 - 200.
- * The USCSI ioctl 201 - 250.
- */
-#define DKIOC (0x04 << 8)
-
-/*
- * The following ioctls are generic in nature and need to be
- * suported as appropriate by all disk drivers
- */
-#define DKIOCGGEOM (DKIOC|1) /* Get geometry */
-#define DKIOCINFO (DKIOC|3) /* Get info */
-#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */
-#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */
-#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */
-
-/*
- * Disk Cache Controls. These ioctls should be supported by
- * all disk drivers.
- *
- * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl
- * argument, but it should be passed as NULL to allow for future
- * reinterpretation. From user-mode, this ioctl request is synchronous.
- *
- * When invoked from within the kernel, the arg can be NULL to indicate
- * a synchronous request or can be the address of a struct dk_callback
- * to request an asynchronous callback when the flush request is complete.
- * In this case, the flag to the ioctl must include FKIOCTL and the
- * dkc_callback field of the pointed to struct must be non-null or the
- * request is made synchronously.
- *
- * In the callback case: if the ioctl returns 0, a callback WILL be performed.
- * If the ioctl returns non-zero, a callback will NOT be performed.
- * NOTE: In some cases, the callback may be done BEFORE the ioctl call
- * returns. The caller's locking strategy should be prepared for this case.
- */
-#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
-
-struct dk_callback {
- void (*dkc_callback)(void *dkc_cookie, int error);
- void *dkc_cookie;
-};
-
-#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */
- /* enablement status */
-#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */
-
-/*
- * The following ioctls are used by Sun drivers to communicate
- * with their associated format routines. Support of these ioctls
- * is not required of foreign drivers
- */
-#define DKIOCSGEOM (DKIOC|2) /* Set geometry */
-#define DKIOCSAPART (DKIOC|4) /* Set all partitions */
-#define DKIOCGAPART (DKIOC|5) /* Get all partitions */
-#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */
-#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */
-
-/*
- * The following ioctl's are removable media support
- */
-#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */
-#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */
-#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */
-#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */
-
-
-/*
- * ioctl for hotpluggable devices
- */
-#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */
-
-/*
- * Ioctl to force driver to re-read the alternate partition and rebuild
- * the internal defect map.
- */
-#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */
-#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */
-
-/*
- * Used by applications to get disk defect information from IDE
- * drives.
- */
-#ifdef _SYSCALL32
-struct defect_header32 {
- int head;
- caddr32_t buffer;
-};
-#endif /* _SYSCALL32 */
-
-struct defect_header {
- int head;
- caddr_t buffer;
-};
-
-#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */
-
-/*
- * Used by applications to get partition or slice information
- */
-#ifdef _SYSCALL32
-struct part_info32 {
- daddr32_t p_start;
- int p_length;
-};
-#endif /* _SYSCALL32 */
-
-struct part_info {
- daddr_t p_start;
- int p_length;
-};
-
-/* The following ioctls are for Optical Memory Device */
-#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */
-#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */
-
-/*
- * This state enum is the argument passed to the DKIOCSTATE ioctl.
- */
-enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE };
-
-#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */
-
-/*
- * ioctls to read/write mboot info.
- */
-#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */
-#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */
-
-/*
- * ioctl to get the device temperature.
- */
-#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */
-
-/*
- * Used for providing the temperature.
- */
-
-struct dk_temperature {
- uint_t dkt_flags; /* Flags */
- short dkt_cur_temp; /* Current disk temperature */
- short dkt_ref_temp; /* reference disk temperature */
-};
-
-#define DKT_BYPASS_PM 0x1
-#define DKT_INVALID_TEMP 0xFFFF
-
-
-/*
- * Used for Media info or the current profile info
- */
-struct dk_minfo {
- uint_t dki_media_type; /* Media type or profile info */
- uint_t dki_lbsize; /* Logical blocksize of media */
- diskaddr_t dki_capacity; /* Capacity as # of dki_lbsize blks */
-};
-
-/*
- * Media types or profiles known
- */
-#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */
-
-
-/*
- * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to
- * maintain compatibility with SFF8090. The following define the
- * optical media type.
- */
-#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */
-#define DK_MO_ERASABLE 0x03 /* MO Erasable */
-#define DK_MO_WRITEONCE 0x04 /* MO Write once */
-#define DK_AS_MO 0x05 /* AS MO */
-#define DK_CDROM 0x08 /* CDROM */
-#define DK_CDR 0x09 /* CD-R */
-#define DK_CDRW 0x0A /* CD-RW */
-#define DK_DVDROM 0x10 /* DVD-ROM */
-#define DK_DVDR 0x11 /* DVD-R */
-#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */
-
-/*
- * Media types for other rewritable magnetic media
- */
-#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */
-#define DK_FLOPPY 0x10002 /* Floppy media */
-#define DK_ZIP 0x10003 /* IOMEGA ZIP media */
-#define DK_JAZ 0x10004 /* IOMEGA JAZ media */
-
-#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */
-#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */
-
-#define DKIOCPARTITION (DKIOC|9) /* Get partition info */
-
-/*
- * Ioctls to get/set volume capabilities related to Logical Volume Managers.
- * They include the ability to get/set capabilities and to issue a read to a
- * specific underlying device of a replicated device.
- */
-
-#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */
-#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */
-#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */
-
-typedef uint_t volcapinfo_t;
-
-typedef uint_t volcapset_t;
-
-#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */
-#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */
-
-typedef struct volcap {
- volcapinfo_t vc_info; /* Capabilities available */
- volcapset_t vc_set; /* Capabilities set */
-} volcap_t;
-
-#define VOL_SIDENAME 256
-
-typedef struct vol_directed_rd {
- int vdr_flags;
- offset_t vdr_offset;
- size_t vdr_nbytes;
- size_t vdr_bytesread;
- void *vdr_data;
- int vdr_side;
- char vdr_side_name[VOL_SIDENAME];
-} vol_directed_rd_t;
-
-#define DKV_SIDE_INIT (-1)
-#define DKV_DMR_NEXT_SIDE 0x00000001
-#define DKV_DMR_DONE 0x00000002
-#define DKV_DMR_ERROR 0x00000004
-#define DKV_DMR_SUCCESS 0x00000008
-#define DKV_DMR_SHORT 0x00000010
-
-#ifdef _MULTI_DATAMODEL
-#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
-#pragma pack(4)
-#endif
-typedef struct vol_directed_rd32 {
- int32_t vdr_flags;
- offset_t vdr_offset; /* 64-bit element on 32-bit alignment */
- size32_t vdr_nbytes;
- size32_t vdr_bytesread;
- caddr32_t vdr_data;
- int32_t vdr_side;
- char vdr_side_name[VOL_SIDENAME];
-} vol_directed_rd32_t;
-#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
-#pragma pack()
-#endif
-#endif /* _MULTI_DATAMODEL */
-
-/*
- * The ioctl is used to fetch disk's device type, vendor ID,
- * model number/product ID, firmware revision and serial number together.
- *
- * Currently there are two device types - DKD_ATA_TYPE which means the
- * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE
- * which means the disk is driven by sd/scsi hba driver.
- */
-#define DKIOC_GETDISKID (DKIOC|46)
-
-/* These two labels are for dkd_dtype of dk_disk_id_t */
-#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */
-#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */
-
-#define DKD_ATA_MODEL 40 /* model number length */
-#define DKD_ATA_FWVER 8 /* firmware revision length */
-#define DKD_ATA_SERIAL 20 /* serial number length */
-
-#define DKD_SCSI_VENDOR 8 /* vendor ID length */
-#define DKD_SCSI_PRODUCT 16 /* product ID length */
-#define DKD_SCSI_REVLEVEL 4 /* revision level length */
-#define DKD_SCSI_SERIAL 12 /* serial number length */
-
-/*
- * The argument type for DKIOC_GETDISKID ioctl.
- */
-typedef struct dk_disk_id {
- uint_t dkd_dtype;
- union {
- struct {
- char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */
- char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */
- char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */
- } ata_disk_id;
- struct {
- char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */
- char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */
- char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */
- char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */
- } scsi_disk_id;
- } disk_id;
-} dk_disk_id_t;
-
-/*
- * The ioctl is used to update the firmware of device.
- */
-#define DKIOC_UPDATEFW (DKIOC|47)
-
-/* The argument type for DKIOC_UPDATEFW ioctl */
-typedef struct dk_updatefw {
- caddr_t dku_ptrbuf; /* pointer to firmware buf */
- uint_t dku_size; /* firmware buf length */
- uint8_t dku_type; /* firmware update type */
-} dk_updatefw_t;
-
-#ifdef _SYSCALL32
-typedef struct dk_updatefw_32 {
- caddr32_t dku_ptrbuf; /* pointer to firmware buf */
- uint_t dku_size; /* firmware buf length */
- uint8_t dku_type; /* firmware update type */
-} dk_updatefw_32_t;
-#endif /* _SYSCALL32 */
-
-/*
- * firmware update type - temporary or permanent use
- */
-#define FW_TYPE_TEMP 0x0 /* temporary use */
-#define FW_TYPE_PERM 0x1 /* permanent use */
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DKIO_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/dklabel.h b/sys/contrib/opensolaris/uts/common/sys/dklabel.h
deleted file mode 100644
index 92cb47a..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/dklabel.h
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 1990-2002 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_DKLABEL_H
-#define _SYS_DKLABEL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/isa_defs.h>
-#include <sys/types32.h>
-#include <sys/isa_defs.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Miscellaneous defines
- */
-#define DKL_MAGIC 0xDABE /* magic number */
-#define FKL_MAGIC 0xff /* magic number for DOS floppies */
-
-#if defined(_SUNOS_VTOC_16)
-#define NDKMAP 16 /* # of logical partitions */
-#define DK_LABEL_LOC 1 /* location of disk label */
-#elif defined(_SUNOS_VTOC_8)
-#define NDKMAP 8 /* # of logical partitions */
-#define DK_LABEL_LOC 0 /* location of disk label */
-#else
-#error "No VTOC format defined."
-#endif
-
-#define LEN_DKL_ASCII 128 /* length of dkl_asciilabel */
-#define LEN_DKL_VVOL 8 /* length of v_volume */
-#define DK_LABEL_SIZE 512 /* size of disk label */
-#define DK_MAX_BLOCKS 0x7fffffff /* max # of blocks handled */
-
-/*
- * Reserve two cylinders on SCSI disks.
- * One is for the backup disk label and the other is for the deviceid.
- *
- * IPI disks only reserve one cylinder, but they will go away soon.
- * CDROMs do not reserve any cylinders.
- */
-#define DK_ACYL 2
-
-/*
- * Format of a Sun disk label.
- * Resides in cylinder 0, head 0, sector 0.
- *
- * sizeof (struct dk_label) should be 512 (the current sector size),
- * but should the sector size increase, this structure should remain
- * at the beginning of the sector.
- */
-
-/*
- * partition headers: section 1
- * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I))
- */
-struct dk_map {
- daddr_t dkl_cylno; /* starting cylinder */
- daddr_t dkl_nblk; /* number of blocks; if == 0, */
- /* partition is undefined */
-};
-
-/*
- * partition headers: section 1
- * Fixed size for on-disk dk_label
- */
-struct dk_map32 {
- daddr32_t dkl_cylno; /* starting cylinder */
- daddr32_t dkl_nblk; /* number of blocks; if == 0, */
- /* partition is undefined */
-};
-
-/*
- * partition headers: section 2,
- * brought over from AT&T SVr4 vtoc structure.
- */
-struct dk_map2 {
- uint16_t p_tag; /* ID tag of partition */
- uint16_t p_flag; /* permission flag */
-};
-
-struct dkl_partition {
- uint16_t p_tag; /* ID tag of partition */
- uint16_t p_flag; /* permission flags */
- daddr32_t p_start; /* start sector no of partition */
- int32_t p_size; /* # of blocks in partition */
-};
-
-
-/*
- * VTOC inclusions from AT&T SVr4
- * Fixed sized types for on-disk VTOC
- */
-
-struct dk_vtoc {
-#if defined(_SUNOS_VTOC_16)
- uint32_t v_bootinfo[3]; /* info for mboot (unsupported) */
- uint32_t v_sanity; /* to verify vtoc sanity */
- uint32_t v_version; /* layout version */
- char v_volume[LEN_DKL_VVOL]; /* volume name */
- uint16_t v_sectorsz; /* sector size in bytes */
- uint16_t v_nparts; /* number of partitions */
- uint32_t v_reserved[10]; /* free space */
- struct dkl_partition v_part[NDKMAP]; /* partition headers */
- time32_t timestamp[NDKMAP]; /* partition timestamp (unsupported) */
- char v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */
-#elif defined(_SUNOS_VTOC_8)
- uint32_t v_version; /* layout version */
- char v_volume[LEN_DKL_VVOL]; /* volume name */
- uint16_t v_nparts; /* number of partitions */
- struct dk_map2 v_part[NDKMAP]; /* partition hdrs, sec 2 */
- uint32_t v_bootinfo[3]; /* info needed by mboot */
- uint32_t v_sanity; /* to verify vtoc sanity */
- uint32_t v_reserved[10]; /* free space */
- time32_t v_timestamp[NDKMAP]; /* partition timestamp */
-#else
-#error "No VTOC format defined."
-#endif
-};
-
-/*
- * define the amount of disk label padding needed to make
- * the entire structure occupy 512 bytes.
- */
-#if defined(_SUNOS_VTOC_16)
-#define LEN_DKL_PAD (DK_LABEL_SIZE - \
- ((sizeof (struct dk_vtoc) + \
- (4 * sizeof (uint32_t)) + \
- (12 * sizeof (uint16_t)) + \
- (2 * (sizeof (uint16_t))))))
-#elif defined(_SUNOS_VTOC_8)
-#define LEN_DKL_PAD (DK_LABEL_SIZE \
- - ((LEN_DKL_ASCII) + \
- (sizeof (struct dk_vtoc)) + \
- (sizeof (struct dk_map32) * NDKMAP) + \
- (14 * (sizeof (uint16_t))) + \
- (2 * (sizeof (uint16_t)))))
-#else
-#error "No VTOC format defined."
-#endif
-
-
-struct dk_label {
-#if defined(_SUNOS_VTOC_16)
- struct dk_vtoc dkl_vtoc; /* vtoc inclusions from AT&T SVr4 */
- uint32_t dkl_pcyl; /* # of physical cylinders */
- uint32_t dkl_ncyl; /* # of data cylinders */
- uint16_t dkl_acyl; /* # of alternate cylinders */
- uint16_t dkl_bcyl; /* cyl offset (for fixed head area) */
- uint32_t dkl_nhead; /* # of heads */
- uint32_t dkl_nsect; /* # of data sectors per track */
- uint16_t dkl_intrlv; /* interleave factor */
- uint16_t dkl_skew; /* skew factor */
- uint16_t dkl_apc; /* alternates per cyl (SCSI only) */
- uint16_t dkl_rpm; /* revolutions per minute */
- uint16_t dkl_write_reinstruct; /* # sectors to skip, writes */
- uint16_t dkl_read_reinstruct; /* # sectors to skip, reads */
- uint16_t dkl_extra[4]; /* for compatible expansion */
- char dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */
-#elif defined(_SUNOS_VTOC_8)
- char dkl_asciilabel[LEN_DKL_ASCII]; /* for compatibility */
- struct dk_vtoc dkl_vtoc; /* vtoc inclusions from AT&T SVr4 */
- uint16_t dkl_write_reinstruct; /* # sectors to skip, writes */
- uint16_t dkl_read_reinstruct; /* # sectors to skip, reads */
- char dkl_pad[LEN_DKL_PAD]; /* unused part of 512 bytes */
- uint16_t dkl_rpm; /* rotations per minute */
- uint16_t dkl_pcyl; /* # physical cylinders */
- uint16_t dkl_apc; /* alternates per cylinder */
- uint16_t dkl_obs1; /* obsolete */
- uint16_t dkl_obs2; /* obsolete */
- uint16_t dkl_intrlv; /* interleave factor */
- uint16_t dkl_ncyl; /* # of data cylinders */
- uint16_t dkl_acyl; /* # of alternate cylinders */
- uint16_t dkl_nhead; /* # of heads in this partition */
- uint16_t dkl_nsect; /* # of 512 byte sectors per track */
- uint16_t dkl_obs3; /* obsolete */
- uint16_t dkl_obs4; /* obsolete */
- struct dk_map32 dkl_map[NDKMAP]; /* logical partition headers */
-#else
-#error "No VTOC format defined."
-#endif
- uint16_t dkl_magic; /* identifies this label format */
- uint16_t dkl_cksum; /* xor checksum of sector */
-};
-
-#if defined(_SUNOS_VTOC_16)
-#define dkl_asciilabel dkl_vtoc.v_asciilabel
-#define v_timestamp timestamp
-
-#elif defined(_SUNOS_VTOC_8)
-
-/*
- * These defines are for historic compatibility with old drivers.
- */
-#define dkl_gap1 dkl_obs1 /* used to be gap1 */
-#define dkl_gap2 dkl_obs2 /* used to be gap2 */
-#define dkl_bhead dkl_obs3 /* used to be label head offset */
-#define dkl_ppart dkl_obs4 /* used to be physical partition */
-#else
-#error "No VTOC format defined."
-#endif
-
-struct fk_label { /* DOS floppy label */
- uchar_t fkl_type;
- uchar_t fkl_magich;
- uchar_t fkl_magicl;
- uchar_t filler;
-};
-
-/*
- * Layout of stored fabricated device id (on-disk)
- */
-#define DK_DEVID_BLKSIZE (512)
-#define DK_DEVID_SIZE (DK_DEVID_BLKSIZE - ((sizeof (uchar_t) * 7)))
-#define DK_DEVID_REV_MSB (0)
-#define DK_DEVID_REV_LSB (1)
-
-struct dk_devid {
- uchar_t dkd_rev_hi; /* revision (MSB) */
- uchar_t dkd_rev_lo; /* revision (LSB) */
- uchar_t dkd_flags; /* flags (not used yet) */
- uchar_t dkd_devid[DK_DEVID_SIZE]; /* devid stored here */
- uchar_t dkd_checksum3; /* checksum (MSB) */
- uchar_t dkd_checksum2;
- uchar_t dkd_checksum1;
- uchar_t dkd_checksum0; /* checksum (LSB) */
-};
-
-#define DKD_GETCHKSUM(dkd) ((((uint32_t)(dkd)->dkd_checksum3) << 24) + /* MSB; unsigned so bit 31 is well-defined */ \
- ((dkd)->dkd_checksum2 << 16) + \
- ((dkd)->dkd_checksum1 << 8) + \
- ((dkd)->dkd_checksum0)) /* whole expansion parenthesized: safe next to ==, !=, *, etc. */
-
-#define DKD_FORMCHKSUM(c, dkd) (dkd)->dkd_checksum3 = hibyte(hiword((c))); \
- (dkd)->dkd_checksum2 = lobyte(hiword((c))); \
- (dkd)->dkd_checksum1 = hibyte(loword((c))); \
- (dkd)->dkd_checksum0 = lobyte(loword((c))); /* NOTE: four statements plus a trailing ';' -- brace it under if/else */
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_DKLABEL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/errorq.h b/sys/contrib/opensolaris/uts/common/sys/errorq.h
deleted file mode 100644
index 971b19e..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/errorq.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ERRORQ_H
-#define _ERRORQ_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/nvpair.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct errorq errorq_t;
-typedef struct errorq_elem errorq_elem_t;
-typedef void (*errorq_func_t)(void *, const void *, const errorq_elem_t *);
-
-/*
- * Public flags for errorq_create(): bit range 0-15
- */
-#define ERRORQ_VITAL 0x0001 /* drain queue automatically on system reset */
-
-/*
- * Public flags for errorq_dispatch():
- */
-#define ERRORQ_ASYNC 0 /* schedule async queue drain for caller */
-#define ERRORQ_SYNC 1 /* do not schedule drain; caller will drain */
-
-#ifdef _KERNEL
-
-extern errorq_t *errorq_create(const char *, errorq_func_t, void *,
- ulong_t, size_t, uint_t, uint_t);
-
-extern errorq_t *errorq_nvcreate(const char *, errorq_func_t, void *,
- ulong_t, size_t, uint_t, uint_t);
-
-extern void errorq_destroy(errorq_t *);
-extern void errorq_dispatch(errorq_t *, const void *, size_t, uint_t);
-extern void errorq_drain(errorq_t *);
-extern void errorq_init(void);
-extern void errorq_panic(void);
-extern errorq_elem_t *errorq_reserve(errorq_t *);
-extern void errorq_commit(errorq_t *, errorq_elem_t *, uint_t);
-extern void errorq_cancel(errorq_t *, errorq_elem_t *);
-extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *);
-extern nv_alloc_t *errorq_elem_nva(errorq_t *, const errorq_elem_t *);
-extern void *errorq_elem_dup(errorq_t *, const errorq_elem_t *,
- errorq_elem_t **);
-extern void errorq_dump(void);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ERRORQ_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/feature_tests.h b/sys/contrib/opensolaris/uts/common/sys/feature_tests.h
deleted file mode 100644
index bb79cb8..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/feature_tests.h
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FEATURE_TESTS_H
-#define _SYS_FEATURE_TESTS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/ccompile.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Values of _POSIX_C_SOURCE
- *
- * undefined not a POSIX compilation
- * 1 POSIX.1-1990 compilation
- * 2 POSIX.2-1992 compilation
- * 199309L POSIX.1b-1993 compilation (Real Time)
- * 199506L POSIX.1c-1995 compilation (POSIX Threads)
- * 200112L POSIX.1-2001 compilation (Austin Group Revision)
- */
-#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE)
-#define _POSIX_C_SOURCE 1
-#endif
-
-/*
- * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, and _STDC_C99
- * are Sun implementation specific macros created in order to compress
- * common standards specified feature test macros for easier reading.
- * These macros should not be used by the application developer as
- * unexpected results may occur. Instead, the user should reference
- * standards(5) for correct usage of the standards feature test macros.
- *
- * __XOPEN_OR_POSIX Used in cases where a symbol is defined by both
- * X/Open or POSIX or in the negative, when neither
- * X/Open or POSIX defines a symbol.
- *
- * _STRICT_STDC __STDC__ is specified by the C Standards and defined
- * by the compiler. For Sun compilers the value of
- * __STDC__ is either 1, 0, or not defined based on the
- * compilation mode (see cc(1)). When the value of
- * __STDC__ is 1 and in the absence of any other feature
- * test macros, the namespace available to the application
- * is limited to only those symbols defined by the C
- * Standard. _STRICT_STDC provides a more readable means
- * of identifying symbols defined by the standard, or in
- * the negative, symbols that are extensions to the C
- * Standard. See additional comments for GNU C differences.
- *
- * _STDC_C99 __STDC_VERSION__ is specified by the C standards and
- * defined by the compiler and indicates the version of
- * the C standard. A value of 199901L indicates a
- * compiler that complies with ISO/IEC 9899:1999, other-
- * wise known as the C99 standard.
- */
-
-#if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
-#define __XOPEN_OR_POSIX
-#endif
-
-/*
- * ISO/IEC 9899:1990 and it's revision, ISO/IEC 9899:1999 specify the
- * following predefined macro name:
- *
- * __STDC__ The integer constant 1, intended to indicate a conforming
- * implementation.
- *
- * Furthermore, a strictly conforming program shall use only those features
- * of the language and library specified in these standards. A conforming
- * implementation shall accept any strictly conforming program.
- *
- * Based on these requirements, Sun's C compiler defines __STDC__ to 1 for
- * strictly conforming environments and __STDC__ to 0 for environments that
- * use ANSI C semantics but allow extensions to the C standard. For non-ANSI
- * C semantics, Sun's C compiler does not define __STDC__.
- *
- * The GNU C project interpretation is that __STDC__ should always be defined
- * to 1 for compilation modes that accept ANSI C syntax regardless of whether
- * or not extensions to the C standard are used. Violations of conforming
- * behavior are conditionally flagged as warnings via the use of the
- * -pedantic option. In addition to defining __STDC__ to 1, the GNU C
- * compiler also defines __STRICT_ANSI__ as a means of specifying strictly
- * conforming environments using the -ansi or -std=<standard> options.
- *
- * In the absence of any other compiler options, Sun and GNU set the value
- * of __STDC__ as follows when using the following options:
- *
- * Value of __STDC__ __STRICT_ANSI__
- *
- * cc -Xa (default) 0 undefined
- * cc -Xt (transitional) 0 undefined
- * cc -Xc (strictly conforming) 1 undefined
- * cc -Xs (K&R C) undefined undefined
- *
- * gcc (default) 1 undefined
- * gcc -ansi, -std={c89, c99,...) 1 defined
- * gcc -traditional (K&R) undefined undefined
- *
- * The default compilation modes for Sun C compilers versus GNU C compilers
- * results in a differing value for __STDC__ which results in a more
- * restricted namespace when using Sun compilers. To allow both GNU and Sun
- * interpretations to peacefully co-exist, we use the following Sun
- * implementation _STRICT_STDC_ macro:
- */
-
-#if (__STDC__ - 0 == 1 && !defined(__GNUC__)) || \
- (defined(__GNUC__) && defined(__STRICT_ANSI__))
-#define _STRICT_STDC
-#else
-#undef _STRICT_STDC
-#endif
-
-/*
- * Compiler complies with ISO/IEC 9899:1999
- */
-
-#if __STDC_VERSION__ - 0 >= 199901L
-#ifndef _STDC_C99
-#define _STDC_C99
-#endif
-#endif
-
-/*
- * Large file interfaces:
- *
- * _LARGEFILE_SOURCE
- * 1 large file-related additions to POSIX
- * interfaces requested (fseeko, etc.)
- * _LARGEFILE64_SOURCE
- * 1 transitional large-file-related interfaces
- * requested (seek64, stat64, etc.)
- *
- * The corresponding announcement macros are respectively:
- * _LFS_LARGEFILE
- * _LFS64_LARGEFILE
- * (These are set in <unistd.h>.)
- *
- * Requesting _LARGEFILE64_SOURCE implies requesting _LARGEFILE_SOURCE as
- * well.
- *
- * The large file interfaces are made visible regardless of the initial values
- * of the feature test macros under certain circumstances:
- * - If no explicit standards-conforming environment is requested (neither
- * of _POSIX_SOURCE nor _XOPEN_SOURCE is defined and the value of
- * __STDC__ does not imply standards conformance).
- * - Extended system interfaces are explicitly requested (__EXTENSIONS__
- * is defined).
- * - Access to in-kernel interfaces is requested (_KERNEL or _KMEMUSER is
- * defined). (Note that this dependency is an artifact of the current
- * kernel implementation and may change in future releases.)
- */
-#if (!defined(_STRICT_STDC) && !defined(__XOPEN_OR_POSIX)) || \
- defined(_KERNEL) || defined(_KMEMUSER) || \
- defined(__EXTENSIONS__)
-#undef _LARGEFILE64_SOURCE
-#define _LARGEFILE64_SOURCE 1
-#endif
-#if _LARGEFILE64_SOURCE - 0 == 1
-#undef _LARGEFILE_SOURCE
-#define _LARGEFILE_SOURCE 1
-#endif
-
-/*
- * Large file compilation environment control:
- *
- * The setting of _FILE_OFFSET_BITS controls the size of various file-related
- * types and governs the mapping between file-related source function symbol
- * names and the corresponding binary entry points.
- *
- * In the 32-bit environment, the default value is 32; if not set, set it to
- * the default here, to simplify tests in other headers.
- *
- * In the 64-bit compilation environment, the only value allowed is 64.
- */
-#if defined(_LP64)
-#ifndef _FILE_OFFSET_BITS
-#define _FILE_OFFSET_BITS 64
-#endif
-#if _FILE_OFFSET_BITS - 0 != 64
-#error "invalid _FILE_OFFSET_BITS value specified"
-#endif
-#else /* _LP64 */
-#ifndef _FILE_OFFSET_BITS
-#define _FILE_OFFSET_BITS 32
-#endif
-#if _FILE_OFFSET_BITS - 0 != 32 && _FILE_OFFSET_BITS - 0 != 64
-#error "invalid _FILE_OFFSET_BITS value specified"
-#endif
-#endif /* _LP64 */
-
-/*
- * Use of _XOPEN_SOURCE
- *
- * The following X/Open specifications are supported:
- *
- * X/Open Portability Guide, Issue 3 (XPG3)
- * X/Open CAE Specification, Issue 4 (XPG4)
- * X/Open CAE Specification, Issue 4, Version 2 (XPG4v2)
- * X/Open CAE Specification, Issue 5 (XPG5)
- * Open Group Technical Standard, Issue 6 (XPG6), also referred to as
- * IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002.
- *
- * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1).
- * XPG5 is also referred to as UNIX 98 or the Single Unix Specification,
- * Version 2 (SUSv2)
- * XPG6 is the result of a merge of the X/Open and POSIX specifications
- * and as such is also referred to as IEEE Std. 1003.1-2001 in
- * addition to UNIX 03 and SUSv3.
- *
- * When writing a conforming X/Open application, as per the specification
- * requirements, the appropriate feature test macros must be defined at
- * compile time. These are as follows. For more info, see standards(5).
- *
- * Feature Test Macro Specification
- * ------------------------------------------------ -------------
- * _XOPEN_SOURCE XPG3
- * _XOPEN_SOURCE && _XOPEN_VERSION = 4 XPG4
- * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1 XPG4v2
- * _XOPEN_SOURCE = 500 XPG5
- * _XOPEN_SOURCE = 600 (or POSIX_C_SOURCE=200112L) XPG6
- *
- * In order to simplify the guards within the headers, the following
- * implementation private test macros have been created. Applications
- * must NOT use these private test macros as unexpected results will
- * occur.
- *
- * Note that in general, the use of these private macros is cumulative.
- * For example, the use of _XPG3 with no other restrictions on the X/Open
- * namespace will make the symbols visible for XPG3 through XPG6
- * compilation environments. The use of _XPG4_2 with no other X/Open
- * namespace restrictions indicates that the symbols were introduced in
- * XPG4v2 and are therefore visible for XPG4v2 through XPG6 compilation
- * environments, but not for XPG3 or XPG4 compilation environments.
- *
- * _XPG3 X/Open Portability Guide, Issue 3 (XPG3)
- * _XPG4 X/Open CAE Specification, Issue 4 (XPG4)
- * _XPG4_2 X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS)
- * _XPG5 X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2)
- * _XPG6 Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3)
- */
-
-/* X/Open Portability Guide, Issue 3 */
-#if defined(_XOPEN_SOURCE) && (_XOPEN_SOURCE - 0 < 500) && \
- (_XOPEN_VERSION - 0 < 4) && !defined(_XOPEN_SOURCE_EXTENDED)
-#define _XPG3
-/* X/Open CAE Specification, Issue 4 */
-#elif (defined(_XOPEN_SOURCE) && _XOPEN_VERSION - 0 == 4)
-#define _XPG4
-#define _XPG3
-/* X/Open CAE Specification, Issue 4, Version 2 */
-#elif (defined(_XOPEN_SOURCE) && _XOPEN_SOURCE_EXTENDED - 0 == 1)
-#define _XPG4_2
-#define _XPG4
-#define _XPG3
-/* X/Open CAE Specification, Issue 5 */
-#elif (_XOPEN_SOURCE - 0 == 500)
-#define _XPG5
-#define _XPG4_2
-#define _XPG4
-#define _XPG3
-#undef _POSIX_C_SOURCE
-#define _POSIX_C_SOURCE 199506L
-/* Open Group Technical Standard , Issue 6 */
-#elif (_XOPEN_SOURCE - 0 == 600) || (_POSIX_C_SOURCE - 0 == 200112L)
-#define _XPG6
-#define _XPG5
-#define _XPG4_2
-#define _XPG4
-#define _XPG3
-#undef _POSIX_C_SOURCE
-#define _POSIX_C_SOURCE 200112L
-#undef _XOPEN_SOURCE
-#define _XOPEN_SOURCE 600
-#endif
-
-/*
- * _XOPEN_VERSION is defined by the X/Open specifications and is not
- * normally defined by the application, except in the case of an XPG4
- * application. On the implementation side, _XOPEN_VERSION defined with
- * the value of 3 indicates an XPG3 application. _XOPEN_VERSION defined
- * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application.
- * _XOPEN_VERSION defined with a value of 500 indicates an XPG5 (UNIX 98)
- * application and with a value of 600 indicates an XPG6 (UNIX 03)
- * application. The appropriate version is determined by the use of the
- * feature test macros described earlier. The value of _XOPEN_VERSION
- * defaults to 3 otherwise indicating support for XPG3 applications.
- */
-#ifndef _XOPEN_VERSION
-#ifdef _XPG6
-#define _XOPEN_VERSION 600
-#elif defined(_XPG5)
-#define _XOPEN_VERSION 500
-#elif defined(_XPG4_2)
-#define _XOPEN_VERSION 4
-#else
-#define _XOPEN_VERSION 3
-#endif
-#endif
-
-/*
- * ANSI C and ISO 9899:1990 say the type long long doesn't exist in strictly
- * conforming environments. ISO 9899:1999 says it does.
- *
- * The presence of _LONGLONG_TYPE says "long long exists" which is therefore
- * defined in all but strictly conforming environments that disallow it.
- */
-#if !defined(_STDC_C99) && defined(_STRICT_STDC) && !defined(__GNUC__)
-/*
- * Resist attempts to force the definition of long long in this case.
- */
-#if defined(_LONGLONG_TYPE)
-#error "No long long in strictly conforming ANSI C & 1990 ISO C environments"
-#endif
-#else
-#if !defined(_LONGLONG_TYPE)
-#define _LONGLONG_TYPE
-#endif
-#endif
-
-/*
- * It is invalid to compile an XPG3, XPG4, XPG4v2, or XPG5 application
- * using c99. The same is true for POSIX.1-1990, POSIX.2-1992, POSIX.1b,
- * and POSIX.1c applications. Likewise, it is invalid to compile an XPG6
- * or a POSIX.1-2001 application with anything other than a c99 or later
- * compiler. Therefore, we force an error in both cases.
- */
-#if defined(_STDC_C99) && (defined(__XOPEN_OR_POSIX) && !defined(_XPG6))
-#error "Compiler or options invalid for pre-UNIX 03 X/Open applications \
- and pre-2001 POSIX applications"
-#elif !defined(_STDC_C99) && \
- (defined(__XOPEN_OR_POSIX) && defined(_XPG6))
-#error "Compiler or options invalid; UNIX 03 and POSIX.1-2001 applications \
- require the use of c99"
-#endif
-
-/*
- * The following macro defines a value for the ISO C99 restrict
- * keyword so that _RESTRICT_KYWD resolves to "restrict" if
- * an ISO C99 compiler is used and "" (null string) if any other
- * compiler is used. This allows for the use of single prototype
- * declarations regardless of compiler version.
- */
-#if (defined(__STDC__) && defined(_STDC_C99))
-#define _RESTRICT_KYWD restrict
-#else
-#define _RESTRICT_KYWD
-#endif
-
-/*
- * The following macro indicates header support for the ANSI C++
- * standard. The ISO/IEC designation for this is ISO/IEC FDIS 14882.
- */
-#define _ISO_CPP_14882_1998
-
-/*
- * The following macro indicates header support for the C99 standard,
- * ISO/IEC 9899:1999, Programming Languages - C.
- */
-#define _ISO_C_9899_1999
-
-/*
- * The following macro indicates header support for DTrace. The value is an
- * integer that corresponds to the major version number for DTrace.
- */
-#define _DTRACE_VERSION 1
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FEATURE_TESTS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h b/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
deleted file mode 100644
index aa5c7ee..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FM_FS_ZFS_H
-#define _SYS_FM_FS_ZFS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZFS_ERROR_CLASS "fs.zfs"
-
-#define FM_EREPORT_ZFS_CHECKSUM "checksum"
-#define FM_EREPORT_ZFS_IO "io"
-#define FM_EREPORT_ZFS_DATA "data"
-#define FM_EREPORT_ZFS_POOL "zpool"
-#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
-#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
-#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data"
-#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas"
-#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
-#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
-#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
-
-#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
-#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid"
-#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
-#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
-#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
-#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
-#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
-
-#define FM_RESOURCE_OK "ok"
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FM_FS_ZFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h b/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h
deleted file mode 100644
index a9980fe..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/fm/protocol.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FM_PROTOCOL_H
-#define _SYS_FM_PROTOCOL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-#include <sys/varargs.h>
-#include <sys/nvpair.h>
-#else
-#include <libnvpair.h>
-#include <stdarg.h>
-#endif
-
-/* FM common member names */
-#define FM_CLASS "class"
-#define FM_VERSION "version"
-
-/* FM event class values */
-#define FM_EREPORT_CLASS "ereport"
-#define FM_FAULT_CLASS "fault"
-#define FM_RSRC_CLASS "resource"
-#define FM_LIST_EVENT "list"
-
-/* FM list.* event class values */
-#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect"
-#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated"
-#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired"
-
-/* ereport class subcategory values */
-#define FM_ERROR_CPU "cpu"
-#define FM_ERROR_IO "io"
-
-/* ereport version and payload member names */
-#define FM_EREPORT_VERS0 0
-#define FM_EREPORT_VERSION FM_EREPORT_VERS0
-
-/* ereport payload member names */
-#define FM_EREPORT_DETECTOR "detector"
-#define FM_EREPORT_ENA "ena"
-
-/* list.* event payload member names */
-#define FM_LIST_EVENT_SIZE "list-sz"
-
-/* list.suspect, isolated, and repaired versions and payload member names */
-#define FM_SUSPECT_UUID "uuid"
-#define FM_SUSPECT_DIAG_CODE "code"
-#define FM_SUSPECT_DIAG_TIME "diag-time"
-#define FM_SUSPECT_DE "de"
-#define FM_SUSPECT_FAULT_LIST "fault-list"
-#define FM_SUSPECT_FAULT_SZ "fault-list-sz"
-#define FM_SUSPECT_FAULT_STATUS "fault-status"
-#define FM_SUSPECT_MESSAGE "message"
-
-#define FM_SUSPECT_VERS0 0
-#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0
-
-/* fault event versions and payload member names */
-#define FM_FAULT_VERS0 0
-#define FM_FAULT_VERSION FM_FAULT_VERS0
-
-#define FM_FAULT_ASRU "asru"
-#define FM_FAULT_FRU "fru"
-#define FM_FAULT_FRU_LABEL "fru-label"
-#define FM_FAULT_CERTAINTY "certainty"
-#define FM_FAULT_RESOURCE "resource"
-#define FM_FAULT_LOCATION "location"
-
-/* resource event versions and payload member names */
-#define FM_RSRC_VERS0 0
-#define FM_RSRC_VERSION FM_RSRC_VERS0
-#define FM_RSRC_RESOURCE "resource"
-
-/* resource.fm.asru.* payload member names */
-#define FM_RSRC_ASRU_UUID "uuid"
-#define FM_RSRC_ASRU_CODE "code"
-#define FM_RSRC_ASRU_FAULTY "faulty"
-#define FM_RSRC_ASRU_UNUSABLE "unusable"
-#define FM_RSRC_ASRU_EVENT "event"
-
-/* resource.fm.xprt.* versions and payload member names */
-#define FM_RSRC_XPRT_VERS0 0
-#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0
-#define FM_RSRC_XPRT_UUID "uuid"
-#define FM_RSRC_XPRT_SUBCLASS "subclass"
-
-/*
- * FM ENA Format Macros
- */
-#define ENA_FORMAT_MASK 0x3
-#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK)
-
-/* ENA format types */
-#define FM_ENA_FMT0 0
-#define FM_ENA_FMT1 1
-#define FM_ENA_FMT2 2
-
-/* Format 1 */
-#define ENA_FMT1_GEN_MASK 0x00000000000003FCull
-#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull
-#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull
-#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull
-#define ENA_FMT1_GEN_SHFT 2
-#define ENA_FMT1_ID_SHFT 10
-#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT
-#define ENA_FMT1_TIME_SHFT 20
-
-/* Format 2 */
-#define ENA_FMT2_GEN_MASK 0x00000000000003FCull
-#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull
-#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK
-#define ENA_FMT2_GEN_SHFT 2
-#define ENA_FMT2_ID_SHFT 10
-#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT
-
-/* Common FMRI type names */
-#define FM_FMRI_AUTHORITY "authority"
-#define FM_FMRI_SCHEME "scheme"
-#define FM_FMRI_SVC_AUTHORITY "svc-authority"
-
-/* FMRI authority-type member names */
-#define FM_FMRI_AUTH_CHASSIS "chassis-id"
-#define FM_FMRI_AUTH_PRODUCT "product-id"
-#define FM_FMRI_AUTH_DOMAIN "domain-id"
-#define FM_FMRI_AUTH_SERVER "server-id"
-#define FM_FMRI_AUTH_HOST "host-id"
-
-#define FM_AUTH_VERS0 0
-#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0
-
-/* scheme name values */
-#define FM_FMRI_SCHEME_FMD "fmd"
-#define FM_FMRI_SCHEME_DEV "dev"
-#define FM_FMRI_SCHEME_HC "hc"
-#define FM_FMRI_SCHEME_SVC "svc"
-#define FM_FMRI_SCHEME_CPU "cpu"
-#define FM_FMRI_SCHEME_MEM "mem"
-#define FM_FMRI_SCHEME_MOD "mod"
-#define FM_FMRI_SCHEME_PKG "pkg"
-#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
-#define FM_FMRI_SCHEME_ZFS "zfs"
-
-/* Scheme versions */
-#define FMD_SCHEME_VERSION0 0
-#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0
-#define DEV_SCHEME_VERSION0 0
-#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0
-#define FM_HC_VERS0 0
-#define FM_HC_SCHEME_VERSION FM_HC_VERS0
-#define CPU_SCHEME_VERSION0 0
-#define CPU_SCHEME_VERSION1 1
-#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1
-#define MEM_SCHEME_VERSION0 0
-#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0
-#define MOD_SCHEME_VERSION0 0
-#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0
-#define PKG_SCHEME_VERSION0 0
-#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
-#define LEGACY_SCHEME_VERSION0 0
-#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
-#define ZFS_SCHEME_VERSION0 0
-#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
-
-/* hc scheme member names */
-#define FM_FMRI_HC_SERIAL_ID "serial"
-#define FM_FMRI_HC_PART "part"
-#define FM_FMRI_HC_REVISION "revision"
-#define FM_FMRI_HC_ROOT "hc-root"
-#define FM_FMRI_HC_LIST_SZ "hc-list-sz"
-#define FM_FMRI_HC_LIST "hc-list"
-#define FM_FMRI_HC_SPECIFIC "hc-specific"
-
-/* hc-list version and member names */
-#define FM_FMRI_HC_NAME "hc-name"
-#define FM_FMRI_HC_ID "hc-id"
-
-#define HC_LIST_VERSION0 0
-#define FM_HC_LIST_VERSION HC_LIST_VERSION0
-
-/* hc-specific member names */
-#define FM_FMRI_HC_SPECIFIC_OFFSET "offset"
-
-/* fmd module scheme member names */
-#define FM_FMRI_FMD_NAME "mod-name"
-#define FM_FMRI_FMD_VERSION "mod-version"
-
-/* dev scheme member names */
-#define FM_FMRI_DEV_ID "devid"
-#define FM_FMRI_DEV_PATH "device-path"
-
-/* pkg scheme member names */
-#define FM_FMRI_PKG_BASEDIR "pkg-basedir"
-#define FM_FMRI_PKG_INST "pkg-inst"
-#define FM_FMRI_PKG_VERSION "pkg-version"
-
-/* svc scheme member names */
-#define FM_FMRI_SVC_NAME "service-name"
-#define FM_FMRI_SVC_VERSION "service-version"
-#define FM_FMRI_SVC_INSTANCE "instance"
-#define FM_FMRI_SVC_CONTRACT_ID "contract-id"
-
-/* svc-authority member names */
-#define FM_FMRI_SVC_AUTH_SCOPE "scope"
-#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN"
-
-/* cpu scheme member names */
-#define FM_FMRI_CPU_ID "cpuid"
-#define FM_FMRI_CPU_SERIAL_ID "serial"
-#define FM_FMRI_CPU_MASK "cpumask"
-#define FM_FMRI_CPU_VID "cpuvid"
-#define FM_FMRI_CPU_CPUFRU "cpufru"
-
-/* legacy-hc scheme member names */
-#define FM_FMRI_LEGACY_HC "component"
-#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \
- FM_FMRI_LEGACY_HC"="
-
-/* mem scheme member names */
-#define FM_FMRI_MEM_UNUM "unum"
-#define FM_FMRI_MEM_SERIAL_ID "serial"
-#define FM_FMRI_MEM_PHYSADDR "physaddr"
-#define FM_FMRI_MEM_MEMCONFIG "memconfig"
-#define FM_FMRI_MEM_OFFSET "offset"
-
-/* mod scheme member names */
-#define FM_FMRI_MOD_PKG "mod-pkg"
-#define FM_FMRI_MOD_NAME "mod-name"
-#define FM_FMRI_MOD_ID "mod-id"
-#define FM_FMRI_MOD_DESC "mod-desc"
-
-/* zfs scheme member names */
-#define FM_FMRI_ZFS_POOL "pool"
-#define FM_FMRI_ZFS_VDEV "vdev"
-
-extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
-extern void fm_nva_xdestroy(nv_alloc_t *);
-
-extern nvlist_t *fm_nvlist_create(nv_alloc_t *);
-extern void fm_nvlist_destroy(nvlist_t *, int);
-
-#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */
-#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */
-
-extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t,
- const nvlist_t *, ...);
-extern void fm_payload_set(nvlist_t *, ...);
-extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
-extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
- int, ...);
-extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
- const char *);
-extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
-extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
- uint8_t *, const char *);
-extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
- const char *, uint64_t);
-extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
- const char *, const char *);
-extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
-
-extern uint64_t fm_ena_increment(uint64_t);
-extern uint64_t fm_ena_generate(uint64_t, uchar_t);
-extern uint64_t fm_ena_generation_get(uint64_t);
-extern uchar_t fm_ena_format_get(uint64_t);
-extern uint64_t fm_ena_id_get(uint64_t);
-extern uint64_t fm_ena_time_get(uint64_t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FM_PROTOCOL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fm/util.h b/sys/contrib/opensolaris/uts/common/sys/fm/util.h
deleted file mode 100644
index f65e0ab4..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/fm/util.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FM_UTIL_H
-#define _SYS_FM_UTIL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/nvpair.h>
-#include <sys/errorq.h>
-
-/*
- * Shared user/kernel definitions for class length, error channel name,
- * and kernel event publisher string.
- */
-#define FM_MAX_CLASS 100
-#define FM_ERROR_CHAN "com.sun:fm:error"
-#define FM_PUB "fm"
-
-/*
- * ereport dump device transport support
- *
- * Ereports are written out to the dump device at a proscribed offset from the
- * end, similar to in-transit log messages. The ereports are represented as a
- * erpt_dump_t header followed by ed_size bytes of packed native nvlist data.
- *
- * NOTE: All of these constants and the header must be defined so they have the
- * same representation for *both* 32-bit and 64-bit producers and consumers.
- */
-#define ERPT_MAGIC 0xf00d4eddU
-#define ERPT_MAX_ERRS 16
-#define ERPT_DATA_SZ (6 * 1024)
-#define ERPT_EVCH_MAX 256
-#define ERPT_HIWAT 64
-
-typedef struct erpt_dump {
- uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */
- uint32_t ed_chksum; /* checksum32() of packed nvlist data */
- uint32_t ed_size; /* ereport (nvl) fixed buf size */
- uint32_t ed_pad; /* reserved for future use */
- hrtime_t ed_hrt_nsec; /* hrtime of this ereport */
- hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */
- struct {
- uint64_t sec; /* seconds since gettimeofday() Epoch */
- uint64_t nsec; /* nanoseconds past ed_tod_base.sec */
- } ed_tod_base;
-} erpt_dump_t;
-
-#ifdef _KERNEL
-#include <sys/systm.h>
-
-#define FM_STK_DEPTH 20 /* maximum stack depth */
-#define FM_SYM_SZ 64 /* maximum symbol size */
-#define FM_ERR_PIL 2 /* PIL for ereport_errorq drain processing */
-
-#define FM_EREPORT_PAYLOAD_NAME_STACK "stack"
-
-extern errorq_t *ereport_errorq;
-extern void *ereport_dumpbuf;
-extern size_t ereport_dumplen;
-
-extern void fm_init(void);
-extern void fm_nvprint(nvlist_t *);
-extern void fm_panic(const char *, ...);
-extern void fm_banner(void);
-
-extern void fm_ereport_dump(void);
-extern void fm_ereport_post(nvlist_t *, int);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FM_UTIL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h
deleted file mode 100644
index bcf8594..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_FS_ZFS_H
-#define _SYS_FS_ZFS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/ioccom.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Types and constants shared between userland and the kernel.
- */
-
-/*
- * Each dataset can be one of the following types. These constants can be
- * combined into masks that can be passed to various functions.
- */
-typedef enum {
- ZFS_TYPE_FILESYSTEM = 0x1,
- ZFS_TYPE_SNAPSHOT = 0x2,
- ZFS_TYPE_VOLUME = 0x4,
- ZFS_TYPE_POOL = 0x8
-} zfs_type_t;
-
-#define ZFS_TYPE_ANY \
- (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
-
-/*
- * Properties are identified by these constants and must be added to the
- * end of this list to ensure that external conumsers are not affected
- * by the change. The property list also determines how 'zfs get' will
- * display them. If you make any changes to this list, be sure to update
- * the property table in usr/src/common/zfs/zfs_prop.c.
- */
-typedef enum {
- ZFS_PROP_CONT = -2,
- ZFS_PROP_INVAL = -1,
- ZFS_PROP_TYPE,
- ZFS_PROP_CREATION,
- ZFS_PROP_USED,
- ZFS_PROP_AVAILABLE,
- ZFS_PROP_REFERENCED,
- ZFS_PROP_COMPRESSRATIO,
- ZFS_PROP_MOUNTED,
- ZFS_PROP_ORIGIN,
- ZFS_PROP_QUOTA,
- ZFS_PROP_RESERVATION,
- ZFS_PROP_VOLSIZE,
- ZFS_PROP_VOLBLOCKSIZE,
- ZFS_PROP_RECORDSIZE,
- ZFS_PROP_MOUNTPOINT,
- ZFS_PROP_SHARENFS,
- ZFS_PROP_CHECKSUM,
- ZFS_PROP_COMPRESSION,
- ZFS_PROP_ATIME,
- ZFS_PROP_DEVICES,
- ZFS_PROP_EXEC,
- ZFS_PROP_SETUID,
- ZFS_PROP_READONLY,
- ZFS_PROP_ZONED,
- ZFS_PROP_SNAPDIR,
- ZFS_PROP_ACLMODE,
- ZFS_PROP_ACLINHERIT,
- ZFS_PROP_CREATETXG, /* not exposed to the user */
- ZFS_PROP_NAME, /* not exposed to the user */
- ZFS_PROP_CANMOUNT,
- ZFS_PROP_SHAREISCSI,
- ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
- ZFS_PROP_XATTR,
- ZFS_PROP_NUMCLONES, /* not exposed to the user */
- ZFS_PROP_COPIES,
- ZFS_PROP_BOOTFS
-} zfs_prop_t;
-
-typedef zfs_prop_t zpool_prop_t;
-
-#define ZFS_PROP_VALUE "value"
-#define ZFS_PROP_SOURCE "source"
-
-typedef enum {
- ZFS_SRC_NONE = 0x1,
- ZFS_SRC_DEFAULT = 0x2,
- ZFS_SRC_TEMPORARY = 0x4,
- ZFS_SRC_LOCAL = 0x8,
- ZFS_SRC_INHERITED = 0x10
-} zfs_source_t;
-
-#define ZFS_SRC_ALL 0x1f
-
-/*
- * The following functions are shared between libzfs and the kernel.
- */
-zfs_prop_t zfs_name_to_prop(const char *);
-zpool_prop_t zpool_name_to_prop(const char *);
-boolean_t zfs_prop_user(const char *);
-int zfs_prop_readonly(zfs_prop_t);
-const char *zfs_prop_default_string(zfs_prop_t);
-const char *zfs_prop_to_name(zfs_prop_t);
-const char *zpool_prop_to_name(zfs_prop_t);
-uint64_t zfs_prop_default_numeric(zfs_prop_t);
-int zfs_prop_inheritable(zfs_prop_t);
-int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
-int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
-
-/*
- * Property Iterator
- */
-typedef zfs_prop_t (*zfs_prop_f)(zfs_prop_t, void *);
-typedef zfs_prop_f zpool_prop_f;
-extern zfs_prop_t zfs_prop_iter(zfs_prop_f, void *, boolean_t);
-extern zpool_prop_t zpool_prop_iter(zpool_prop_f, void *, boolean_t);
-
-/*
- * On-disk version number.
- */
-#define ZFS_VERSION_1 1ULL
-#define ZFS_VERSION_2 2ULL
-#define ZFS_VERSION_3 3ULL
-#define ZFS_VERSION_4 4ULL
-#define ZFS_VERSION_5 5ULL
-#define ZFS_VERSION_6 6ULL
-/*
- * When bumping up ZFS_VERSION, make sure GRUB ZFS understand the on-disk
- * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
- * and do the appropriate changes.
- */
-#define ZFS_VERSION ZFS_VERSION_6
-#define ZFS_VERSION_STRING "6"
-
-/*
- * Symbolic names for the changes that caused a ZFS_VERSION switch.
- * Used in the code when checking for presence or absence of a feature.
- * Feel free to define multiple symbolic names for each version if there
- * were multiple changes to on-disk structures during that version.
- *
- * NOTE: When checking the current ZFS_VERSION in your code, be sure
- * to use spa_version() since it reports the version of the
- * last synced uberblock. Checking the in-flight version can
- * be dangerous in some cases.
- */
-#define ZFS_VERSION_INITIAL ZFS_VERSION_1
-#define ZFS_VERSION_DITTO_BLOCKS ZFS_VERSION_2
-#define ZFS_VERSION_SPARES ZFS_VERSION_3
-#define ZFS_VERSION_RAID6 ZFS_VERSION_3
-#define ZFS_VERSION_BPLIST_ACCOUNT ZFS_VERSION_3
-#define ZFS_VERSION_RAIDZ_DEFLATE ZFS_VERSION_3
-#define ZFS_VERSION_DNODE_BYTES ZFS_VERSION_3
-#define ZFS_VERSION_ZPOOL_HISTORY ZFS_VERSION_4
-#define ZFS_VERSION_GZIP_COMPRESSION ZFS_VERSION_5
-#define ZFS_VERSION_BOOTFS ZFS_VERSION_6
-
-/*
- * The following are configuration names used in the nvlist describing a pool's
- * configuration.
- */
-#define ZPOOL_CONFIG_VERSION "version"
-#define ZPOOL_CONFIG_POOL_NAME "name"
-#define ZPOOL_CONFIG_POOL_STATE "state"
-#define ZPOOL_CONFIG_POOL_TXG "txg"
-#define ZPOOL_CONFIG_POOL_GUID "pool_guid"
-#define ZPOOL_CONFIG_CREATE_TXG "create_txg"
-#define ZPOOL_CONFIG_TOP_GUID "top_guid"
-#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree"
-#define ZPOOL_CONFIG_TYPE "type"
-#define ZPOOL_CONFIG_CHILDREN "children"
-#define ZPOOL_CONFIG_ID "id"
-#define ZPOOL_CONFIG_GUID "guid"
-#define ZPOOL_CONFIG_PATH "path"
-#define ZPOOL_CONFIG_DEVID "devid"
-#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
-#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
-#define ZPOOL_CONFIG_ASHIFT "ashift"
-#define ZPOOL_CONFIG_ASIZE "asize"
-#define ZPOOL_CONFIG_DTL "DTL"
-#define ZPOOL_CONFIG_STATS "stats"
-#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
-#define ZPOOL_CONFIG_OFFLINE "offline"
-#define ZPOOL_CONFIG_ERRCOUNT "error_count"
-#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
-#define ZPOOL_CONFIG_SPARES "spares"
-#define ZPOOL_CONFIG_IS_SPARE "is_spare"
-#define ZPOOL_CONFIG_NPARITY "nparity"
-#define ZPOOL_CONFIG_HOSTID "hostid"
-#define ZPOOL_CONFIG_HOSTNAME "hostname"
-#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
-
-#define VDEV_TYPE_ROOT "root"
-#define VDEV_TYPE_MIRROR "mirror"
-#define VDEV_TYPE_REPLACING "replacing"
-#define VDEV_TYPE_RAIDZ "raidz"
-#define VDEV_TYPE_DISK "disk"
-#define VDEV_TYPE_FILE "file"
-#define VDEV_TYPE_MISSING "missing"
-#define VDEV_TYPE_SPARE "spare"
-
-/*
- * This is needed in userland to report the minimum necessary device size.
- */
-#define SPA_MINDEVSIZE (64ULL << 20)
-
-/*
- * The location of the pool configuration repository, shared between kernel and
- * userland.
- */
-#define ZPOOL_CACHE_DIR "/boot/zfs"
-#define ZPOOL_CACHE_FILE "zpool.cache"
-#define ZPOOL_CACHE_TMP ".zpool.cache"
-
-#define ZPOOL_CACHE ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE
-
-/*
- * vdev states are ordered from least to most healthy.
- * A vdev that's CANT_OPEN or below is considered unusable.
- */
-typedef enum vdev_state {
- VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
- VDEV_STATE_CLOSED, /* Not currently open */
- VDEV_STATE_OFFLINE, /* Not allowed to open */
- VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
- VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
- VDEV_STATE_HEALTHY /* Presumed good */
-} vdev_state_t;
-
-/*
- * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field
- * of the vdev stats structure uses these constants to distinguish why.
- */
-typedef enum vdev_aux {
- VDEV_AUX_NONE, /* no error */
- VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */
- VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */
- VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */
- VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */
- VDEV_AUX_TOO_SMALL, /* vdev size is too small */
- VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */
- VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */
- VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
- VDEV_AUX_SPARED /* hot spare used in another pool */
-} vdev_aux_t;
-
-/*
- * pool state. The following states are written to disk as part of the normal
- * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
- * software abstractions used at various levels to communicate pool state.
- */
-typedef enum pool_state {
- POOL_STATE_ACTIVE = 0, /* In active use */
- POOL_STATE_EXPORTED, /* Explicitly exported */
- POOL_STATE_DESTROYED, /* Explicitly destroyed */
- POOL_STATE_SPARE, /* Reserved for hot spare use */
- POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
- POOL_STATE_UNAVAIL, /* Internal libzfs state */
- POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
-} pool_state_t;
-
-/*
- * Scrub types.
- */
-typedef enum pool_scrub_type {
- POOL_SCRUB_NONE,
- POOL_SCRUB_RESILVER,
- POOL_SCRUB_EVERYTHING,
- POOL_SCRUB_TYPES
-} pool_scrub_type_t;
-
-/*
- * ZIO types. Needed to interpret vdev statistics below.
- */
-typedef enum zio_type {
- ZIO_TYPE_NULL = 0,
- ZIO_TYPE_READ,
- ZIO_TYPE_WRITE,
- ZIO_TYPE_FREE,
- ZIO_TYPE_CLAIM,
- ZIO_TYPE_IOCTL,
- ZIO_TYPES
-} zio_type_t;
-
-/*
- * Vdev statistics. Note: all fields should be 64-bit because this
- * is passed between kernel and userland as an nvlist uint64 array.
- */
-typedef struct vdev_stat {
- hrtime_t vs_timestamp; /* time since vdev load */
- uint64_t vs_state; /* vdev state */
- uint64_t vs_aux; /* see vdev_aux_t */
- uint64_t vs_alloc; /* space allocated */
- uint64_t vs_space; /* total capacity */
- uint64_t vs_dspace; /* deflated capacity */
- uint64_t vs_rsize; /* replaceable dev size */
- uint64_t vs_ops[ZIO_TYPES]; /* operation count */
- uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
- uint64_t vs_read_errors; /* read errors */
- uint64_t vs_write_errors; /* write errors */
- uint64_t vs_checksum_errors; /* checksum errors */
- uint64_t vs_self_healed; /* self-healed bytes */
- uint64_t vs_scrub_type; /* pool_scrub_type_t */
- uint64_t vs_scrub_complete; /* completed? */
- uint64_t vs_scrub_examined; /* bytes examined; top */
- uint64_t vs_scrub_repaired; /* bytes repaired; leaf */
- uint64_t vs_scrub_errors; /* errors during scrub */
- uint64_t vs_scrub_start; /* UTC scrub start time */
- uint64_t vs_scrub_end; /* UTC scrub end time */
-} vdev_stat_t;
-
-#define ZFS_DRIVER "zfs"
-#define ZFS_DEV_NAME "zfs"
-#define ZFS_DEV "/dev/" ZFS_DEV_NAME
-
-/*
- * zvol paths. Irritatingly, the devfsadm interfaces want all these
- * paths without the /dev prefix, but for some things, we want the
- * /dev prefix. Below are the names without /dev.
- */
-#define ZVOL_DEV_DIR "zvol"
-
-/*
- * And here are the things we need with /dev, etc. in front of them.
- */
-#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:"
-#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR
-
-#define ZVOL_PROP_NAME "name"
-
-/*
- * /dev/zfs ioctl numbers.
- */
-typedef unsigned long zfs_ioc_t;
-
-#define ZFS_IOC(ioreq) ((ioreq) & 0xff)
-
-#define ZFS_IOC_POOL_CREATE _IOWR('Z', 0, struct zfs_cmd)
-#define ZFS_IOC_POOL_DESTROY _IOWR('Z', 1, struct zfs_cmd)
-#define ZFS_IOC_POOL_IMPORT _IOWR('Z', 2, struct zfs_cmd)
-#define ZFS_IOC_POOL_EXPORT _IOWR('Z', 3, struct zfs_cmd)
-#define ZFS_IOC_POOL_CONFIGS _IOWR('Z', 4, struct zfs_cmd)
-#define ZFS_IOC_POOL_STATS _IOWR('Z', 5, struct zfs_cmd)
-#define ZFS_IOC_POOL_TRYIMPORT _IOWR('Z', 6, struct zfs_cmd)
-#define ZFS_IOC_POOL_SCRUB _IOWR('Z', 7, struct zfs_cmd)
-#define ZFS_IOC_POOL_FREEZE _IOWR('Z', 8, struct zfs_cmd)
-#define ZFS_IOC_POOL_UPGRADE _IOWR('Z', 9, struct zfs_cmd)
-#define ZFS_IOC_POOL_GET_HISTORY _IOWR('Z', 10, struct zfs_cmd)
-#define ZFS_IOC_POOL_LOG_HISTORY _IOWR('Z', 11, struct zfs_cmd)
-#define ZFS_IOC_VDEV_ADD _IOWR('Z', 12, struct zfs_cmd)
-#define ZFS_IOC_VDEV_REMOVE _IOWR('Z', 13, struct zfs_cmd)
-#define ZFS_IOC_VDEV_ONLINE _IOWR('Z', 14, struct zfs_cmd)
-#define ZFS_IOC_VDEV_OFFLINE _IOWR('Z', 15, struct zfs_cmd)
-#define ZFS_IOC_VDEV_ATTACH _IOWR('Z', 16, struct zfs_cmd)
-#define ZFS_IOC_VDEV_DETACH _IOWR('Z', 17, struct zfs_cmd)
-#define ZFS_IOC_VDEV_SETPATH _IOWR('Z', 18, struct zfs_cmd)
-#define ZFS_IOC_OBJSET_STATS _IOWR('Z', 19, struct zfs_cmd)
-#define ZFS_IOC_DATASET_LIST_NEXT _IOWR('Z', 20, struct zfs_cmd)
-#define ZFS_IOC_SNAPSHOT_LIST_NEXT _IOWR('Z', 21, struct zfs_cmd)
-#define ZFS_IOC_SET_PROP _IOWR('Z', 22, struct zfs_cmd)
-#define ZFS_IOC_CREATE_MINOR _IOWR('Z', 23, struct zfs_cmd)
-#define ZFS_IOC_REMOVE_MINOR _IOWR('Z', 24, struct zfs_cmd)
-#define ZFS_IOC_CREATE _IOWR('Z', 25, struct zfs_cmd)
-#define ZFS_IOC_DESTROY _IOWR('Z', 26, struct zfs_cmd)
-#define ZFS_IOC_ROLLBACK _IOWR('Z', 27, struct zfs_cmd)
-#define ZFS_IOC_RENAME _IOWR('Z', 28, struct zfs_cmd)
-#define ZFS_IOC_RECVBACKUP _IOWR('Z', 29, struct zfs_cmd)
-#define ZFS_IOC_SENDBACKUP _IOWR('Z', 30, struct zfs_cmd)
-#define ZFS_IOC_INJECT_FAULT _IOWR('Z', 31, struct zfs_cmd)
-#define ZFS_IOC_CLEAR_FAULT _IOWR('Z', 32, struct zfs_cmd)
-#define ZFS_IOC_INJECT_LIST_NEXT _IOWR('Z', 33, struct zfs_cmd)
-#define ZFS_IOC_ERROR_LOG _IOWR('Z', 34, struct zfs_cmd)
-#define ZFS_IOC_CLEAR _IOWR('Z', 35, struct zfs_cmd)
-#define ZFS_IOC_PROMOTE _IOWR('Z', 36, struct zfs_cmd)
-#define ZFS_IOC_DESTROY_SNAPS _IOWR('Z', 37, struct zfs_cmd)
-#define ZFS_IOC_SNAPSHOT _IOWR('Z', 38, struct zfs_cmd)
-#define ZFS_IOC_DSOBJ_TO_DSNAME _IOWR('Z', 39, struct zfs_cmd)
-#define ZFS_IOC_OBJ_TO_PATH _IOWR('Z', 40, struct zfs_cmd)
-#define ZFS_IOC_POOL_SET_PROPS _IOWR('Z', 41, struct zfs_cmd)
-#define ZFS_IOC_POOL_GET_PROPS _IOWR('Z', 42, struct zfs_cmd)
-#define ZFS_IOC_JAIL _IOWR('Z', 43, struct zfs_cmd)
-#define ZFS_IOC_UNJAIL _IOWR('Z', 44, struct zfs_cmd)
-
-/*
- * Internal SPA load state. Used by FMA diagnosis engine.
- */
-typedef enum {
- SPA_LOAD_NONE, /* no load in progress */
- SPA_LOAD_OPEN, /* normal open */
- SPA_LOAD_IMPORT, /* import in progress */
- SPA_LOAD_TRYIMPORT /* tryimport in progress */
-} spa_load_state_t;
-
-/*
- * Bookmark name values.
- */
-#define ZPOOL_ERR_LIST "error list"
-#define ZPOOL_ERR_DATASET "dataset"
-#define ZPOOL_ERR_OBJECT "object"
-
-#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1)
-
-/*
- * The following are names used in the nvlist describing
- * the pool's history log.
- */
-#define ZPOOL_HIST_RECORD "history record"
-#define ZPOOL_HIST_TIME "history time"
-#define ZPOOL_HIST_CMD "history command"
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_FS_ZFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/gfs.h b/sys/contrib/opensolaris/uts/common/sys/gfs.h
deleted file mode 100644
index 8e70f29..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/gfs.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_GFS_H
-#define _SYS_GFS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/vnode.h>
-#include <sys/mutex.h>
-#include <sys/dirent.h>
-#include <sys/uio.h>
-#include <sys/list.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GFS_CACHE_VNODE 0x1
-
-typedef struct gfs_dirent {
- char *gfse_name; /* entry name */
- vnode_t *(*gfse_ctor)(vnode_t *); /* constructor */
- int gfse_flags; /* flags */
- list_node_t gfse_link; /* dynamic list */
- vnode_t *gfse_vnode; /* cached vnode */
-} gfs_dirent_t;
-
-typedef enum gfs_type {
- GFS_DIR,
- GFS_FILE
-} gfs_type_t;
-
-typedef struct gfs_file {
- vnode_t *gfs_vnode; /* current vnode */
- vnode_t *gfs_parent; /* parent vnode */
- size_t gfs_size; /* size of private data structure */
- gfs_type_t gfs_type; /* type of vnode */
- int gfs_index; /* index in parent dir */
- ino64_t gfs_ino; /* inode for this vnode */
-} gfs_file_t;
-
-typedef int (*gfs_readdir_cb)(vnode_t *, struct dirent64 *, int *, offset_t *,
- offset_t *, void *);
-typedef int (*gfs_lookup_cb)(vnode_t *, const char *, vnode_t **, ino64_t *);
-typedef ino64_t (*gfs_inode_cb)(vnode_t *, int);
-
-typedef struct gfs_dir {
- gfs_file_t gfsd_file; /* generic file attributes */
- gfs_dirent_t *gfsd_static; /* statically defined entries */
- int gfsd_nstatic; /* # static entries */
- kmutex_t gfsd_lock; /* protects entries */
- int gfsd_maxlen; /* maximum name length */
- gfs_readdir_cb gfsd_readdir; /* readdir() callback */
- gfs_lookup_cb gfsd_lookup; /* lookup() callback */
- gfs_inode_cb gfsd_inode; /* get an inode number */
-} gfs_dir_t;
-
-struct vfs;
-
-extern vnode_t *gfs_file_create(size_t, vnode_t *, vfs_t *, vnodeops_t *);
-extern vnode_t *gfs_dir_create(size_t, vnode_t *, vfs_t *, vnodeops_t *,
- gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb);
-extern vnode_t *gfs_root_create(size_t, vfs_t *, vnodeops_t *, ino64_t,
- gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb);
-extern vnode_t *gfs_root_create_file(size_t, struct vfs *, vnodeops_t *,
- ino64_t);
-
-extern void *gfs_file_inactive(vnode_t *);
-extern void *gfs_dir_inactive(vnode_t *);
-
-extern int gfs_dir_lookup(vnode_t *, const char *, vnode_t **);
-extern int gfs_dir_readdir(vnode_t *, uio_t *, int *, int *, u_long **, void *);
-
-#define gfs_dir_lock(gd) mutex_enter(&(gd)->gfsd_lock)
-#define gfs_dir_unlock(gd) mutex_exit(&(gd)->gfsd_lock)
-
-#define gfs_file_parent(vp) (((gfs_file_t *)(vp)->v_data)->gfs_parent)
-
-#define gfs_file_index(vp) (((gfs_file_t *)(vp)->v_data)->gfs_index)
-#define gfs_file_set_index(vp, idx) \
- (((gfs_file_t *)(vp)->v_data)->gfs_index = (idx))
-
-#define gfs_file_inode(vp) (((gfs_file_t *)(vp)->v_data)->gfs_ino)
-#define gfs_file_set_inode(vp, ino) \
- (((gfs_file_t *)(vp)->v_data)->gfs_ino = (ino))
-
-typedef struct gfs_readdir_state {
- struct dirent64 *grd_dirent; /* directory entry buffer */
- size_t grd_namlen; /* max file name length */
- size_t grd_ureclen; /* exported record size */
- ssize_t grd_oresid; /* original uio_resid */
- ino64_t grd_parent; /* inode of parent */
- ino64_t grd_self; /* inode of self */
-} gfs_readdir_state_t;
-
-extern int gfs_readdir_init(gfs_readdir_state_t *, int, int, uio_t *, ino64_t,
- ino64_t);
-extern int gfs_readdir_emit(gfs_readdir_state_t *, uio_t *, offset_t, ino64_t,
- const char *, int *, u_long **);
-extern int gfs_readdir_pred(gfs_readdir_state_t *, uio_t *, offset_t *, int *,
- u_long **);
-extern int gfs_readdir_fini(gfs_readdir_state_t *, int, int *, int);
-
-extern int gfs_lookup_dot(vnode_t **, vnode_t *, vnode_t *, const char *);
-
-extern int gfs_vop_readdir(struct vop_readdir_args *);
-extern int gfs_vop_inactive(struct vop_inactive_args *);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_GFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/isa_defs.h b/sys/contrib/opensolaris/uts/common/sys/isa_defs.h
deleted file mode 100644
index a65d16a..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/isa_defs.h
+++ /dev/null
@@ -1,485 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ISA_DEFS_H
-#define _SYS_ISA_DEFS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * This header file serves to group a set of well known defines and to
- * set these for each instruction set architecture. These defines may
- * be divided into two groups; characteristics of the processor and
- * implementation choices for Solaris on a processor.
- *
- * Processor Characteristics:
- *
- * _LITTLE_ENDIAN / _BIG_ENDIAN:
- * The natural byte order of the processor. A pointer to an int points
- * to the least/most significant byte of that int.
- *
- * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD:
- * The processor specific direction of stack growth. A push onto the
- * stack increases/decreases the stack pointer, so it stores data at
- * successively higher/lower addresses. (Stackless machines ignored
- * without regrets).
- *
- * _LONG_LONG_HTOL / _LONG_LONG_LTOH:
- * A pointer to a long long points to the most/least significant long
- * within that long long.
- *
- * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH:
- * The C compiler assigns bit fields from the high/low to the low/high end
- * of an int (most to least significant vs. least to most significant).
- *
- * _IEEE_754:
- * The processor (or supported implementations of the processor)
- * supports the ieee-754 floating point standard. No other floating
- * point standards are supported (or significant). Any other supported
- * floating point formats are expected to be cased on the ISA processor
- * symbol.
- *
- * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED:
- * The C Compiler implements objects of type `char' as `unsigned' or
- * `signed' respectively. This is really an implementation choice of
- * the compiler writer, but it is specified in the ABI and tends to
- * be uniform across compilers for an instruction set architecture.
- * Hence, it has the properties of a processor characteristic.
- *
- * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT /
- * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT /
- * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT:
- * The ABI defines alignment requirements of each of the primitive
- * object types. Some, if not all, may be hardware requirements as
- * well. The values are expressed in "byte-alignment" units.
- *
- * _MAX_ALIGNMENT:
- * The most stringent alignment requirement as specified by the ABI.
- * Equal to the maximum of all the above _XXX_ALIGNMENT values.
- *
- * _ALIGNMENT_REQUIRED:
- * True or false (1 or 0) whether or not the hardware requires the ABI
- * alignment.
- *
- * _LONG_LONG_ALIGNMENT_32
- * The 32-bit ABI supported by a 64-bit kernel may have different
- * alignment requirements for primitive object types. The value of this
- * identifier is expressed in "byte-alignment" units.
- *
- * _HAVE_CPUID_INSN
- * This indicates that the architecture supports the 'cpuid'
- * instruction as defined by Intel. (Intel allows other vendors
- * to extend the instruction for their own purposes.)
- *
- *
- * Implementation Choices:
- *
- * _ILP32 / _LP64:
- * This specifies the compiler data type implementation as specified in
- * the relevant ABI. The choice between these is strongly influenced
- * by the underlying hardware, but is not absolutely tied to it.
- * Currently only two data type models are supported:
- *
- * _ILP32:
- * Int/Long/Pointer are 32 bits. This is the historical UNIX
- * and Solaris implementation. Due to its historical standing,
- * this is the default case.
- *
- * _LP64:
- * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen
- * implementation for 64-bit ABIs such as SPARC V9.
- *
- * _I32LPx:
- * A compilation environment where 'int' is 32-bit, and
- * longs and pointers are simply the same size.
- *
- * In all cases, Char is 8 bits and Short is 16 bits.
- *
- * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16:
- * This specifies the form of the disk VTOC (or label):
- *
- * _SUNOS_VTOC_8:
- * This is a VTOC form which is upwardly compatible with the
- * SunOS 4.x disk label and allows 8 partitions per disk.
- *
- * _SUNOS_VTOC_16:
- * In this format the incore vtoc image matches the ondisk
- * version. It allows 16 slices per disk, and is not
- * compatible with the SunOS 4.x disk label.
- *
- * Note that these are not the only two VTOC forms possible and
- * additional forms may be added. One possible form would be the
- * SVr4 VTOC form. The symbol for that is reserved now, although
- * it is not implemented.
- *
- * _SVR4_VTOC_16:
- * This VTOC form is compatible with the System V Release 4
- * VTOC (as implemented on the SVr4 Intel and 3b ports) with
- * 16 partitions per disk.
- *
- *
- * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR
- * This describes the type of addresses used by system DMA:
- *
- * _DMA_USES_PHYSADDR:
- * This type of DMA, used in the x86 implementation,
- * requires physical addresses for DMA buffers. The 24-bit
- * addresses used by some legacy boards are the source of the
- * "low-memory" (<16MB) requirement for some devices using DMA.
- *
- * _DMA_USES_VIRTADDR:
- * This method of DMA allows the use of virtual addresses for
- * DMA transfers.
- *
- * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT
- * This indicates the presence/absence of an fdisk table.
- *
- * _FIRMWARE_NEEDS_FDISK
- * The fdisk table is required by system firmware. If present,
- * it allows a disk to be subdivided into multiple fdisk
- * partitions, each of which is equivalent to a separate,
- * virtual disk. This enables the co-existence of multiple
- * operating systems on a shared hard disk.
- *
- * _NO_FDISK_PRESENT
- * If the fdisk table is absent, it is assumed that the entire
- * media is allocated for a single operating system.
- *
- * _HAVE_TEM_FIRMWARE
- * Defined if this architecture has the (fallback) option of
- * using prom_* calls for doing I/O if a suitable kernel driver
- * is not available to do it.
- *
- * _DONT_USE_1275_GENERIC_NAMES
- * Controls whether or not device tree node names should
- * comply with the IEEE 1275 "Generic Names" Recommended
- * Practice. With _DONT_USE_1275_GENERIC_NAMES, device-specific
- * names identifying the particular device will be used.
- *
- * __i386_COMPAT
- * This indicates whether the i386 ABI is supported as a *non-native*
- * mode for the platform. When this symbol is defined:
- * - 32-bit xstat-style system calls are enabled
- * - 32-bit xmknod-style system calls are enabled
- * - 32-bit system calls use i386 sizes -and- alignments
- *
- * Note that this is NOT defined for the i386 native environment!
- *
- * __x86
- * This is ONLY a synonym for defined(__i386) || defined(__amd64)
- * which is useful only insofar as these two architectures share
- * common attributes. Analogous to __sparc.
- *
- * _PSM_MODULES
- * This indicates whether or not the implementation uses PSM
- * modules for processor support, reading /etc/mach from inside
- * the kernel to extract a list.
- *
- * _RTC_CONFIG
- * This indicates whether or not the implementation uses /etc/rtc_config
- * to configure the real-time clock in the kernel.
- *
- * _UNIX_KRTLD
- * This indicates that the implementation uses a dynamically
- * linked unix + krtld to form the core kernel image at boot
- * time, or (in the absence of this symbol) a prelinked kernel image.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * The following set of definitions characterize Solaris on AMD's
- * 64-bit systems.
- */
-#if defined(__x86_64) || defined(__amd64)
-
-#if !defined(__amd64)
-#define __amd64 /* preferred guard */
-#endif
-
-#if !defined(__x86)
-#define __x86
-#endif
-
-/*
- * Define the appropriate "processor characteristics"
- */
-#if defined(sun)
-#define _LITTLE_ENDIAN
-#endif
-#define _STACK_GROWS_DOWNWARD
-#define _LONG_LONG_LTOH
-#define _BIT_FIELDS_LTOH
-#define _IEEE_754
-#define _CHAR_IS_SIGNED
-#define _BOOL_ALIGNMENT 1
-#define _CHAR_ALIGNMENT 1
-#define _SHORT_ALIGNMENT 2
-#define _INT_ALIGNMENT 4
-#define _FLOAT_ALIGNMENT 4
-#define _FLOAT_COMPLEX_ALIGNMENT 4
-#define _LONG_ALIGNMENT 8
-#define _LONG_LONG_ALIGNMENT 8
-#define _DOUBLE_ALIGNMENT 8
-#define _DOUBLE_COMPLEX_ALIGNMENT 8
-#define _LONG_DOUBLE_ALIGNMENT 16
-#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
-#define _POINTER_ALIGNMENT 8
-#define _MAX_ALIGNMENT 16
-#define _ALIGNMENT_REQUIRED 1
-
-/*
- * Different alignment constraints for the i386 ABI in compatibility mode
- */
-#define _LONG_LONG_ALIGNMENT_32 4
-
-/*
- * Define the appropriate "implementation choices".
- */
-#if !defined(_LP64)
-#define _LP64
-#endif
-#if !defined(_I32LPx) && defined(_KERNEL)
-#define _I32LPx
-#endif
-#define _MULTI_DATAMODEL
-#define _SUNOS_VTOC_16
-#define _DMA_USES_PHYSADDR
-#define _FIRMWARE_NEEDS_FDISK
-#define __i386_COMPAT
-#define _PSM_MODULES
-#define _RTC_CONFIG
-#define _DONT_USE_1275_GENERIC_NAMES
-#define _HAVE_CPUID_INSN
-
-/*
- * The feature test macro __i386 is generic for all processors implementing
- * the Intel 386 instruction set or a superset of it. Specifically, this
- * includes all members of the 386, 486, and Pentium family of processors.
- */
-#elif defined(__i386) || defined(__i386__)
-
-#if !defined(__i386)
-#define __i386
-#endif
-
-#if !defined(__x86)
-#define __x86
-#endif
-
-/*
- * Define the appropriate "processor characteristics"
- */
-#if defined(sun)
-#define _LITTLE_ENDIAN
-#endif
-#define _STACK_GROWS_DOWNWARD
-#define _LONG_LONG_LTOH
-#define _BIT_FIELDS_LTOH
-#define _IEEE_754
-#define _CHAR_IS_SIGNED
-#define _BOOL_ALIGNMENT 1
-#define _CHAR_ALIGNMENT 1
-#define _SHORT_ALIGNMENT 2
-#define _INT_ALIGNMENT 4
-#define _FLOAT_ALIGNMENT 4
-#define _FLOAT_COMPLEX_ALIGNMENT 4
-#define _LONG_ALIGNMENT 4
-#define _LONG_LONG_ALIGNMENT 4
-#define _DOUBLE_ALIGNMENT 4
-#define _DOUBLE_COMPLEX_ALIGNMENT 4
-#define _LONG_DOUBLE_ALIGNMENT 4
-#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4
-#define _POINTER_ALIGNMENT 4
-#define _MAX_ALIGNMENT 4
-#define _ALIGNMENT_REQUIRED 0
-
-#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
-
-/*
- * Define the appropriate "implementation choices".
- */
-#define _ILP32
-#if !defined(_I32LPx) && defined(_KERNEL)
-#define _I32LPx
-#endif
-#define _SUNOS_VTOC_16
-#define _DMA_USES_PHYSADDR
-#define _FIRMWARE_NEEDS_FDISK
-#define _PSM_MODULES
-#define _RTC_CONFIG
-#define _DONT_USE_1275_GENERIC_NAMES
-#define _HAVE_CPUID_INSN
-
-/*
- * The following set of definitions characterize Solaris on SPARC systems.
- *
- * The symbol __sparc indicates any of the SPARC family of processor
- * architectures. This includes SPARC V7, SPARC V8 and SPARC V9.
- *
- * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined
- * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough
- * to SPARC V8 for the former to be subsumed into the latter definition.)
- *
- * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined
- * by Version 9 of the SPARC Architecture Manual.
- *
- * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only
- * relevant when the symbol __sparc is defined.
- */
-/*
- * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added
- * to support backwards builds. This workaround should be removed in s10_71.
- */
-#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__)
-#if !defined(__sparc)
-#define __sparc
-#endif
-
-/*
- * You can be 32-bit or 64-bit, but not both at the same time.
- */
-#if defined(__sparcv8) && defined(__sparcv9)
-#error "SPARC Versions 8 and 9 are mutually exclusive choices"
-#endif
-
-/*
- * Existing compilers do not set __sparcv8. Years will transpire before
- * the compilers can be depended on to set the feature test macro. In
- * the interim, we'll set it here on the basis of historical behaviour;
- * if you haven't asked for SPARC V9, then you must've meant SPARC V8.
- */
-#if !defined(__sparcv9) && !defined(__sparcv8)
-#define __sparcv8
-#endif
-
-/*
- * Define the appropriate "processor characteristics" shared between
- * all Solaris on SPARC systems.
- */
-#if defined(sun)
-#define _BIG_ENDIAN
-#endif
-#define _STACK_GROWS_DOWNWARD
-#define _LONG_LONG_HTOL
-#define _BIT_FIELDS_HTOL
-#define _IEEE_754
-#define _CHAR_IS_SIGNED
-#define _BOOL_ALIGNMENT 1
-#define _CHAR_ALIGNMENT 1
-#define _SHORT_ALIGNMENT 2
-#define _INT_ALIGNMENT 4
-#define _FLOAT_ALIGNMENT 4
-#define _FLOAT_COMPLEX_ALIGNMENT 4
-#define _LONG_LONG_ALIGNMENT 8
-#define _DOUBLE_ALIGNMENT 8
-#define _DOUBLE_COMPLEX_ALIGNMENT 8
-#define _ALIGNMENT_REQUIRED 1
-
-/*
- * Define the appropriate "implementation choices" shared between versions.
- */
-#define _SUNOS_VTOC_8
-#define _DMA_USES_VIRTADDR
-#define _NO_FDISK_PRESENT
-#define _HAVE_TEM_FIRMWARE
-#define _UNIX_KRTLD
-
-/*
- * The following set of definitions characterize the implementation of
- * 32-bit Solaris on SPARC V8 systems.
- */
-#if defined(__sparcv8)
-
-/*
- * Define the appropriate "processor characteristics"
- */
-#define _LONG_ALIGNMENT 4
-#define _LONG_DOUBLE_ALIGNMENT 8
-#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8
-#define _POINTER_ALIGNMENT 4
-#define _MAX_ALIGNMENT 8
-
-#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT
-
-/*
- * Define the appropriate "implementation choices"
- */
-#define _ILP32
-#if !defined(_I32LPx) && defined(_KERNEL)
-#define _I32LPx
-#endif
-
-/*
- * The following set of definitions characterize the implementation of
- * 64-bit Solaris on SPARC V9 systems.
- */
-#elif defined(__sparcv9)
-
-/*
- * Define the appropriate "processor characteristics"
- */
-#define _LONG_ALIGNMENT 8
-#define _LONG_DOUBLE_ALIGNMENT 16
-#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16
-#define _POINTER_ALIGNMENT 8
-#define _MAX_ALIGNMENT 16
-
-#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT /* same as native */
-
-/*
- * Define the appropriate "implementation choices"
- */
-#if !defined(_LP64)
-#define _LP64
-#endif
-#if !defined(_I32LPx)
-#define _I32LPx
-#endif
-#define _MULTI_DATAMODEL
-
-#else
-#error "unknown SPARC version"
-#endif
-
-/*
- * #error is strictly ansi-C, but works as well as anything for K&R systems.
- */
-#else
-#error "ISA not supported"
-#endif
-
-#if defined(_ILP32) && defined(_LP64)
-#error "Both _ILP32 and _LP64 are defined"
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ISA_DEFS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/list.h b/sys/contrib/opensolaris/uts/common/sys/list.h
deleted file mode 100644
index 7e9d9aa..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/list.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_LIST_H
-#define _SYS_LIST_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/list_impl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct list_node list_node_t;
-typedef struct list list_t;
-
-void list_create(list_t *, size_t, size_t);
-void list_destroy(list_t *);
-
-void list_insert_after(list_t *, void *, void *);
-void list_insert_before(list_t *, void *, void *);
-void list_insert_head(list_t *, void *);
-void list_insert_tail(list_t *, void *);
-void list_remove(list_t *, void *);
-void list_move_tail(list_t *, list_t *);
-
-void *list_head(list_t *);
-void *list_tail(list_t *);
-void *list_next(list_t *, void *);
-void *list_prev(list_t *, void *);
-
-int list_link_active(list_node_t *);
-int list_is_empty(list_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_LIST_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/list_impl.h b/sys/contrib/opensolaris/uts/common/sys/list_impl.h
deleted file mode 100644
index 9c42f88..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/list_impl.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_LIST_IMPL_H
-#define _SYS_LIST_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct list_node {
- struct list_node *list_next;
- struct list_node *list_prev;
-};
-
-struct list {
- size_t list_size;
- size_t list_offset;
- struct list_node list_head;
-};
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_LIST_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/note.h b/sys/contrib/opensolaris/uts/common/sys/note.h
deleted file mode 100644
index 2cb7fd8..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/note.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 1994 by Sun Microsystems, Inc.
- */
-
-/*
- * sys/note.h: interface for annotating source with info for tools
- *
- * This is the underlying interface; NOTE (/usr/include/note.h) is the
- * preferred interface, but all exported header files should include this
- * file directly and use _NOTE so as not to take "NOTE" from the user's
- * namespace. For consistency, *all* kernel source should use _NOTE.
- *
- * By default, annotations expand to nothing. This file implements
- * that. Tools using annotations will interpose a different version
- * of this file that will expand annotations as needed.
- */
-
-#ifndef _SYS_NOTE_H
-#define _SYS_NOTE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _NOTE
-#define _NOTE(s)
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_NOTE_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/contrib/opensolaris/uts/common/sys/nvpair.h
deleted file mode 100644
index 306e30f..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/nvpair.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_NVPAIR_H
-#define _SYS_NVPAIR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/errno.h>
-
-#if defined(_KERNEL) && !defined(_BOOT)
-#include <sys/kmem.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
- DATA_TYPE_UNKNOWN = 0,
- DATA_TYPE_BOOLEAN,
- DATA_TYPE_BYTE,
- DATA_TYPE_INT16,
- DATA_TYPE_UINT16,
- DATA_TYPE_INT32,
- DATA_TYPE_UINT32,
- DATA_TYPE_INT64,
- DATA_TYPE_UINT64,
- DATA_TYPE_STRING,
- DATA_TYPE_BYTE_ARRAY,
- DATA_TYPE_INT16_ARRAY,
- DATA_TYPE_UINT16_ARRAY,
- DATA_TYPE_INT32_ARRAY,
- DATA_TYPE_UINT32_ARRAY,
- DATA_TYPE_INT64_ARRAY,
- DATA_TYPE_UINT64_ARRAY,
- DATA_TYPE_STRING_ARRAY,
- DATA_TYPE_HRTIME,
- DATA_TYPE_NVLIST,
- DATA_TYPE_NVLIST_ARRAY,
- DATA_TYPE_BOOLEAN_VALUE,
- DATA_TYPE_INT8,
- DATA_TYPE_UINT8,
- DATA_TYPE_BOOLEAN_ARRAY,
- DATA_TYPE_INT8_ARRAY,
- DATA_TYPE_UINT8_ARRAY
-} data_type_t;
-
-typedef struct nvpair {
- int32_t nvp_size; /* size of this nvpair */
- int16_t nvp_name_sz; /* length of name string */
- int16_t nvp_reserve; /* not used */
- int32_t nvp_value_elem; /* number of elements for array types */
- data_type_t nvp_type; /* type of value */
- /* name string */
- /* aligned ptr array for string arrays */
- /* aligned array of data for value */
-} nvpair_t;
-
-/* nvlist header */
-typedef struct nvlist {
- int32_t nvl_version;
- uint32_t nvl_nvflag; /* persistent flags */
- uint64_t nvl_priv; /* ptr to private data if not packed */
- uint32_t nvl_flag;
- int32_t nvl_pad; /* currently not used, for alignment */
-} nvlist_t;
-
-/* nvp implementation version */
-#define NV_VERSION 0
-
-/* nvlist pack encoding */
-#define NV_ENCODE_NATIVE 0
-#define NV_ENCODE_XDR 1
-
-/* nvlist persistent unique name flags, stored in nvl_nvflag */
-#define NV_UNIQUE_NAME 0x1
-#define NV_UNIQUE_NAME_TYPE 0x2
-
-/* nvlist lookup pairs related flags */
-#define NV_FLAG_NOENTOK 0x1
-
-/* convenience macros */
-#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul)
-#define NV_ALIGN4(x) (((x) + 3) & ~3)
-
-#define NVP_SIZE(nvp) ((nvp)->nvp_size)
-#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t))
-#define NVP_TYPE(nvp) ((nvp)->nvp_type)
-#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem)
-#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \
- + (nvp)->nvp_name_sz))
-
-#define NVL_VERSION(nvl) ((nvl)->nvl_version)
-#define NVL_SIZE(nvl) ((nvl)->nvl_size)
-#define NVL_FLAG(nvl) ((nvl)->nvl_flag)
-
-/* NV allocator framework */
-typedef struct nv_alloc_ops nv_alloc_ops_t;
-
-typedef struct nv_alloc {
- const nv_alloc_ops_t *nva_ops;
- void *nva_arg;
-} nv_alloc_t;
-
-struct nv_alloc_ops {
- int (*nv_ao_init)(nv_alloc_t *, __va_list);
- void (*nv_ao_fini)(nv_alloc_t *);
- void *(*nv_ao_alloc)(nv_alloc_t *, size_t);
- void (*nv_ao_free)(nv_alloc_t *, void *, size_t);
- void (*nv_ao_reset)(nv_alloc_t *);
-};
-
-extern const nv_alloc_ops_t *nv_fixed_ops;
-extern nv_alloc_t *nv_alloc_nosleep;
-
-#if defined(_KERNEL) && !defined(_BOOT)
-extern nv_alloc_t *nv_alloc_sleep;
-#endif
-
-int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...);
-void nv_alloc_reset(nv_alloc_t *);
-void nv_alloc_fini(nv_alloc_t *);
-
-/* list management */
-int nvlist_alloc(nvlist_t **, uint_t, int);
-void nvlist_free(nvlist_t *);
-int nvlist_size(nvlist_t *, size_t *, int);
-int nvlist_pack(nvlist_t *, char **, size_t *, int, int);
-int nvlist_unpack(char *, size_t, nvlist_t **, int);
-int nvlist_dup(nvlist_t *, nvlist_t **, int);
-int nvlist_merge(nvlist_t *, nvlist_t *, int);
-
-int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
-int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
-int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
-int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *);
-nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *);
-
-int nvlist_add_nvpair(nvlist_t *, nvpair_t *);
-int nvlist_add_boolean(nvlist_t *, const char *);
-int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
-int nvlist_add_byte(nvlist_t *, const char *, uchar_t);
-int nvlist_add_int8(nvlist_t *, const char *, int8_t);
-int nvlist_add_uint8(nvlist_t *, const char *, uint8_t);
-int nvlist_add_int16(nvlist_t *, const char *, int16_t);
-int nvlist_add_uint16(nvlist_t *, const char *, uint16_t);
-int nvlist_add_int32(nvlist_t *, const char *, int32_t);
-int nvlist_add_uint32(nvlist_t *, const char *, uint32_t);
-int nvlist_add_int64(nvlist_t *, const char *, int64_t);
-int nvlist_add_uint64(nvlist_t *, const char *, uint64_t);
-int nvlist_add_string(nvlist_t *, const char *, const char *);
-int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
-int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
-int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
-int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
-int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
-int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
-int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
-int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
-int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
-int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
-int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
-int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t);
-int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
-int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t);
-
-int nvlist_remove(nvlist_t *, const char *, data_type_t);
-int nvlist_remove_all(nvlist_t *, const char *);
-
-int nvlist_lookup_boolean(nvlist_t *, const char *);
-int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
-int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *);
-int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *);
-int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *);
-int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *);
-int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *);
-int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *);
-int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *);
-int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *);
-int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *);
-int nvlist_lookup_string(nvlist_t *, const char *, char **);
-int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **);
-int nvlist_lookup_boolean_array(nvlist_t *, const char *,
- boolean_t **, uint_t *);
-int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *);
-int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *);
-int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *);
-int nvlist_lookup_int16_array(nvlist_t *, const char *, int16_t **, uint_t *);
-int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *);
-int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *);
-int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *);
-int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *);
-int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *);
-int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *);
-int nvlist_lookup_nvlist_array(nvlist_t *, const char *,
- nvlist_t ***, uint_t *);
-int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *);
-int nvlist_lookup_pairs(nvlist_t *nvl, int, ...);
-
-/* processing nvpair */
-nvpair_t *nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *);
-char *nvpair_name(nvpair_t *);
-data_type_t nvpair_type(nvpair_t *);
-int nvpair_value_boolean_value(nvpair_t *, boolean_t *);
-int nvpair_value_byte(nvpair_t *, uchar_t *);
-int nvpair_value_int8(nvpair_t *, int8_t *);
-int nvpair_value_uint8(nvpair_t *, uint8_t *);
-int nvpair_value_int16(nvpair_t *, int16_t *);
-int nvpair_value_uint16(nvpair_t *, uint16_t *);
-int nvpair_value_int32(nvpair_t *, int32_t *);
-int nvpair_value_uint32(nvpair_t *, uint32_t *);
-int nvpair_value_int64(nvpair_t *, int64_t *);
-int nvpair_value_uint64(nvpair_t *, uint64_t *);
-int nvpair_value_string(nvpair_t *, char **);
-int nvpair_value_nvlist(nvpair_t *, nvlist_t **);
-int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *);
-int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *);
-int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *);
-int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *);
-int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *);
-int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *);
-int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *);
-int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *);
-int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *);
-int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *);
-int nvpair_value_string_array(nvpair_t *, char ***, uint_t *);
-int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *);
-int nvpair_value_hrtime(nvpair_t *, hrtime_t *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_NVPAIR_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h
deleted file mode 100644
index f12dbbf..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/nvpair_impl.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _NVPAIR_IMPL_H
-#define _NVPAIR_IMPL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/nvpair.h>
-
-/*
- * The structures here are provided for information and debugging purposes
- * only and may be changed in the future.
- */
-
-/*
- * implementation linked list for pre-packed data
- */
-typedef struct i_nvp i_nvp_t;
-
-struct i_nvp {
- union {
- uint64_t _nvi_align; /* ensure alignment */
- struct {
- i_nvp_t *_nvi_next; /* pointer to next nvpair */
- i_nvp_t *_nvi_prev; /* pointer to prev nvpair */
- } _nvi;
- } _nvi_un;
- nvpair_t nvi_nvp; /* nvpair */
-};
-#define nvi_next _nvi_un._nvi._nvi_next
-#define nvi_prev _nvi_un._nvi._nvi_prev
-
-typedef struct {
- i_nvp_t *nvp_list; /* linked list of nvpairs */
- i_nvp_t *nvp_last; /* last nvpair */
- i_nvp_t *nvp_curr; /* current walker nvpair */
- nv_alloc_t *nvp_nva; /* pluggable allocator */
- uint32_t nvp_stat; /* internal state */
-} nvpriv_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _NVPAIR_IMPL_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/processor.h b/sys/contrib/opensolaris/uts/common/sys/processor.h
deleted file mode 100644
index 063f7dacb..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/processor.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
- * All Rights Reserved
- *
- */
-
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_PROCESSOR_H
-#define _SYS_PROCESSOR_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/procset.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Definitions for p_online, processor_info & lgrp system calls.
- */
-
-/*
- * Type for an lgrpid
- */
-typedef uint16_t lgrpid_t;
-
-/*
- * Type for processor name (CPU number).
- */
-typedef int processorid_t;
-typedef int chipid_t;
-
-/*
- * Flags and return values for p_online(2), and pi_state for processor_info(2).
- * These flags are *not* for in-kernel examination of CPU states.
- * See <sys/cpuvar.h> for appropriate informational functions.
- */
-#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */
-#define P_ONLINE 0x0002 /* processor is online */
-#define P_STATUS 0x0003 /* value passed to p_online to request status */
-#define P_FAULTED 0x0004 /* processor is offline, in faulted state */
-#define P_POWEROFF 0x0005 /* processor is powered off */
-#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */
-#define P_SPARE 0x0007 /* processor is offline, can be reactivated */
-#define P_BAD P_FAULTED /* unused but defined by USL */
-#define P_FORCED 0x10000000 /* force processor offline */
-
-/*
- * String names for processor states defined above.
- */
-#define PS_OFFLINE "off-line"
-#define PS_ONLINE "on-line"
-#define PS_FAULTED "faulted"
-#define PS_POWEROFF "powered-off"
-#define PS_NOINTR "no-intr"
-#define PS_SPARE "spare"
-
-/*
- * Structure filled in by processor_info(2).
- *
- * The string fields are guaranteed to contain a terminating NUL.
- *
- * The pi_fputypes field contains a (possibly empty) comma-separated
- * list of floating point identifier strings.
- */
-#define PI_TYPELEN 16 /* max size of CPU type string */
-#define PI_FPUTYPE 32 /* max size of FPU types string */
-
-typedef struct {
- int pi_state; /* processor state, see above */
- char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */
- char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */
- int pi_clock; /* CPU clock freq in MHz */
-} processor_info_t;
-
-/*
- * Binding values for processor_bind(2)
- */
-#define PBIND_NONE -1 /* LWP/thread is not bound */
-#define PBIND_QUERY -2 /* don't set, just return the binding */
-
-/*
- * User-level system call interface prototypes
- */
-#ifndef _KERNEL
-#ifdef __STDC__
-
-extern int p_online(processorid_t processorid, int flag);
-extern int processor_info(processorid_t processorid,
- processor_info_t *infop);
-extern int processor_bind(idtype_t idtype, id_t id,
- processorid_t processorid, processorid_t *obind);
-extern processorid_t getcpuid(void);
-extern lgrpid_t gethomelgroup(void);
-
-#else
-
-extern int p_online();
-extern int processor_info();
-extern int processor_bind();
-extern processorid_t getcpuid();
-extern lgrpid_t gethomelgroup();
-
-#endif /* __STDC__ */
-
-#else /* _KERNEL */
-
-/*
- * Internal interface prototypes
- */
-extern int p_online_internal(processorid_t, int, int *);
-
-#endif /* !_KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_PROCESSOR_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/procset.h b/sys/contrib/opensolaris/uts/common/sys/procset.h
deleted file mode 100644
index c367c93..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/procset.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-
-#ifndef _SYS_PROCSET_H
-#define _SYS_PROCSET_H
-
-#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.6 */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/feature_tests.h>
-#include <sys/types.h>
-#include <sys/signal.h>
-
-/*
- * This file defines the data needed to specify a set of
- * processes. These types are used by the sigsend, sigsendset,
- * priocntl, priocntlset, waitid, evexit, and evexitset system
- * calls.
- */
-#define P_INITPID 1
-#define P_INITUID 0
-#define P_INITPGID 0
-
-
-/*
- * The following defines the values for an identifier type. It
- * specifies the interpretation of an id value. An idtype and
- * id together define a simple set of processes.
- */
-typedef enum
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
- idtype /* pollutes XPG4.2 namespace */
-#endif
- {
- P_PID, /* A process identifier. */
- P_PPID, /* A parent process identifier. */
- P_PGID, /* A process group (job control group) */
- /* identifier. */
- P_SID, /* A session identifier. */
- P_CID, /* A scheduling class identifier. */
- P_UID, /* A user identifier. */
- P_GID, /* A group identifier. */
- P_ALL, /* All processes. */
- P_LWPID, /* An LWP identifier. */
- P_TASKID, /* A task identifier. */
- P_PROJID, /* A project identifier. */
- P_POOLID, /* A pool identifier. */
- P_ZONEID, /* A zone identifier. */
- P_CTID, /* A (process) contract identifier. */
- P_CPUID, /* CPU identifier. */
- P_PSETID /* Processor set identifier */
-} idtype_t;
-
-
-/*
- * The following defines the operations which can be performed to
- * combine two simple sets of processes to form another set of
- * processes.
- */
-#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
-typedef enum idop {
- POP_DIFF, /* Set difference. The processes which */
- /* are in the left operand set and not */
- /* in the right operand set. */
- POP_AND, /* Set intersection. The processes */
- /* which are in both the left and right */
- /* operand sets. */
- POP_OR, /* Set union. The processes */
- /* which are in either the left or the */
- /* right operand sets (or both). */
- POP_XOR /* Set exclusive or. The processes */
- /* which are in either the left or */
- /* right operand sets but not in both. */
-} idop_t;
-
-
-/*
- * The following structure is used to define a set of processes.
- * The set is defined in terms of two simple sets of processes
- * and an operator which operates on these two operand sets.
- */
-typedef struct procset {
- idop_t p_op; /* The operator connecting the */
- /* following two operands each */
- /* of which is a simple set of */
- /* processes. */
-
- idtype_t p_lidtype;
- /* The type of the left operand */
- /* simple set. */
- id_t p_lid; /* The id of the left operand. */
-
- idtype_t p_ridtype;
- /* The type of the right */
- /* operand simple set. */
- id_t p_rid; /* The id of the right operand. */
-} procset_t;
-
-/*
- * The following macro can be used to initialize a procset_t
- * structure.
- */
-#define setprocset(psp, op, ltype, lid, rtype, rid) \
- (psp)->p_op = (op); \
- (psp)->p_lidtype = (ltype); \
- (psp)->p_lid = (lid); \
- (psp)->p_ridtype = (rtype); \
- (psp)->p_rid = (rid);
-
-#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
-
-#if defined(sun)
-#ifdef _KERNEL
-
-struct proc;
-
-extern int dotoprocs(procset_t *, int (*)(), char *);
-extern int dotolwp(procset_t *, int (*)(), char *);
-extern int procinset(struct proc *, procset_t *);
-extern int sigsendproc(struct proc *, sigsend_t *);
-extern int sigsendset(procset_t *, sigsend_t *);
-extern boolean_t cur_inset_only(procset_t *);
-extern id_t getmyid(idtype_t);
-
-#endif /* _KERNEL */
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_PROCSET_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/sdt.h b/sys/contrib/opensolaris/uts/common/sys/sdt.h
deleted file mode 100644
index da695c9..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/sdt.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SDT_H
-#define _SYS_SDT_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _KERNEL
-
-#define DTRACE_PROBE(provider, name) { \
- extern void __dtrace_##provider##___##name(void); \
- __dtrace_##provider##___##name(); \
-}
-
-#define DTRACE_PROBE1(provider, name, arg1) { \
- extern void __dtrace_##provider##___##name(unsigned long); \
- __dtrace_##provider##___##name((unsigned long)arg1); \
-}
-
-#define DTRACE_PROBE2(provider, name, arg1, arg2) { \
- extern void __dtrace_##provider##___##name(unsigned long, \
- unsigned long); \
- __dtrace_##provider##___##name((unsigned long)arg1, \
- (unsigned long)arg2); \
-}
-
-#define DTRACE_PROBE3(provider, name, arg1, arg2, arg3) { \
- extern void __dtrace_##provider##___##name(unsigned long, \
- unsigned long, unsigned long); \
- __dtrace_##provider##___##name((unsigned long)arg1, \
- (unsigned long)arg2, (unsigned long)arg3); \
-}
-
-#define DTRACE_PROBE4(provider, name, arg1, arg2, arg3, arg4) { \
- extern void __dtrace_##provider##___##name(unsigned long, \
- unsigned long, unsigned long, unsigned long); \
- __dtrace_##provider##___##name((unsigned long)arg1, \
- (unsigned long)arg2, (unsigned long)arg3, \
- (unsigned long)arg4); \
-}
-
-#define DTRACE_PROBE5(provider, name, arg1, arg2, arg3, arg4, arg5) { \
- extern void __dtrace_##provider##___##name(unsigned long, \
- unsigned long, unsigned long, unsigned long, unsigned long);\
- __dtrace_##provider##___##name((unsigned long)arg1, \
- (unsigned long)arg2, (unsigned long)arg3, \
- (unsigned long)arg4, (unsigned long)arg5); \
-}
-
-#else /* _KERNEL */
-
-#define DTRACE_PROBE(name) { \
- extern void __dtrace_probe_##name(void); \
- __dtrace_probe_##name(); \
-}
-
-#define DTRACE_PROBE1(name, type1, arg1) { \
- extern void __dtrace_probe_##name(uintptr_t); \
- __dtrace_probe_##name((uintptr_t)(arg1)); \
-}
-
-#define DTRACE_PROBE2(name, type1, arg1, type2, arg2) { \
- extern void __dtrace_probe_##name(uintptr_t, uintptr_t); \
- __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2)); \
-}
-
-#define DTRACE_PROBE3(name, type1, arg1, type2, arg2, type3, arg3) { \
- extern void __dtrace_probe_##name(uintptr_t, uintptr_t, uintptr_t); \
- __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \
- (uintptr_t)(arg3)); \
-}
-
-#define DTRACE_PROBE4(name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4) { \
- extern void __dtrace_probe_##name(uintptr_t, uintptr_t, \
- uintptr_t, uintptr_t); \
- __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \
- (uintptr_t)(arg3), (uintptr_t)(arg4)); \
-}
-
-#define DTRACE_SCHED(name) \
- DTRACE_PROBE(__sched_##name);
-
-#define DTRACE_SCHED1(name, type1, arg1) \
- DTRACE_PROBE1(__sched_##name, type1, arg1);
-
-#define DTRACE_SCHED2(name, type1, arg1, type2, arg2) \
- DTRACE_PROBE2(__sched_##name, type1, arg1, type2, arg2);
-
-#define DTRACE_SCHED3(name, type1, arg1, type2, arg2, type3, arg3) \
- DTRACE_PROBE3(__sched_##name, type1, arg1, type2, arg2, type3, arg3);
-
-#define DTRACE_SCHED4(name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4) \
- DTRACE_PROBE4(__sched_##name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4);
-
-#define DTRACE_PROC(name) \
- DTRACE_PROBE(__proc_##name);
-
-#define DTRACE_PROC1(name, type1, arg1) \
- DTRACE_PROBE1(__proc_##name, type1, arg1);
-
-#define DTRACE_PROC2(name, type1, arg1, type2, arg2) \
- DTRACE_PROBE2(__proc_##name, type1, arg1, type2, arg2);
-
-#define DTRACE_PROC3(name, type1, arg1, type2, arg2, type3, arg3) \
- DTRACE_PROBE3(__proc_##name, type1, arg1, type2, arg2, type3, arg3);
-
-#define DTRACE_PROC4(name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4) \
- DTRACE_PROBE4(__proc_##name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4);
-
-#define DTRACE_IO(name) \
- DTRACE_PROBE(__io_##name);
-
-#define DTRACE_IO1(name, type1, arg1) \
- DTRACE_PROBE1(__io_##name, type1, arg1);
-
-#define DTRACE_IO2(name, type1, arg1, type2, arg2) \
- DTRACE_PROBE2(__io_##name, type1, arg1, type2, arg2);
-
-#define DTRACE_IO3(name, type1, arg1, type2, arg2, type3, arg3) \
- DTRACE_PROBE3(__io_##name, type1, arg1, type2, arg2, type3, arg3);
-
-#define DTRACE_IO4(name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4) \
- DTRACE_PROBE4(__io_##name, type1, arg1, type2, arg2, \
- type3, arg3, type4, arg4);
-
-#define DTRACE_SYSEVENT2(name, type1, arg1, type2, arg2) \
- DTRACE_PROBE2(__sysevent_##name, type1, arg1, type2, arg2);
-
-#endif /* _KERNEL */
-
-extern const char *sdt_prefix;
-
-typedef struct sdt_probedesc {
- char *sdpd_name; /* name of this probe */
- unsigned long sdpd_offset; /* offset of call in text */
- struct sdt_probedesc *sdpd_next; /* next static probe */
-} sdt_probedesc_t;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SDT_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/synch.h b/sys/contrib/opensolaris/uts/common/sys/synch.h
deleted file mode 100644
index 8f52d72..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/synch.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SYNCH_H
-#define _SYS_SYNCH_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifndef _ASM
-#include <sys/types.h>
-#include <sys/int_types.h>
-#endif /* _ASM */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef _ASM
-/*
- * Thread and LWP mutexes have the same type
- * definitions.
- *
- * NOTE:
- *
- * POSIX requires that <pthread.h> define the structures pthread_mutex_t
- * and pthread_cond_t. Although these structures are identical to mutex_t
- * (lwp_mutex_t) and cond_t (lwp_cond_t), defined here, a typedef of these
- * types would require including <synch.h> in <pthread.h>, pulling in
- * non-posix symbols/constants, violating POSIX namespace restrictions. Hence,
- * pthread_mutex_t/pthread_cond_t have been redefined (in <sys/types.h>).
- * Any modifications done to mutex_t/lwp_mutex_t or cond_t/lwp_cond_t must
- * also be done to pthread_mutex_t/pthread_cond_t.
- */
-typedef struct _lwp_mutex {
- struct {
- uint16_t flag1;
- uint8_t flag2;
- uint8_t ceiling;
- union {
- uint16_t bcptype;
- struct {
- uint8_t count_type1;
- uint8_t count_type2;
- } mtype_rcount;
- } mbcp_type_un;
- uint16_t magic;
- } flags;
- union {
- struct {
- uint8_t pad[8];
- } lock64;
- struct {
- uint32_t ownerpid;
- uint32_t lockword;
- } lock32;
- upad64_t owner64;
- } lock;
- upad64_t data;
-} lwp_mutex_t;
-
-/*
- * Thread and LWP condition variables have the same
- * type definition.
- * NOTE:
- * The layout of the following structure should be kept in sync with the
- * layout of pthread_cond_t in sys/types.h. See NOTE above for lwp_mutex_t.
- */
-typedef struct _lwp_cond {
- struct {
- uint8_t flag[4];
- uint16_t type;
- uint16_t magic;
- } flags;
- upad64_t data;
-} lwp_cond_t;
-
-/*
- * LWP semaphores
- */
-typedef struct _lwp_sema {
- uint32_t count; /* semaphore count */
- uint16_t type;
- uint16_t magic;
- uint8_t flags[8]; /* last byte reserved for waiters */
- upad64_t data; /* optional data */
-} lwp_sema_t;
-
-/*
- * Thread and LWP rwlocks have the same type definition.
- * NOTE: The layout of this structure should be kept in sync with the layout
- * of the corresponding structure of pthread_rwlock_t in sys/types.h.
- * Also, because we have to deal with C++, there is an identical structure
- * for rwlock_t in head/sync.h that we cannot change.
- */
-typedef struct _lwp_rwlock {
- int32_t readers; /* -1 == writer else # of readers */
- uint16_t type;
- uint16_t magic;
- lwp_mutex_t mutex; /* used to indicate ownership */
- lwp_cond_t readercv; /* unused */
- lwp_cond_t writercv; /* unused */
-} lwp_rwlock_t;
-
-#endif /* _ASM */
-/*
- * Definitions of synchronization types.
- */
-#define USYNC_THREAD 0x00 /* private to a process */
-#define USYNC_PROCESS 0x01 /* shared by processes */
-
-/* Keep the following 3 fields in sync with pthread.h */
-#define LOCK_NORMAL 0x00 /* same as USYNC_THREAD */
-#define LOCK_ERRORCHECK 0x02 /* error check lock */
-#define LOCK_RECURSIVE 0x04 /* recursive lock */
-
-#define USYNC_PROCESS_ROBUST 0x08 /* shared by processes robustly */
-
-/* Keep the following 5 fields in sync with pthread.h */
-
-#define LOCK_PRIO_NONE 0x00
-#define LOCK_PRIO_INHERIT 0x10
-#define LOCK_PRIO_PROTECT 0x20
-#define LOCK_STALL_NP 0x00
-#define LOCK_ROBUST_NP 0x40
-
-/*
- * lwp_mutex_t flags
- */
-#define LOCK_OWNERDEAD 0x1
-#define LOCK_NOTRECOVERABLE 0x2
-#define LOCK_INITED 0x4
-#define LOCK_UNMAPPED 0x8
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SYNCH_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/sysevent.h b/sys/contrib/opensolaris/uts/common/sys/sysevent.h
deleted file mode 100644
index 0a61e41..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/sysevent.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SYSEVENT_H
-#define _SYS_SYSEVENT_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/nvpair.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef NULL
-#if defined(_LP64) && !defined(__cplusplus)
-#define NULL 0L
-#else
-#define NULL 0
-#endif
-#endif
-
-/* Internal registration class and subclass */
-#define EC_ALL "register_all_classes"
-#define EC_SUB_ALL "register_all_subclasses"
-
-/*
- * Event allocation/enqueuing sleep/nosleep flags
- */
-#define SE_SLEEP 0
-#define SE_NOSLEEP 1
-
-/* Framework error codes */
-#define SE_EINVAL 1 /* Invalid argument */
-#define SE_ENOMEM 2 /* Unable to allocate memory */
-#define SE_EQSIZE 3 /* Maximum event q size exceeded */
-#define SE_EFAULT 4 /* Copy fault */
-#define SE_NOTFOUND 5 /* Attribute not found */
-#define SE_NO_TRANSPORT 6 /* sysevent transport down */
-
-/* Internal data types */
-
-#define SE_DATA_TYPE_BYTE DATA_TYPE_BYTE
-#define SE_DATA_TYPE_INT16 DATA_TYPE_INT16
-#define SE_DATA_TYPE_UINT16 DATA_TYPE_UINT16
-#define SE_DATA_TYPE_INT32 DATA_TYPE_INT32
-#define SE_DATA_TYPE_UINT32 DATA_TYPE_UINT32
-#define SE_DATA_TYPE_INT64 DATA_TYPE_INT64
-#define SE_DATA_TYPE_UINT64 DATA_TYPE_UINT64
-#define SE_DATA_TYPE_STRING DATA_TYPE_STRING
-#define SE_DATA_TYPE_BYTES DATA_TYPE_BYTE_ARRAY
-#define SE_DATA_TYPE_TIME DATA_TYPE_HRTIME
-
-#define SE_KERN_PID 0
-
-#define SUNW_VENDOR "SUNW"
-#define SE_USR_PUB "usr:"
-#define SE_KERN_PUB "kern:"
-#define SUNW_KERN_PUB SUNW_VENDOR":"SE_KERN_PUB
-#define SUNW_USR_PUB SUNW_VENDOR":"SE_USR_PUB
-
-/*
- * Event header and attribute value limits
- */
-#define MAX_ATTR_NAME 1024
-#define MAX_STRING_SZ 1024
-#define MAX_BYTE_ARRAY 1024
-
-#define MAX_CLASS_LEN 64
-#define MAX_SUBCLASS_LEN 64
-#define MAX_PUB_LEN 128
-#define MAX_CHNAME_LEN 128
-#define MAX_SUBID_LEN 16
-
-/*
- * Limit for the event payload size
- */
-#define MAX_EV_SIZE_LEN (SHRT_MAX/4)
-
-/* Opaque sysevent_t data type */
-typedef void *sysevent_t;
-
-/* Opaque channel bind data type */
-typedef void evchan_t;
-
-/* sysevent attribute list */
-typedef nvlist_t sysevent_attr_list_t;
-
-/* sysevent attribute name-value pair */
-typedef nvpair_t sysevent_attr_t;
-
-/* Unique event identifier */
-typedef struct sysevent_id {
- uint64_t eid_seq;
- hrtime_t eid_ts;
-} sysevent_id_t;
-
-/* Event attribute value structures */
-typedef struct sysevent_bytes {
- int32_t size;
- uchar_t *data;
-} sysevent_bytes_t;
-
-typedef struct sysevent_value {
- int32_t value_type; /* data type */
- union {
- uchar_t sv_byte;
- int16_t sv_int16;
- uint16_t sv_uint16;
- int32_t sv_int32;
- uint32_t sv_uint32;
- int64_t sv_int64;
- uint64_t sv_uint64;
- hrtime_t sv_time;
- char *sv_string;
- sysevent_bytes_t sv_bytes;
- } value;
-} sysevent_value_t;
-
-/*
- * The following flags determine the memory allocation semantics to use for
- * kernel event buffer allocation by userland and kernel versions of
- * sysevent_evc_publish().
- *
- * EVCH_SLEEP and EVCH_NOSLEEP respectively map to KM_SLEEP and KM_NOSLEEP.
- * EVCH_TRYHARD is a kernel-only publish flag that allows event allocation
- * routines to use alternate kmem caches in situations where free memory
- * may be low. Kernel callers of sysevent_evc_publish() must set flags to
- * one of EVCH_SLEEP, EVCH_NOSLEEP or EVCH_TRYHARD. Userland callers of
- * sysevent_evc_publish() must set flags to one of EVCH_SLEEP or EVCH_NOSLEEP.
- *
- * EVCH_QWAIT determines whether or not we should wait for slots in the event
- * queue at publication time. EVCH_QWAIT may be used by kernel and userland
- * publishers and must be used in conjunction with one of EVCH_SLEEP,
- * EVCH_NOSLEEP or EVCH_TRYHARD (kernel-only).
- */
-
-#define EVCH_NOSLEEP 0x0001 /* No sleep on kmem_alloc() */
-#define EVCH_SLEEP 0x0002 /* Sleep on kmem_alloc() */
-#define EVCH_TRYHARD 0x0004 /* May use alternate kmem cache for alloc */
-#define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */
-
-/*
- * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to
- * the consolidation private interface.
- */
-#define EVCH_SUB_KEEP 0x0001
-#define EVCH_ALLSUB "all_subs"
-
-/*
- * Meaning of flags parameter of channel bind function
- */
-#define EVCH_CREAT 0x0001 /* Create a channel if not present */
-#define EVCH_HOLD_PEND 0x0002
-#define EVCH_B_FLAGS 0x0003 /* All valid bits */
-
-/*
- * Meaning of commands of evc_control function
- */
-#define EVCH_GET_CHAN_LEN_MAX 1 /* Get event queue length limit */
-#define EVCH_GET_CHAN_LEN 2 /* Get event queue length */
-#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */
-#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */
-
-/*
- * Event channel interface definitions
- */
-int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
-void sysevent_evc_unbind(evchan_t *);
-int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
- int (*)(sysevent_t *, void *), void *, uint32_t);
-void sysevent_evc_unsubscribe(evchan_t *, const char *);
-int sysevent_evc_publish(evchan_t *, const char *, const char *,
- const char *, const char *, nvlist_t *, uint32_t);
-int sysevent_evc_control(evchan_t *, int, ...);
-
-#ifdef _KERNEL
-
-/*
- * Kernel log_event interfaces.
- */
-int log_sysevent(sysevent_t *, int, sysevent_id_t *);
-
-sysevent_t *sysevent_alloc(char *, char *, char *, int);
-void sysevent_free(sysevent_t *);
-int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int);
-void sysevent_free_attr(sysevent_attr_list_t *);
-int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
-void sysevent_detach_attributes(sysevent_t *);
-char *sysevent_get_class_name(sysevent_t *);
-char *sysevent_get_subclass_name(sysevent_t *);
-uint64_t sysevent_get_seq(sysevent_t *);
-void sysevent_get_time(sysevent_t *, hrtime_t *);
-size_t sysevent_get_size(sysevent_t *);
-char *sysevent_get_pub(sysevent_t *);
-int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SYSEVENT_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/sysmacros.h b/sys/contrib/opensolaris/uts/common/sys/sysmacros.h
deleted file mode 100644
index 9f16a07..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/sysmacros.h
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
-
-
-/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_SYSMACROS_H
-#define _SYS_SYSMACROS_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/param.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Some macros for units conversion
- */
-/*
- * Disk blocks (sectors) and bytes.
- */
-#define dtob(DD) ((DD) << DEV_BSHIFT)
-#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
-#define btodt(BB) ((BB) >> DEV_BSHIFT)
-#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
-
-/* common macros */
-#ifndef MIN
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#endif
-#ifndef MAX
-#define MAX(a, b) ((a) < (b) ? (b) : (a))
-#endif
-#ifndef ABS
-#define ABS(a) ((a) < 0 ? -(a) : (a))
-#endif
-
-#ifdef _KERNEL
-
-/*
- * Convert a single byte to/from binary-coded decimal (BCD).
- */
-extern unsigned char byte_to_bcd[256];
-extern unsigned char bcd_to_byte[256];
-
-#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff]
-#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff]
-
-#endif /* _KERNEL */
-
-/*
- * WARNING: The device number macros defined here should not be used by device
- * drivers or user software. Device drivers should use the device functions
- * defined in the DDI/DKI interface (see also ddi.h). Application software
- * should make use of the library routines available in makedev(3). A set of
- * new device macros are provided to operate on the expanded device number
- * format supported in SVR4. Macro versions of the DDI device functions are
- * provided for use by kernel proper routines only. Macro routines bmajor(),
- * major(), minor(), emajor(), eminor(), and makedev() will be removed or
- * their definitions changed at the next major release following SVR4.
- */
-
-#define O_BITSMAJOR 7 /* # of SVR3 major device bits */
-#define O_BITSMINOR 8 /* # of SVR3 minor device bits */
-#define O_MAXMAJ 0x7f /* SVR3 max major value */
-#define O_MAXMIN 0xff /* SVR3 max minor value */
-
-
-#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */
-#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */
-#define L_MAXMAJ32 0x3fff /* SVR4 max major value */
-#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. */
- /* For 3b2 hardware devices the minor is */
- /* restricted to 256 (0-255) */
-
-#ifdef _LP64
-#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */
-#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */
-#define L_MAXMAJ 0xfffffffful /* max major value */
-#define L_MAXMIN 0xfffffffful /* max minor value */
-#else
-#define L_BITSMAJOR L_BITSMAJOR32
-#define L_BITSMINOR L_BITSMINOR32
-#define L_MAXMAJ L_MAXMAJ32
-#define L_MAXMIN L_MAXMIN32
-#endif
-
-#if defined(sun)
-#ifdef _KERNEL
-
-/* major part of a device internal to the kernel */
-
-#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
-#define bmajor(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
-
-/* get internal major part of expanded device number */
-
-#define getmajor(x) (major_t)((((dev_t)(x)) >> L_BITSMINOR) & L_MAXMAJ)
-
-/* minor part of a device internal to the kernel */
-
-#define minor(x) (minor_t)((x) & O_MAXMIN)
-
-/* get internal minor part of expanded device number */
-
-#define getminor(x) (minor_t)((x) & L_MAXMIN)
-
-#else
-
-/* major part of a device external from the kernel (same as emajor below) */
-
-#define major(x) (major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
-
-/* minor part of a device external from the kernel (same as eminor below) */
-
-#define minor(x) (minor_t)((x) & O_MAXMIN)
-
-#endif /* _KERNEL */
-
-/* create old device number */
-
-#define makedev(x, y) (unsigned short)(((x) << O_BITSMINOR) | ((y) & O_MAXMIN))
-
-/* make an new device number */
-
-#define makedevice(x, y) (dev_t)(((dev_t)(x) << L_BITSMINOR) | ((y) & L_MAXMIN))
-
-
-/*
- * emajor() allows kernel/driver code to print external major numbers
- * eminor() allows kernel/driver code to print external minor numbers
- */
-
-#define emajor(x) \
- (major_t)(((unsigned int)(x) >> O_BITSMINOR) > O_MAXMAJ) ? \
- NODEV : (((unsigned int)(x) >> O_BITSMINOR) & O_MAXMAJ)
-
-#define eminor(x) \
- (minor_t)((x) & O_MAXMIN)
-
-/*
- * get external major and minor device
- * components from expanded device number
- */
-#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
- NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
-#define geteminor(x) (minor_t)((x) & L_MAXMIN)
-
-#endif /* sun */
-
-/*
- * These are versions of the kernel routines for compressing and
- * expanding long device numbers that don't return errors.
- */
-#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR)
-
-#define DEVCMPL(x) (x)
-#define DEVEXPL(x) (x)
-
-#else
-
-#define DEVCMPL(x) \
- (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \
- ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \
- ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32)))
-
-#define DEVEXPL(x) \
- (((x) == NODEV32) ? NODEV : \
- makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32))
-
-#endif /* L_BITSMAJOR32 ... */
-
-/* convert to old (SVR3.2) dev format */
-
-#define cmpdev(x) \
- (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \
- ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \
- ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN)))
-
-/* convert to new (SVR4) dev format */
-
-#define expdev(x) \
- (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \
- ((x) & O_MAXMIN))
-
-/*
- * Macro for checking power of 2 address alignment.
- */
-#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
-
-/*
- * Macros for counting and rounding.
- */
-#define howmany(x, y) (((x)+((y)-1))/(y))
-#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
-
-/*
- * Macro to determine if value is a power of 2
- */
-#define ISP2(x) (((x) & ((x) - 1)) == 0)
-
-/*
- * Macros for various sorts of alignment and rounding when the alignment
- * is known to be a power of 2.
- */
-#define P2ALIGN(x, align) ((x) & -(align))
-#define P2PHASE(x, align) ((x) & ((align) - 1))
-#define P2NPHASE(x, align) (-(x) & ((align) - 1))
-#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
-#define P2END(x, align) (-(~(x) & -(align)))
-#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
-#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
-/*
- * Determine whether two numbers have the same high-order bit.
- */
-#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y)))
-
-/*
- * Typed version of the P2* macros. These macros should be used to ensure
- * that the result is correctly calculated based on the data type of (x),
- * which is passed in as the last argument, regardless of the data
- * type of the alignment. For example, if (x) is of type uint64_t,
- * and we want to round it up to a page boundary using "PAGESIZE" as
- * the alignment, we can do either
- * P2ROUNDUP(x, (uint64_t)PAGESIZE)
- * or
- * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
- */
-#define P2ALIGN_TYPED(x, align, type) \
- ((type)(x) & -(type)(align))
-#define P2PHASE_TYPED(x, align, type) \
- ((type)(x) & ((type)(align) - 1))
-#define P2NPHASE_TYPED(x, align, type) \
- (-(type)(x) & ((type)(align) - 1))
-#define P2ROUNDUP_TYPED(x, align, type) \
- (-(-(type)(x) & -(type)(align)))
-#define P2END_TYPED(x, align, type) \
- (-(~(type)(x) & -(type)(align)))
-#define P2PHASEUP_TYPED(x, align, phase, type) \
- ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
-#define P2CROSS_TYPED(x, y, align, type) \
- (((type)(x) ^ (type)(y)) > (type)(align) - 1)
-#define P2SAMEHIGHBIT_TYPED(x, y, type) \
- (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
-
-/*
- * Macros to atomically increment/decrement a variable. mutex and var
- * must be pointers.
- */
-#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex)
-#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex)
-
-#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof)
-
-/* avoid any possibility of clashing with <stddef.h> version */
-
-#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_SYSMACROS_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/vmem.h b/sys/contrib/opensolaris/uts/common/sys/vmem.h
deleted file mode 100644
index f0caec6..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/vmem.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_VMEM_H
-#define _SYS_VMEM_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/*
- * Per-allocation flags
- */
-#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */
-#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */
-#define VM_PANIC 0x00000002 /* same as KM_PANIC */
-#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */
-#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */
-
-#define VM_BESTFIT 0x00000100
-#define VM_FIRSTFIT 0x00000200
-#define VM_NEXTFIT 0x00000400
-
-/*
- * The following flags are restricted for use only within the kernel.
- * VM_MEMLOAD is for use by the HAT to avoid infinite recursion.
- * VM_NORELOC is used by the kernel when static VA->PA mappings are required.
- */
-#define VM_MEMLOAD 0x00000800
-#define VM_NORELOC 0x00001000
-/*
- * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags
- * and forgo reaping if the allocation or attempted import, fails. This
- * flag is a segkmem-specific flag, and should not be used by anyone else.
- */
-#define VM_ABORT 0x00002000
-
-#define VM_FLAGS 0x0000FFFF
-
-/*
- * Arena creation flags
- */
-#define VMC_POPULATOR 0x00010000
-#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */
-#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */
-/*
- * internal use only; the import function uses the vmem_ximport_t interface
- * and may increase the request size if it so desires
- */
-#define VMC_XALLOC 0x00080000
-#define VMC_FLAGS 0xFFFF0000
-
-/*
- * Public segment types
- */
-#define VMEM_ALLOC 0x01
-#define VMEM_FREE 0x02
-
-/*
- * Implementation-private segment types
- */
-#define VMEM_SPAN 0x10
-#define VMEM_ROTOR 0x20
-#define VMEM_WALKER 0x40
-
-/*
- * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may
- * call back into the arena being walked, so vmem_walk() must drop the
- * arena lock before each callback. The caveat is that since the arena
- * isn't locked, its state can change. Therefore it is up to the callback
- * routine to handle cases where the segment isn't of the expected type.
- * For example, we use this to walk heap_arena when generating a crash dump;
- * see segkmem_dump() for sample usage.
- */
-#define VMEM_REENTRANT 0x80000000
-
-typedef struct vmem vmem_t;
-typedef void *(vmem_alloc_t)(vmem_t *, size_t, int);
-typedef void (vmem_free_t)(vmem_t *, void *, size_t);
-
-/*
- * Alternate import style; the requested size is passed in a pointer,
- * which can be increased by the import function if desired.
- */
-typedef void *(vmem_ximport_t)(vmem_t *, size_t *, int);
-
-#ifdef _KERNEL
-extern vmem_t *vmem_init(const char *, void *, size_t, size_t,
- vmem_alloc_t *, vmem_free_t *);
-extern void vmem_update(void *);
-extern int vmem_is_populator(void);
-extern size_t vmem_seg_size;
-#endif
-
-extern vmem_t *vmem_create(const char *, void *, size_t, size_t,
- vmem_alloc_t *, vmem_free_t *, vmem_t *, size_t, int);
-extern vmem_t *vmem_xcreate(const char *, void *, size_t, size_t,
- vmem_ximport_t *, vmem_free_t *, vmem_t *, size_t, int);
-extern void vmem_destroy(vmem_t *);
-extern void *vmem_alloc(vmem_t *, size_t, int);
-extern void *vmem_xalloc(vmem_t *, size_t, size_t, size_t, size_t,
- void *, void *, int);
-extern void vmem_free(vmem_t *, void *, size_t);
-extern void vmem_xfree(vmem_t *, void *, size_t);
-extern void *vmem_add(vmem_t *, void *, size_t, int);
-extern int vmem_contains(vmem_t *, void *, size_t);
-extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, size_t), void *);
-extern size_t vmem_size(vmem_t *, int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_VMEM_H */
diff --git a/sys/contrib/opensolaris/uts/common/sys/zmod.h b/sys/contrib/opensolaris/uts/common/sys/zmod.h
deleted file mode 100644
index ba02672..0000000
--- a/sys/contrib/opensolaris/uts/common/sys/zmod.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZMOD_H
-#define _ZMOD_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * zmod - RFC-1950-compatible decompression routines
- *
- * This file provides the public interfaces to zmod, an in-kernel RFC 1950
- * decompression library. More information about the implementation of these
- * interfaces can be found in the usr/src/uts/common/zmod/ directory.
- */
-
-#define Z_OK 0
-#define Z_STREAM_END 1
-#define Z_NEED_DICT 2
-#define Z_ERRNO (-1)
-#define Z_STREAM_ERROR (-2)
-#define Z_DATA_ERROR (-3)
-#define Z_MEM_ERROR (-4)
-#define Z_BUF_ERROR (-5)
-#define Z_VERSION_ERROR (-6)
-
-#define Z_NO_COMPRESSION 0
-#define Z_BEST_SPEED 1
-#define Z_BEST_COMPRESSION 9
-#define Z_DEFAULT_COMPRESSION (-1)
-
-extern int z_uncompress(void *, size_t *, const void *, size_t);
-extern int z_compress(void *, size_t *, const void *, size_t);
-extern int z_compress_level(void *, size_t *, const void *, size_t, int);
-extern const char *z_strerror(int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZMOD_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/adler32.c b/sys/contrib/opensolaris/uts/common/zmod/adler32.c
deleted file mode 100644
index 59d8463..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/adler32.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/* adler32.c -- compute the Adler-32 checksum of a data stream
- * Copyright (C) 1995-2004 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#define ZLIB_INTERNAL
-#include "zlib.h"
-
-#define BASE 65521UL /* largest prime smaller than 65536 */
-#define NMAX 5552
-/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
-
-#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;}
-#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
-#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
-#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
-#define DO16(buf) DO8(buf,0); DO8(buf,8);
-
-/* use NO_DIVIDE if your processor does not do division in hardware */
-#ifdef NO_DIVIDE
-# define MOD(a) \
- do { \
- if (a >= (BASE << 16)) a -= (BASE << 16); \
- if (a >= (BASE << 15)) a -= (BASE << 15); \
- if (a >= (BASE << 14)) a -= (BASE << 14); \
- if (a >= (BASE << 13)) a -= (BASE << 13); \
- if (a >= (BASE << 12)) a -= (BASE << 12); \
- if (a >= (BASE << 11)) a -= (BASE << 11); \
- if (a >= (BASE << 10)) a -= (BASE << 10); \
- if (a >= (BASE << 9)) a -= (BASE << 9); \
- if (a >= (BASE << 8)) a -= (BASE << 8); \
- if (a >= (BASE << 7)) a -= (BASE << 7); \
- if (a >= (BASE << 6)) a -= (BASE << 6); \
- if (a >= (BASE << 5)) a -= (BASE << 5); \
- if (a >= (BASE << 4)) a -= (BASE << 4); \
- if (a >= (BASE << 3)) a -= (BASE << 3); \
- if (a >= (BASE << 2)) a -= (BASE << 2); \
- if (a >= (BASE << 1)) a -= (BASE << 1); \
- if (a >= BASE) a -= BASE; \
- } while (0)
-# define MOD4(a) \
- do { \
- if (a >= (BASE << 4)) a -= (BASE << 4); \
- if (a >= (BASE << 3)) a -= (BASE << 3); \
- if (a >= (BASE << 2)) a -= (BASE << 2); \
- if (a >= (BASE << 1)) a -= (BASE << 1); \
- if (a >= BASE) a -= BASE; \
- } while (0)
-#else
-# define MOD(a) a %= BASE
-# define MOD4(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-uLong ZEXPORT adler32(adler, buf, len)
- uLong adler;
- const Bytef *buf;
- uInt len;
-{
- unsigned long sum2;
- unsigned n;
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (len == 1) {
- adler += buf[0];
- if (adler >= BASE)
- adler -= BASE;
- sum2 += adler;
- if (sum2 >= BASE)
- sum2 -= BASE;
- return adler | (sum2 << 16);
- }
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (buf == Z_NULL)
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (len < 16) {
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- if (adler >= BASE)
- adler -= BASE;
- MOD4(sum2); /* only added so many BASE's */
- return adler | (sum2 << 16);
- }
-
- /* do length NMAX blocks -- requires just one modulo operation */
- while (len >= NMAX) {
- len -= NMAX;
- n = NMAX / 16; /* NMAX is divisible by 16 */
- do {
- DO16(buf); /* 16 sums unrolled */
- buf += 16;
- } while (--n);
- MOD(adler);
- MOD(sum2);
- }
-
- /* do remaining bytes (less than NMAX, still just one modulo) */
- if (len) { /* avoid modulos if none remaining */
- while (len >= 16) {
- len -= 16;
- DO16(buf);
- buf += 16;
- }
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- MOD(adler);
- MOD(sum2);
- }
-
- /* return recombined sums */
- return adler | (sum2 << 16);
-}
-
-/* ========================================================================= */
-uLong ZEXPORT adler32_combine(adler1, adler2, len2)
- uLong adler1;
- uLong adler2;
- z_off_t len2;
-{
- unsigned long sum1;
- unsigned long sum2;
- unsigned rem;
-
- /* the derivation of this formula is left as an exercise for the reader */
- rem = (unsigned)(len2 % BASE);
- sum1 = adler1 & 0xffff;
- sum2 = rem * sum1;
- MOD(sum2);
- sum1 += (adler2 & 0xffff) + BASE - 1;
- sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
- if (sum1 > BASE) sum1 -= BASE;
- if (sum1 > BASE) sum1 -= BASE;
- if (sum2 > (BASE << 1)) sum2 -= (BASE << 1);
- if (sum2 > BASE) sum2 -= BASE;
- return sum1 | (sum2 << 16);
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/crc32.c b/sys/contrib/opensolaris/uts/common/zmod/crc32.c
deleted file mode 100644
index 61ad581..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/crc32.c
+++ /dev/null
@@ -1,428 +0,0 @@
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* crc32.c -- compute the CRC-32 of a data stream
- * Copyright (C) 1995-2005 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
- * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
- * tables for updating the shift register in one step with three exclusive-ors
- * instead of four steps with four exclusive-ors. This results in about a
- * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
- protection on the static variables used to control the first-use generation
- of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
- first call get_crc_table() to initialize the tables before allowing more than
- one thread to use crc32().
- */
-
-#ifdef MAKECRCH
-# include <stdio.h>
-# ifndef DYNAMIC_CRC_TABLE
-# define DYNAMIC_CRC_TABLE
-# endif /* !DYNAMIC_CRC_TABLE */
-#endif /* MAKECRCH */
-
-#include "zutil.h" /* for STDC and FAR definitions */
-
-#define local static
-
-/* Find a four-byte integer type for crc32_little() and crc32_big(). */
-#ifndef NOBYFOUR
-# ifdef STDC /* need ANSI C limits.h to determine sizes */
-# include <limits.h>
-# define BYFOUR
-# if (UINT_MAX == 0xffffffffUL)
- typedef unsigned int u4;
-# else
-# if (ULONG_MAX == 0xffffffffUL)
- typedef unsigned long u4;
-# else
-# if (USHRT_MAX == 0xffffffffUL)
- typedef unsigned short u4;
-# else
-# undef BYFOUR /* can't find a four-byte integer type! */
-# endif
-# endif
-# endif
-# endif /* STDC */
-#endif /* !NOBYFOUR */
-
-/* Definitions for doing the crc four data bytes at a time. */
-#ifdef BYFOUR
-# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
- (((w)&0xff00)<<8)+(((w)&0xff)<<24))
- local unsigned long crc32_little OF((unsigned long,
- const unsigned char FAR *, unsigned));
- local unsigned long crc32_big OF((unsigned long,
- const unsigned char FAR *, unsigned));
-# define TBLS 8
-#else
-# define TBLS 1
-#endif /* BYFOUR */
-
-/* Local functions for crc concatenation */
-local unsigned long gf2_matrix_times OF((unsigned long *mat,
- unsigned long vec));
-local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
-
-#ifdef DYNAMIC_CRC_TABLE
-
-local volatile int crc_table_empty = 1;
-local unsigned long FAR crc_table[TBLS][256];
-local void make_crc_table OF((void));
-#ifdef MAKECRCH
- local void write_table OF((FILE *, const unsigned long FAR *));
-#endif /* MAKECRCH */
-/*
- Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
- x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
-
- Polynomials over GF(2) are represented in binary, one bit per coefficient,
- with the lowest powers in the most significant bit. Then adding polynomials
- is just exclusive-or, and multiplying a polynomial by x is a right shift by
- one. If we call the above polynomial p, and represent a byte as the
- polynomial q, also with the lowest power in the most significant bit (so the
- byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
- where a mod b means the remainder after dividing a by b.
-
- This calculation is done using the shift-register method of multiplying and
- taking the remainder. The register is initialized to zero, and for each
- incoming bit, x^32 is added mod p to the register if the bit is a one (where
- x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
- x (which is shifting right by one and adding x^32 mod p if the bit shifted
- out is a one). We start with the highest power (least significant bit) of
- q and repeat for all eight bits of q.
-
- The first table is simply the CRC of all possible eight bit values. This is
- all the information needed to generate CRCs on data a byte at a time for all
- combinations of CRC register values and incoming bytes. The remaining tables
- allow for word-at-a-time CRC calculation for both big-endian and little-
- endian machines, where a word is four bytes.
-*/
-local void make_crc_table()
-{
- unsigned long c;
- int n, k;
- unsigned long poly; /* polynomial exclusive-or pattern */
- /* terms of polynomial defining this crc (except x^32): */
- static volatile int first = 1; /* flag to limit concurrent making */
- static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
-
- /* See if another task is already doing this (not thread-safe, but better
- than nothing -- significantly reduces duration of vulnerability in
- case the advice about DYNAMIC_CRC_TABLE is ignored) */
- if (first) {
- first = 0;
-
- /* make exclusive-or pattern from polynomial (0xedb88320UL) */
- poly = 0UL;
- for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
- poly |= 1UL << (31 - p[n]);
-
- /* generate a crc for every 8-bit value */
- for (n = 0; n < 256; n++) {
- c = (unsigned long)n;
- for (k = 0; k < 8; k++)
- c = c & 1 ? poly ^ (c >> 1) : c >> 1;
- crc_table[0][n] = c;
- }
-
-#ifdef BYFOUR
- /* generate crc for each value followed by one, two, and three zeros,
- and then the byte reversal of those as well as the first table */
- for (n = 0; n < 256; n++) {
- c = crc_table[0][n];
- crc_table[4][n] = REV(c);
- for (k = 1; k < 4; k++) {
- c = crc_table[0][c & 0xff] ^ (c >> 8);
- crc_table[k][n] = c;
- crc_table[k + 4][n] = REV(c);
- }
- }
-#endif /* BYFOUR */
-
- crc_table_empty = 0;
- }
- else { /* not first */
- /* wait for the other guy to finish (not efficient, but rare) */
- while (crc_table_empty)
- ;
- }
-
-#ifdef MAKECRCH
- /* write out CRC tables to crc32.h */
- {
- FILE *out;
-
- out = fopen("crc32.h", "w");
- if (out == NULL) return;
- fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
- fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
- fprintf(out, "local const unsigned long FAR ");
- fprintf(out, "crc_table[TBLS][256] =\n{\n {\n");
- write_table(out, crc_table[0]);
-# ifdef BYFOUR
- fprintf(out, "#ifdef BYFOUR\n");
- for (k = 1; k < 8; k++) {
- fprintf(out, " },\n {\n");
- write_table(out, crc_table[k]);
- }
- fprintf(out, "#endif\n");
-# endif /* BYFOUR */
- fprintf(out, " }\n};\n");
- fclose(out);
- }
-#endif /* MAKECRCH */
-}
-
-#ifdef MAKECRCH
-local void write_table(out, table)
- FILE *out;
- const unsigned long FAR *table;
-{
- int n;
-
- for (n = 0; n < 256; n++)
- fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n],
- n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
-}
-#endif /* MAKECRCH */
-
-#else /* !DYNAMIC_CRC_TABLE */
-/* ========================================================================
- * Tables of CRC-32s of all single-byte values, made by make_crc_table().
- */
-#include "crc32.h"
-#endif /* DYNAMIC_CRC_TABLE */
-
-/* =========================================================================
- * This function can be used by asm versions of crc32()
- */
-const unsigned long FAR * ZEXPORT get_crc_table()
-{
-#ifdef DYNAMIC_CRC_TABLE
- if (crc_table_empty)
- make_crc_table();
-#endif /* DYNAMIC_CRC_TABLE */
- return (const unsigned long FAR *)crc_table;
-}
-
-/* ========================================================================= */
-#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
-
-/* ========================================================================= */
-unsigned long ZEXPORT crc32(crc, buf, len)
- unsigned long crc;
- const unsigned char FAR *buf;
- unsigned len;
-{
- if (buf == Z_NULL) return 0UL;
-
-#ifdef DYNAMIC_CRC_TABLE
- if (crc_table_empty)
- make_crc_table();
-#endif /* DYNAMIC_CRC_TABLE */
-
-#ifdef BYFOUR
- if (sizeof(void *) == sizeof(ptrdiff_t)) {
- u4 endian;
-
- endian = 1;
- if (*((unsigned char *)(&endian)))
- return crc32_little(crc, buf, len);
- else
- return crc32_big(crc, buf, len);
- }
-#endif /* BYFOUR */
- crc = crc ^ 0xffffffffUL;
- while (len >= 8) {
- DO8;
- len -= 8;
- }
- if (len) do {
- DO1;
- } while (--len);
- return crc ^ 0xffffffffUL;
-}
-
-#ifdef BYFOUR
-
-/* ========================================================================= */
-#define DOLIT4 c ^= *buf4++; \
- c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
- crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
-#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
-
-/* ========================================================================= */
-local unsigned long crc32_little(crc, buf, len)
- unsigned long crc;
- const unsigned char FAR *buf;
- unsigned len;
-{
- register u4 c;
- register const u4 FAR *buf4;
-
- c = (u4)crc;
- c = ~c;
- while (len && ((ptrdiff_t)buf & 3)) {
- c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
- len--;
- }
-
- buf4 = (const u4 FAR *)(const void FAR *)buf;
- while (len >= 32) {
- DOLIT32;
- len -= 32;
- }
- while (len >= 4) {
- DOLIT4;
- len -= 4;
- }
- buf = (const unsigned char FAR *)buf4;
-
- if (len) do {
- c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
- } while (--len);
- c = ~c;
- return (unsigned long)c;
-}
-
-/* ========================================================================= */
-#define DOBIG4 c ^= *++buf4; \
- c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
- crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
-#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
-
-/* ========================================================================= */
-local unsigned long crc32_big(crc, buf, len)
- unsigned long crc;
- const unsigned char FAR *buf;
- unsigned len;
-{
- register u4 c;
- register const u4 FAR *buf4;
-
- c = REV((u4)crc);
- c = ~c;
- while (len && ((ptrdiff_t)buf & 3)) {
- c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
- len--;
- }
-
- buf4 = (const u4 FAR *)(const void FAR *)buf;
- buf4--;
- while (len >= 32) {
- DOBIG32;
- len -= 32;
- }
- while (len >= 4) {
- DOBIG4;
- len -= 4;
- }
- buf4++;
- buf = (const unsigned char FAR *)buf4;
-
- if (len) do {
- c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
- } while (--len);
- c = ~c;
- return (unsigned long)(REV(c));
-}
-
-#endif /* BYFOUR */
-
-#define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */
-
-/* ========================================================================= */
-local unsigned long gf2_matrix_times(mat, vec)
- unsigned long *mat;
- unsigned long vec;
-{
- unsigned long sum;
-
- sum = 0;
- while (vec) {
- if (vec & 1)
- sum ^= *mat;
- vec >>= 1;
- mat++;
- }
- return sum;
-}
-
-/* ========================================================================= */
-local void gf2_matrix_square(square, mat)
- unsigned long *square;
- unsigned long *mat;
-{
- int n;
-
- for (n = 0; n < GF2_DIM; n++)
- square[n] = gf2_matrix_times(mat, mat[n]);
-}
-
-/* ========================================================================= */
-uLong ZEXPORT crc32_combine(crc1, crc2, len2)
- uLong crc1;
- uLong crc2;
- z_off_t len2;
-{
- int n;
- unsigned long row;
- unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */
- unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */
-
- /* degenerate case */
- if (len2 == 0)
- return crc1;
-
- /* put operator for one zero bit in odd */
- odd[0] = 0xedb88320UL; /* CRC-32 polynomial */
- row = 1;
- for (n = 1; n < GF2_DIM; n++) {
- odd[n] = row;
- row <<= 1;
- }
-
- /* put operator for two zero bits in even */
- gf2_matrix_square(even, odd);
-
- /* put operator for four zero bits in odd */
- gf2_matrix_square(odd, even);
-
- /* apply len2 zeros to crc1 (first square will put the operator for one
- zero byte, eight zero bits, in even) */
- do {
- /* apply zeros operator for this bit of len2 */
- gf2_matrix_square(even, odd);
- if (len2 & 1)
- crc1 = gf2_matrix_times(even, crc1);
- len2 >>= 1;
-
- /* if no more bits set, then done */
- if (len2 == 0)
- break;
-
- /* another iteration of the loop with odd and even swapped */
- gf2_matrix_square(odd, even);
- if (len2 & 1)
- crc1 = gf2_matrix_times(odd, crc1);
- len2 >>= 1;
-
- /* if no more bits set, then done */
- } while (len2 != 0);
-
- /* return combined crc */
- crc1 ^= crc2;
- return crc1;
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/crc32.h b/sys/contrib/opensolaris/uts/common/zmod/crc32.h
deleted file mode 100644
index 495c83e..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/crc32.h
+++ /dev/null
@@ -1,443 +0,0 @@
-/* crc32.h -- tables for rapid CRC calculation
- * Generated automatically by crc32.c
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-local const unsigned long FAR crc_table[TBLS][256] =
-{
- {
- 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
- 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
- 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
- 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
- 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
- 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
- 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
- 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
- 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
- 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
- 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
- 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
- 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
- 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
- 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
- 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
- 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
- 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
- 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
- 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
- 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
- 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
- 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
- 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
- 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
- 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
- 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
- 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
- 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
- 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
- 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
- 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
- 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
- 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
- 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
- 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
- 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
- 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
- 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
- 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
- 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
- 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
- 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
- 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
- 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
- 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
- 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
- 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
- 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
- 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
- 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
- 0x2d02ef8dUL
-#ifdef BYFOUR
- },
- {
- 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
- 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
- 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
- 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
- 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
- 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
- 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
- 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
- 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
- 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
- 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
- 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
- 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
- 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
- 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
- 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
- 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
- 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
- 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
- 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
- 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
- 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
- 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
- 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
- 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
- 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
- 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
- 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
- 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
- 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
- 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
- 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
- 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
- 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
- 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
- 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
- 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
- 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
- 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
- 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
- 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
- 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
- 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
- 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
- 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
- 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
- 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
- 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
- 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
- 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
- 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
- 0x9324fd72UL
- },
- {
- 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
- 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
- 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
- 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
- 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
- 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
- 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
- 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
- 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
- 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
- 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
- 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
- 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
- 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
- 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
- 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
- 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
- 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
- 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
- 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
- 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
- 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
- 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
- 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
- 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
- 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
- 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
- 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
- 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
- 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
- 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
- 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
- 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
- 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
- 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
- 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
- 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
- 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
- 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
- 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
- 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
- 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
- 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
- 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
- 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
- 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
- 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
- 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
- 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
- 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
- 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
- 0xbe9834edUL
- },
- {
- 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
- 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
- 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
- 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
- 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
- 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
- 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
- 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
- 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
- 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
- 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
- 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
- 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
- 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
- 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
- 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
- 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
- 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
- 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
- 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
- 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
- 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
- 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
- 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
- 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
- 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
- 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
- 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
- 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
- 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
- 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
- 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
- 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
- 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
- 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
- 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
- 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
- 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
- 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
- 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
- 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
- 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
- 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
- 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
- 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
- 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
- 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
- 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
- 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
- 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
- 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
- 0xde0506f1UL
- },
- {
- 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
- 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
- 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
- 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
- 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
- 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
- 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
- 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
- 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
- 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
- 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
- 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
- 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
- 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
- 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
- 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
- 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
- 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
- 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
- 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
- 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
- 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
- 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
- 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
- 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
- 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
- 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
- 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
- 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
- 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
- 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
- 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
- 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
- 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
- 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
- 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
- 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
- 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
- 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
- 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
- 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
- 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
- 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
- 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
- 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
- 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
- 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
- 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
- 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
- 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
- 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
- 0x8def022dUL
- },
- {
- 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
- 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
- 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
- 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
- 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
- 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
- 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
- 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
- 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
- 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
- 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
- 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
- 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
- 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
- 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
- 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
- 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
- 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
- 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
- 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
- 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
- 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
- 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
- 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
- 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
- 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
- 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
- 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
- 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
- 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
- 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
- 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
- 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
- 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
- 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
- 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
- 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
- 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
- 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
- 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
- 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
- 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
- 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
- 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
- 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
- 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
- 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
- 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
- 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
- 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
- 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
- 0x72fd2493UL
- },
- {
- 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
- 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
- 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
- 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
- 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
- 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
- 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
- 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
- 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
- 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
- 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
- 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
- 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
- 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
- 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
- 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
- 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
- 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
- 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
- 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
- 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
- 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
- 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
- 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
- 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
- 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
- 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
- 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
- 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
- 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
- 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
- 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
- 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
- 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
- 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
- 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
- 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
- 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
- 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
- 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
- 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
- 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
- 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
- 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
- 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
- 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
- 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
- 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
- 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
- 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
- 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
- 0xed3498beUL
- },
- {
- 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
- 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
- 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
- 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
- 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
- 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
- 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
- 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
- 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
- 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
- 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
- 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
- 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
- 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
- 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
- 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
- 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
- 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
- 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
- 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
- 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
- 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
- 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
- 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
- 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
- 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
- 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
- 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
- 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
- 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
- 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
- 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
- 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
- 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
- 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
- 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
- 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
- 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
- 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
- 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
- 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
- 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
- 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
- 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
- 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
- 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
- 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
- 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
- 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
- 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
- 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
- 0xf10605deUL
-#endif
- }
-};
diff --git a/sys/contrib/opensolaris/uts/common/zmod/deflate.c b/sys/contrib/opensolaris/uts/common/zmod/deflate.c
deleted file mode 100644
index 7847e40..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/deflate.c
+++ /dev/null
@@ -1,1742 +0,0 @@
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* deflate.c -- compress data using the deflation algorithm
- * Copyright (C) 1995-2005 Jean-loup Gailly.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ALGORITHM
- *
- * The "deflation" process depends on being able to identify portions
- * of the input text which are identical to earlier input (within a
- * sliding window trailing behind the input currently being processed).
- *
- * The most straightforward technique turns out to be the fastest for
- * most input files: try all possible matches and select the longest.
- * The key feature of this algorithm is that insertions into the string
- * dictionary are very simple and thus fast, and deletions are avoided
- * completely. Insertions are performed at each input character, whereas
- * string matches are performed only when the previous match ends. So it
- * is preferable to spend more time in matches to allow very fast string
- * insertions and avoid deletions. The matching algorithm for small
- * strings is inspired from that of Rabin & Karp. A brute force approach
- * is used to find longer strings when a small match has been found.
- * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
- * (by Leonid Broukhis).
- * A previous version of this file used a more sophisticated algorithm
- * (by Fiala and Greene) which is guaranteed to run in linear amortized
- * time, but has a larger average cost, uses more memory and is patented.
- * However the F&G algorithm may be faster for some highly redundant
- * files if the parameter max_chain_length (described below) is too large.
- *
- * ACKNOWLEDGEMENTS
- *
- * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
- * I found it in 'freeze' written by Leonid Broukhis.
- * Thanks to many people for bug reports and testing.
- *
- * REFERENCES
- *
- * Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
- * Available in http://www.ietf.org/rfc/rfc1951.txt
- *
- * A description of the Rabin and Karp algorithm is given in the book
- * "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
- *
- * Fiala,E.R., and Greene,D.H.
- * Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595
- *
- */
-
-#include "deflate.h"
-
-static const char deflate_copyright[] =
- " deflate 1.2.3 Copyright 1995-2005 Jean-loup Gailly ";
-/*
- If you use the zlib library in a product, an acknowledgment is welcome
- in the documentation of your product. If for some reason you cannot
- include such an acknowledgment, I would appreciate that you keep this
- copyright string in the executable of your product.
- */
-
-/* ===========================================================================
- * Function prototypes.
- */
-typedef enum {
- need_more, /* block not completed, need more input or more output */
- block_done, /* block flush performed */
- finish_started, /* finish started, need only more output at next deflate */
- finish_done /* finish done, accept no more input or output */
-} block_state;
-
-typedef block_state (*compress_func) OF((deflate_state *s, int flush));
-/* Compression function. Returns the block state after the call. */
-
-local void fill_window OF((deflate_state *s));
-local block_state deflate_stored OF((deflate_state *s, int flush));
-local block_state deflate_fast OF((deflate_state *s, int flush));
-#ifndef FASTEST
-local block_state deflate_slow OF((deflate_state *s, int flush));
-#endif
-local void lm_init OF((deflate_state *s));
-local void putShortMSB OF((deflate_state *s, uInt b));
-local void flush_pending OF((z_streamp strm));
-local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
-#ifndef FASTEST
-#ifdef ASMV
- void match_init OF((void)); /* asm code initialization */
- uInt longest_match OF((deflate_state *s, IPos cur_match));
-#else
-local uInt longest_match OF((deflate_state *s, IPos cur_match));
-#endif
-#endif
-local uInt longest_match_fast OF((deflate_state *s, IPos cur_match));
-
-#ifdef DEBUG
-local void check_match OF((deflate_state *s, IPos start, IPos match,
- int length));
-#endif
-
-/* ===========================================================================
- * Local data
- */
-
-#define NIL 0
-/* Tail of hash chains */
-
-#ifndef TOO_FAR
-# define TOO_FAR 4096
-#endif
-/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
-
-#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
-/* Minimum amount of lookahead, except at the end of the input file.
- * See deflate.c for comments about the MIN_MATCH+1.
- */
-
-/* Values for max_lazy_match, good_match and max_chain_length, depending on
- * the desired pack level (0..9). The values given below have been tuned to
- * exclude worst case performance for pathological files. Better values may be
- * found for specific files.
- */
-typedef struct config_s {
- ush good_length; /* reduce lazy search above this match length */
- ush max_lazy; /* do not perform lazy search above this match length */
- ush nice_length; /* quit search above this match length */
- ush max_chain;
- compress_func func;
-} config;
-
-#ifdef FASTEST
-local const config configuration_table[2] = {
-/* good lazy nice chain */
-/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
-/* 1 */ {4, 4, 8, 4, deflate_fast}}; /* max speed, no lazy matches */
-#else
-local const config configuration_table[10] = {
-/* good lazy nice chain */
-/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
-/* 1 */ {4, 4, 8, 4, deflate_fast}, /* max speed, no lazy matches */
-/* 2 */ {4, 5, 16, 8, deflate_fast},
-/* 3 */ {4, 6, 32, 32, deflate_fast},
-
-/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */
-/* 5 */ {8, 16, 32, 32, deflate_slow},
-/* 6 */ {8, 16, 128, 128, deflate_slow},
-/* 7 */ {8, 32, 128, 256, deflate_slow},
-/* 8 */ {32, 128, 258, 1024, deflate_slow},
-/* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* max compression */
-#endif
-
-/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4
- * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
- * meaning.
- */
-
-#define EQUAL 0
-/* result of memcmp for equal strings */
-
-#ifndef NO_DUMMY_DECL
-struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
-#endif
-
-/* ===========================================================================
- * Update a hash value with the given input byte
- * IN assertion: all calls to to UPDATE_HASH are made with consecutive
- * input characters, so that a running hash key can be computed from the
- * previous key instead of complete recalculation each time.
- */
-#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
-
-
-/* ===========================================================================
- * Insert string str in the dictionary and set match_head to the previous head
- * of the hash chain (the most recent string with same hash key). Return
- * the previous length of the hash chain.
- * If this file is compiled with -DFASTEST, the compression level is forced
- * to 1, and no hash chains are maintained.
- * IN assertion: all calls to to INSERT_STRING are made with consecutive
- * input characters and the first MIN_MATCH bytes of str are valid
- * (except for the last MIN_MATCH-1 bytes of the input file).
- */
-#ifdef FASTEST
-#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
- match_head = s->head[s->ins_h], \
- s->head[s->ins_h] = (Pos)(str))
-#else
-#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
- match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
- s->head[s->ins_h] = (Pos)(str))
-#endif
-
-/* ===========================================================================
- * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
- * prev[] will be initialized on the fly.
- */
-#define CLEAR_HASH(s) \
- s->head[s->hash_size-1] = NIL; \
- (void) zmemzero((Bytef *)s->head, \
- (unsigned)(s->hash_size-1)*sizeof(*s->head));
-
-/* ========================================================================= */
-int ZEXPORT deflateInit_(strm, level, version, stream_size)
- z_streamp strm;
- int level;
- const char *version;
- int stream_size;
-{
- return deflateInit2_(strm, level, Z_DEFLATED, MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY, version, stream_size);
- /* To do: ignore strm->next_in if we use it as window */
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
- version, stream_size)
- z_streamp strm;
- int level;
- int method;
- int windowBits;
- int memLevel;
- int strategy;
- const char *version;
- int stream_size;
-{
- deflate_state *s;
- int wrap = 1;
- static const char my_version[] = ZLIB_VERSION;
-
- ushf *overlay;
- /* We overlay pending_buf and d_buf+l_buf. This works since the average
- * output size for (length,distance) codes is <= 24 bits.
- */
-
- if (version == Z_NULL || version[0] != my_version[0] ||
- stream_size != sizeof(z_stream)) {
- return Z_VERSION_ERROR;
- }
- if (strm == Z_NULL) return Z_STREAM_ERROR;
-
- strm->msg = Z_NULL;
- if (strm->zalloc == (alloc_func)0) {
- strm->zalloc = zcalloc;
- strm->opaque = (voidpf)0;
- }
- if (strm->zfree == (free_func)0) strm->zfree = zcfree;
-
-#ifdef FASTEST
- if (level != 0) level = 1;
-#else
- if (level == Z_DEFAULT_COMPRESSION) level = 6;
-#endif
-
- if (windowBits < 0) { /* suppress zlib wrapper */
- wrap = 0;
- windowBits = -windowBits;
- }
-#ifdef GZIP
- else if (windowBits > 15) {
- wrap = 2; /* write gzip wrapper instead */
- windowBits -= 16;
- }
-#endif
- if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED ||
- windowBits < 8 || windowBits > 15 || level < 0 || level > 9 ||
- strategy < 0 || strategy > Z_FIXED) {
- return Z_STREAM_ERROR;
- }
- if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */
- s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state));
- if (s == Z_NULL) return Z_MEM_ERROR;
- strm->state = (struct internal_state FAR *)s;
- s->strm = strm;
-
- s->wrap = wrap;
- s->gzhead = Z_NULL;
- s->w_bits = windowBits;
- s->w_size = 1 << s->w_bits;
- s->w_mask = s->w_size - 1;
-
- s->hash_bits = memLevel + 7;
- s->hash_size = 1 << s->hash_bits;
- s->hash_mask = s->hash_size - 1;
- s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
-
- s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
- s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
- s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
-
- s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
-
- overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2);
- s->pending_buf = (uchf *) overlay;
- s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L);
-
- if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
- s->pending_buf == Z_NULL) {
- s->status = FINISH_STATE;
- strm->msg = (char*)ERR_MSG(Z_MEM_ERROR);
- (void) deflateEnd (strm);
- return Z_MEM_ERROR;
- }
- s->d_buf = overlay + s->lit_bufsize/sizeof(ush);
- s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize;
-
- s->level = level;
- s->strategy = strategy;
- s->method = (Byte)method;
-
- return deflateReset(strm);
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
- z_streamp strm;
- const Bytef *dictionary;
- uInt dictLength;
-{
- deflate_state *s;
- uInt length = dictLength;
- uInt n;
- IPos hash_head = 0;
-
- if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL ||
- strm->state->wrap == 2 ||
- (strm->state->wrap == 1 && strm->state->status != INIT_STATE))
- return Z_STREAM_ERROR;
-
- s = strm->state;
- if (s->wrap)
- strm->adler = adler32(strm->adler, dictionary, dictLength);
-
- if (length < MIN_MATCH) return Z_OK;
- if (length > MAX_DIST(s)) {
- length = MAX_DIST(s);
- dictionary += dictLength - length; /* use the tail of the dictionary */
- }
- (void) zmemcpy(s->window, dictionary, length);
- s->strstart = length;
- s->block_start = (long)length;
-
- /* Insert all strings in the hash table (except for the last two bytes).
- * s->lookahead stays null, so s->ins_h will be recomputed at the next
- * call of fill_window.
- */
- s->ins_h = s->window[0];
- UPDATE_HASH(s, s->ins_h, s->window[1]);
- for (n = 0; n <= length - MIN_MATCH; n++) {
- INSERT_STRING(s, n, hash_head);
- }
- if (hash_head) hash_head = 0; /* to make compiler happy */
- return Z_OK;
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateReset (strm)
- z_streamp strm;
-{
- deflate_state *s;
-
- if (strm == Z_NULL || strm->state == Z_NULL ||
- strm->zalloc == (alloc_func)0 || strm->zfree == (free_func)0) {
- return Z_STREAM_ERROR;
- }
-
- strm->total_in = strm->total_out = 0;
- strm->msg = Z_NULL; /* use zfree if we ever allocate msg dynamically */
- strm->data_type = Z_UNKNOWN;
-
- s = (deflate_state *)strm->state;
- s->pending = 0;
- s->pending_out = s->pending_buf;
-
- if (s->wrap < 0) {
- s->wrap = -s->wrap; /* was made negative by deflate(..., Z_FINISH); */
- }
- s->status = s->wrap ? INIT_STATE : BUSY_STATE;
- strm->adler =
-#ifdef GZIP
- s->wrap == 2 ? crc32(0L, Z_NULL, 0) :
-#endif
- adler32(0L, Z_NULL, 0);
- s->last_flush = Z_NO_FLUSH;
-
- _tr_init(s);
- lm_init(s);
-
- return Z_OK;
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateSetHeader (strm, head)
- z_streamp strm;
- gz_headerp head;
-{
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- if (strm->state->wrap != 2) return Z_STREAM_ERROR;
- strm->state->gzhead = head;
- return Z_OK;
-}
-
-/* ========================================================================= */
-int ZEXPORT deflatePrime (strm, bits, value)
- z_streamp strm;
- int bits;
- int value;
-{
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- strm->state->bi_valid = bits;
- strm->state->bi_buf = (ush)(value & ((1 << bits) - 1));
- return Z_OK;
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateParams(strm, level, strategy)
- z_streamp strm;
- int level;
- int strategy;
-{
- deflate_state *s;
- compress_func func;
- int err = Z_OK;
-
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- s = strm->state;
-
-#ifdef FASTEST
- if (level != 0) level = 1;
-#else
- if (level == Z_DEFAULT_COMPRESSION) level = 6;
-#endif
- if (level < 0 || level > 9 || strategy < 0 || strategy > Z_FIXED) {
- return Z_STREAM_ERROR;
- }
- func = configuration_table[s->level].func;
-
- if (func != configuration_table[level].func && strm->total_in != 0) {
- /* Flush the last buffer: */
- err = deflate(strm, Z_PARTIAL_FLUSH);
- }
- if (s->level != level) {
- s->level = level;
- s->max_lazy_match = configuration_table[level].max_lazy;
- s->good_match = configuration_table[level].good_length;
- s->nice_match = configuration_table[level].nice_length;
- s->max_chain_length = configuration_table[level].max_chain;
- }
- s->strategy = strategy;
- return err;
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
- z_streamp strm;
- int good_length;
- int max_lazy;
- int nice_length;
- int max_chain;
-{
- deflate_state *s;
-
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- s = strm->state;
- s->good_match = good_length;
- s->max_lazy_match = max_lazy;
- s->nice_match = nice_length;
- s->max_chain_length = max_chain;
- return Z_OK;
-}
-
-/* =========================================================================
- * For the default windowBits of 15 and memLevel of 8, this function returns
- * a close to exact, as well as small, upper bound on the compressed size.
- * They are coded as constants here for a reason--if the #define's are
- * changed, then this function needs to be changed as well. The return
- * value for 15 and 8 only works for those exact settings.
- *
- * For any setting other than those defaults for windowBits and memLevel,
- * the value returned is a conservative worst case for the maximum expansion
- * resulting from using fixed blocks instead of stored blocks, which deflate
- * can emit on compressed data for some combinations of the parameters.
- *
- * This function could be more sophisticated to provide closer upper bounds
- * for every combination of windowBits and memLevel, as well as wrap.
- * But even the conservative upper bound of about 14% expansion does not
- * seem onerous for output buffer allocation.
- */
-uLong ZEXPORT deflateBound(strm, sourceLen)
- z_streamp strm;
- uLong sourceLen;
-{
- deflate_state *s;
- uLong destLen;
-
- /* conservative upper bound */
- destLen = sourceLen +
- ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 11;
-
- /* if can't get parameters, return conservative bound */
- if (strm == Z_NULL || strm->state == Z_NULL)
- return destLen;
-
- /* if not default parameters, return conservative bound */
- s = strm->state;
- if (s->w_bits != 15 || s->hash_bits != 8 + 7)
- return destLen;
-
- /* default settings: return tight bound for that case */
- return compressBound(sourceLen);
-}
-
-/* =========================================================================
- * Put a short in the pending buffer. The 16-bit value is put in MSB order.
- * IN assertion: the stream state is correct and there is enough room in
- * pending_buf.
- */
-local void putShortMSB (s, b)
- deflate_state *s;
- uInt b;
-{
- put_byte(s, (Byte)(b >> 8));
- put_byte(s, (Byte)(b & 0xff));
-}
-
-/* =========================================================================
- * Flush as much pending output as possible. All deflate() output goes
- * through this function so some applications may wish to modify it
- * to avoid allocating a large strm->next_out buffer and copying into it.
- * (See also read_buf()).
- */
-local void flush_pending(strm)
- z_streamp strm;
-{
- unsigned len = strm->state->pending;
-
- if (len > strm->avail_out) len = strm->avail_out;
- if (len == 0) return;
-
- zmemcpy(strm->next_out, strm->state->pending_out, len);
- strm->next_out += len;
- strm->state->pending_out += len;
- strm->total_out += len;
- strm->avail_out -= len;
- strm->state->pending -= len;
- if (strm->state->pending == 0) {
- strm->state->pending_out = strm->state->pending_buf;
- }
-}
-
-/* ========================================================================= */
-int ZEXPORT deflate (strm, flush)
- z_streamp strm;
- int flush;
-{
- int old_flush; /* value of flush param for previous deflate call */
- deflate_state *s;
-
- if (strm == Z_NULL || strm->state == Z_NULL ||
- flush > Z_FINISH || flush < 0) {
- return Z_STREAM_ERROR;
- }
- s = strm->state;
-
- if (strm->next_out == Z_NULL ||
- (strm->next_in == Z_NULL && strm->avail_in != 0) ||
- (s->status == FINISH_STATE && flush != Z_FINISH)) {
- ERR_RETURN(strm, Z_STREAM_ERROR);
- }
- if (strm->avail_out == 0) ERR_RETURN(strm, Z_BUF_ERROR);
-
- s->strm = strm; /* just in case */
- old_flush = s->last_flush;
- s->last_flush = flush;
-
- /* Write the header */
- if (s->status == INIT_STATE) {
-#ifdef GZIP
- if (s->wrap == 2) {
- strm->adler = crc32(0L, Z_NULL, 0);
- put_byte(s, 31);
- put_byte(s, 139);
- put_byte(s, 8);
- if (s->gzhead == NULL) {
- put_byte(s, 0);
- put_byte(s, 0);
- put_byte(s, 0);
- put_byte(s, 0);
- put_byte(s, 0);
- put_byte(s, s->level == 9 ? 2 :
- (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
- 4 : 0));
- put_byte(s, OS_CODE);
- s->status = BUSY_STATE;
- }
- else {
- put_byte(s, (s->gzhead->text ? 1 : 0) +
- (s->gzhead->hcrc ? 2 : 0) +
- (s->gzhead->extra == Z_NULL ? 0 : 4) +
- (s->gzhead->name == Z_NULL ? 0 : 8) +
- (s->gzhead->comment == Z_NULL ? 0 : 16)
- );
- put_byte(s, (Byte)(s->gzhead->time & 0xff));
- put_byte(s, (Byte)((s->gzhead->time >> 8) & 0xff));
- put_byte(s, (Byte)((s->gzhead->time >> 16) & 0xff));
- put_byte(s, (Byte)((s->gzhead->time >> 24) & 0xff));
- put_byte(s, s->level == 9 ? 2 :
- (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
- 4 : 0));
- put_byte(s, s->gzhead->os & 0xff);
- if (s->gzhead->extra != NULL) {
- put_byte(s, s->gzhead->extra_len & 0xff);
- put_byte(s, (s->gzhead->extra_len >> 8) & 0xff);
- }
- if (s->gzhead->hcrc)
- strm->adler = crc32(strm->adler, s->pending_buf,
- s->pending);
- s->gzindex = 0;
- s->status = EXTRA_STATE;
- }
- }
- else
-#endif
- {
- uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
- uInt level_flags;
-
- if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
- level_flags = 0;
- else if (s->level < 6)
- level_flags = 1;
- else if (s->level == 6)
- level_flags = 2;
- else
- level_flags = 3;
- header |= (level_flags << 6);
- if (s->strstart != 0) header |= PRESET_DICT;
- header += 31 - (header % 31);
-
- s->status = BUSY_STATE;
- putShortMSB(s, header);
-
- /* Save the adler32 of the preset dictionary: */
- if (s->strstart != 0) {
- putShortMSB(s, (uInt)(strm->adler >> 16));
- putShortMSB(s, (uInt)(strm->adler & 0xffff));
- }
- strm->adler = adler32(0L, Z_NULL, 0);
- }
- }
-#ifdef GZIP
- if (s->status == EXTRA_STATE) {
- if (s->gzhead->extra != NULL) {
- uInt beg = s->pending; /* start of bytes to update crc */
-
- while (s->gzindex < (s->gzhead->extra_len & 0xffff)) {
- if (s->pending == s->pending_buf_size) {
- if (s->gzhead->hcrc && s->pending > beg)
- strm->adler = crc32(strm->adler, s->pending_buf + beg,
- s->pending - beg);
- flush_pending(strm);
- beg = s->pending;
- if (s->pending == s->pending_buf_size)
- break;
- }
- put_byte(s, s->gzhead->extra[s->gzindex]);
- s->gzindex++;
- }
- if (s->gzhead->hcrc && s->pending > beg)
- strm->adler = crc32(strm->adler, s->pending_buf + beg,
- s->pending - beg);
- if (s->gzindex == s->gzhead->extra_len) {
- s->gzindex = 0;
- s->status = NAME_STATE;
- }
- }
- else
- s->status = NAME_STATE;
- }
- if (s->status == NAME_STATE) {
- if (s->gzhead->name != NULL) {
- uInt beg = s->pending; /* start of bytes to update crc */
- int val;
-
- do {
- if (s->pending == s->pending_buf_size) {
- if (s->gzhead->hcrc && s->pending > beg)
- strm->adler = crc32(strm->adler, s->pending_buf + beg,
- s->pending - beg);
- flush_pending(strm);
- beg = s->pending;
- if (s->pending == s->pending_buf_size) {
- val = 1;
- break;
- }
- }
- val = s->gzhead->name[s->gzindex++];
- put_byte(s, val);
- } while (val != 0);
- if (s->gzhead->hcrc && s->pending > beg)
- strm->adler = crc32(strm->adler, s->pending_buf + beg,
- s->pending - beg);
- if (val == 0) {
- s->gzindex = 0;
- s->status = COMMENT_STATE;
- }
- }
- else
- s->status = COMMENT_STATE;
- }
- if (s->status == COMMENT_STATE) {
- if (s->gzhead->comment != NULL) {
- uInt beg = s->pending; /* start of bytes to update crc */
- int val;
-
- do {
- if (s->pending == s->pending_buf_size) {
- if (s->gzhead->hcrc && s->pending > beg)
- strm->adler = crc32(strm->adler, s->pending_buf + beg,
- s->pending - beg);
- flush_pending(strm);
- beg = s->pending;
- if (s->pending == s->pending_buf_size) {
- val = 1;
- break;
- }
- }
- val = s->gzhead->comment[s->gzindex++];
- put_byte(s, val);
- } while (val != 0);
- if (s->gzhead->hcrc && s->pending > beg)
- strm->adler = crc32(strm->adler, s->pending_buf + beg,
- s->pending - beg);
- if (val == 0)
- s->status = HCRC_STATE;
- }
- else
- s->status = HCRC_STATE;
- }
- if (s->status == HCRC_STATE) {
- if (s->gzhead->hcrc) {
- if (s->pending + 2 > s->pending_buf_size)
- flush_pending(strm);
- if (s->pending + 2 <= s->pending_buf_size) {
- put_byte(s, (Byte)(strm->adler & 0xff));
- put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
- strm->adler = crc32(0L, Z_NULL, 0);
- s->status = BUSY_STATE;
- }
- }
- else
- s->status = BUSY_STATE;
- }
-#endif
-
- /* Flush as much pending output as possible */
- if (s->pending != 0) {
- flush_pending(strm);
- if (strm->avail_out == 0) {
- /* Since avail_out is 0, deflate will be called again with
- * more output space, but possibly with both pending and
- * avail_in equal to zero. There won't be anything to do,
- * but this is not an error situation so make sure we
- * return OK instead of BUF_ERROR at next call of deflate:
- */
- s->last_flush = -1;
- return Z_OK;
- }
-
- /* Make sure there is something to do and avoid duplicate consecutive
- * flushes. For repeated and useless calls with Z_FINISH, we keep
- * returning Z_STREAM_END instead of Z_BUF_ERROR.
- */
- } else if (strm->avail_in == 0 && flush <= old_flush &&
- flush != Z_FINISH) {
- ERR_RETURN(strm, Z_BUF_ERROR);
- }
-
- /* User must not provide more input after the first FINISH: */
- if (s->status == FINISH_STATE && strm->avail_in != 0) {
- ERR_RETURN(strm, Z_BUF_ERROR);
- }
-
- /* Start a new block or continue the current one.
- */
- if (strm->avail_in != 0 || s->lookahead != 0 ||
- (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
- block_state bstate;
-
- bstate = (*(configuration_table[s->level].func))(s, flush);
-
- if (bstate == finish_started || bstate == finish_done) {
- s->status = FINISH_STATE;
- }
- if (bstate == need_more || bstate == finish_started) {
- if (strm->avail_out == 0) {
- s->last_flush = -1; /* avoid BUF_ERROR next call, see above */
- }
- return Z_OK;
- /* If flush != Z_NO_FLUSH && avail_out == 0, the next call
- * of deflate should use the same flush parameter to make sure
- * that the flush is complete. So we don't have to output an
- * empty block here, this will be done at next call. This also
- * ensures that for a very small output buffer, we emit at most
- * one empty block.
- */
- }
- if (bstate == block_done) {
- if (flush == Z_PARTIAL_FLUSH) {
- _tr_align(s);
- } else { /* FULL_FLUSH or SYNC_FLUSH */
- _tr_stored_block(s, (char*)0, 0L, 0);
- /* For a full flush, this empty block will be recognized
- * as a special marker by inflate_sync().
- */
- if (flush == Z_FULL_FLUSH) {
- CLEAR_HASH(s); /* forget history */
- }
- }
- flush_pending(strm);
- if (strm->avail_out == 0) {
- s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */
- return Z_OK;
- }
- }
- }
- Assert(strm->avail_out > 0, "bug2");
-
- if (flush != Z_FINISH) return Z_OK;
- if (s->wrap <= 0) return Z_STREAM_END;
-
- /* Write the trailer */
-#ifdef GZIP
- if (s->wrap == 2) {
- put_byte(s, (Byte)(strm->adler & 0xff));
- put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
- put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
- put_byte(s, (Byte)((strm->adler >> 24) & 0xff));
- put_byte(s, (Byte)(strm->total_in & 0xff));
- put_byte(s, (Byte)((strm->total_in >> 8) & 0xff));
- put_byte(s, (Byte)((strm->total_in >> 16) & 0xff));
- put_byte(s, (Byte)((strm->total_in >> 24) & 0xff));
- }
- else
-#endif
- {
- putShortMSB(s, (uInt)(strm->adler >> 16));
- putShortMSB(s, (uInt)(strm->adler & 0xffff));
- }
- flush_pending(strm);
- /* If avail_out is zero, the application will call deflate again
- * to flush the rest.
- */
- if (s->wrap > 0) s->wrap = -s->wrap; /* write the trailer only once! */
- return s->pending != 0 ? Z_OK : Z_STREAM_END;
-}
-
-/* ========================================================================= */
-int ZEXPORT deflateEnd (strm)
- z_streamp strm;
-{
- int status;
-
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
-
- status = strm->state->status;
- if (status != INIT_STATE &&
- status != EXTRA_STATE &&
- status != NAME_STATE &&
- status != COMMENT_STATE &&
- status != HCRC_STATE &&
- status != BUSY_STATE &&
- status != FINISH_STATE) {
- return Z_STREAM_ERROR;
- }
-
- /* Deallocate in reverse order of allocations: */
- TRY_FREE(strm, strm->state->pending_buf);
- TRY_FREE(strm, strm->state->head);
- TRY_FREE(strm, strm->state->prev);
- TRY_FREE(strm, strm->state->window);
-
- ZFREE(strm, strm->state);
- strm->state = Z_NULL;
-
- return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
-}
-
-/* =========================================================================
- * Copy the source state to the destination state.
- * To simplify the source, this is not supported for 16-bit MSDOS (which
- * doesn't have enough memory anyway to duplicate compression states).
- */
-int ZEXPORT deflateCopy (dest, source)
- z_streamp dest;
- z_streamp source;
-{
-#ifdef MAXSEG_64K
- return Z_STREAM_ERROR;
-#else
- deflate_state *ds;
- deflate_state *ss;
- ushf *overlay;
-
-
- if (source == Z_NULL || dest == Z_NULL || source->state == Z_NULL) {
- return Z_STREAM_ERROR;
- }
-
- ss = source->state;
-
- zmemcpy(dest, source, sizeof(z_stream));
-
- ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state));
- if (ds == Z_NULL) return Z_MEM_ERROR;
- dest->state = (struct internal_state FAR *) ds;
- zmemcpy(ds, ss, sizeof(deflate_state));
- ds->strm = dest;
-
- ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte));
- ds->prev = (Posf *) ZALLOC(dest, ds->w_size, sizeof(Pos));
- ds->head = (Posf *) ZALLOC(dest, ds->hash_size, sizeof(Pos));
- overlay = (ushf *) ZALLOC(dest, ds->lit_bufsize, sizeof(ush)+2);
- ds->pending_buf = (uchf *) overlay;
-
- if (ds->window == Z_NULL || ds->prev == Z_NULL || ds->head == Z_NULL ||
- ds->pending_buf == Z_NULL) {
- deflateEnd (dest);
- return Z_MEM_ERROR;
- }
- /* following zmemcpy do not work for 16-bit MSDOS */
- zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte));
- zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos));
- zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos));
- zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
-
- ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
- ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush);
- ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize;
-
- ds->l_desc.dyn_tree = ds->dyn_ltree;
- ds->d_desc.dyn_tree = ds->dyn_dtree;
- ds->bl_desc.dyn_tree = ds->bl_tree;
-
- return Z_OK;
-#endif /* MAXSEG_64K */
-}
-
-/* ===========================================================================
- * Read a new buffer from the current input stream, update the adler32
- * and total number of bytes read. All deflate() input goes through
- * this function so some applications may wish to modify it to avoid
- * allocating a large strm->next_in buffer and copying from it.
- * (See also flush_pending()).
- */
-local int read_buf(strm, buf, size)
- z_streamp strm;
- Bytef *buf;
- unsigned size;
-{
- unsigned len = strm->avail_in;
-
- if (len > size) len = size;
- if (len == 0) return 0;
-
- strm->avail_in -= len;
-
- if (strm->state->wrap == 1) {
- strm->adler = adler32(strm->adler, strm->next_in, len);
- }
-#ifdef GZIP
- else if (strm->state->wrap == 2) {
- strm->adler = crc32(strm->adler, strm->next_in, len);
- }
-#endif
- zmemcpy(buf, strm->next_in, len);
- strm->next_in += len;
- strm->total_in += len;
-
- return (int)len;
-}
-
-/* ===========================================================================
- * Initialize the "longest match" routines for a new zlib stream
- */
-local void lm_init (s)
- deflate_state *s;
-{
- s->window_size = (ulg)2L*s->w_size;
-
- CLEAR_HASH(s);
-
- /* Set the default configuration parameters:
- */
- s->max_lazy_match = configuration_table[s->level].max_lazy;
- s->good_match = configuration_table[s->level].good_length;
- s->nice_match = configuration_table[s->level].nice_length;
- s->max_chain_length = configuration_table[s->level].max_chain;
-
- s->strstart = 0;
- s->block_start = 0L;
- s->lookahead = 0;
- s->match_length = s->prev_length = MIN_MATCH-1;
- s->match_available = 0;
- s->ins_h = 0;
-#ifndef FASTEST
-#ifdef ASMV
- match_init(); /* initialize the asm code */
-#endif
-#endif
-}
-
-#ifndef FASTEST
-/* ===========================================================================
- * Set match_start to the longest match starting at the given string and
- * return its length. Matches shorter or equal to prev_length are discarded,
- * in which case the result is equal to prev_length and match_start is
- * garbage.
- * IN assertions: cur_match is the head of the hash chain for the current
- * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
- * OUT assertion: the match length is not greater than s->lookahead.
- */
-#ifndef ASMV
-/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
- * match.S. The code will be functionally equivalent.
- */
-local uInt longest_match(s, cur_match)
- deflate_state *s;
- IPos cur_match; /* current match */
-{
- unsigned chain_length = s->max_chain_length;/* max hash chain length */
- register Bytef *scan = s->window + s->strstart; /* current string */
- register Bytef *match; /* matched string */
- register int len; /* length of current match */
- int best_len = s->prev_length; /* best match length so far */
- int nice_match = s->nice_match; /* stop if match long enough */
- IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
- s->strstart - (IPos)MAX_DIST(s) : NIL;
- /* Stop when cur_match becomes <= limit. To simplify the code,
- * we prevent matches with the string of window index 0.
- */
- Posf *prev = s->prev;
- uInt wmask = s->w_mask;
-
-#ifdef UNALIGNED_OK
- /* Compare two bytes at a time. Note: this is not always beneficial.
- * Try with and without -DUNALIGNED_OK to check.
- */
- register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
- register ush scan_start = *(ushf*)scan;
- register ush scan_end = *(ushf*)(scan+best_len-1);
-#else
- register Bytef *strend = s->window + s->strstart + MAX_MATCH;
- register Byte scan_end1 = scan[best_len-1];
- register Byte scan_end = scan[best_len];
-#endif
-
- /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
- * It is easy to get rid of this optimization if necessary.
- */
- Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
-
- /* Do not waste too much time if we already have a good match: */
- if (s->prev_length >= s->good_match) {
- chain_length >>= 2;
- }
- /* Do not look for matches beyond the end of the input. This is necessary
- * to make deflate deterministic.
- */
- if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
-
- Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
-
- do {
- Assert(cur_match < s->strstart, "no future");
- match = s->window + cur_match;
-
- /* Skip to next match if the match length cannot increase
- * or if the match length is less than 2. Note that the checks below
- * for insufficient lookahead only occur occasionally for performance
- * reasons. Therefore uninitialized memory will be accessed, and
- * conditional jumps will be made that depend on those values.
- * However the length of the match is limited to the lookahead, so
- * the output of deflate is not affected by the uninitialized values.
- */
-#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
- /* This code assumes sizeof(unsigned short) == 2. Do not use
- * UNALIGNED_OK if your compiler uses a different size.
- */
- if (*(ushf*)(match+best_len-1) != scan_end ||
- *(ushf*)match != scan_start) continue;
-
- /* It is not necessary to compare scan[2] and match[2] since they are
- * always equal when the other bytes match, given that the hash keys
- * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
- * strstart+3, +5, ... up to strstart+257. We check for insufficient
- * lookahead only every 4th comparison; the 128th check will be made
- * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
- * necessary to put more guard bytes at the end of the window, or
- * to check more often for insufficient lookahead.
- */
- Assert(scan[2] == match[2], "scan[2]?");
- scan++, match++;
- do {
- } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
- *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
- *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
- *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
- scan < strend);
- /* The funny "do {}" generates better code on most compilers */
-
- /* Here, scan <= window+strstart+257 */
- Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
- if (*scan == *match) scan++;
-
- len = (MAX_MATCH - 1) - (int)(strend-scan);
- scan = strend - (MAX_MATCH-1);
-
-#else /* UNALIGNED_OK */
-
- if (match[best_len] != scan_end ||
- match[best_len-1] != scan_end1 ||
- *match != *scan ||
- *++match != scan[1]) continue;
-
- /* The check at best_len-1 can be removed because it will be made
- * again later. (This heuristic is not always a win.)
- * It is not necessary to compare scan[2] and match[2] since they
- * are always equal when the other bytes match, given that
- * the hash keys are equal and that HASH_BITS >= 8.
- */
- scan += 2, match++;
- Assert(*scan == *match, "match[2]?");
-
- /* We check for insufficient lookahead only every 8th comparison;
- * the 256th check will be made at strstart+258.
- */
- do {
- } while (*++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- scan < strend);
-
- Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
-
- len = MAX_MATCH - (int)(strend - scan);
- scan = strend - MAX_MATCH;
-
-#endif /* UNALIGNED_OK */
-
- if (len > best_len) {
- s->match_start = cur_match;
- best_len = len;
- if (len >= nice_match) break;
-#ifdef UNALIGNED_OK
- scan_end = *(ushf*)(scan+best_len-1);
-#else
- scan_end1 = scan[best_len-1];
- scan_end = scan[best_len];
-#endif
- }
- } while ((cur_match = prev[cur_match & wmask]) > limit
- && --chain_length != 0);
-
- if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
- return s->lookahead;
-}
-#endif /* ASMV */
-#endif /* FASTEST */
-
-/* ---------------------------------------------------------------------------
- * Optimized version for level == 1 or strategy == Z_RLE only
- */
-local uInt longest_match_fast(s, cur_match)
- deflate_state *s;
- IPos cur_match; /* current match */
-{
- register Bytef *scan = s->window + s->strstart; /* current string */
- register Bytef *match; /* matched string */
- register int len; /* length of current match */
- register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-
- /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
- * It is easy to get rid of this optimization if necessary.
- */
- Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
-
- Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
-
- Assert(cur_match < s->strstart, "no future");
-
- match = s->window + cur_match;
-
- /* Return failure if the match length is less than 2:
- */
- if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1;
-
- /* The check at best_len-1 can be removed because it will be made
- * again later. (This heuristic is not always a win.)
- * It is not necessary to compare scan[2] and match[2] since they
- * are always equal when the other bytes match, given that
- * the hash keys are equal and that HASH_BITS >= 8.
- */
- scan += 2, match += 2;
- Assert(*scan == *match, "match[2]?");
-
- /* We check for insufficient lookahead only every 8th comparison;
- * the 256th check will be made at strstart+258.
- */
- do {
- } while (*++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- *++scan == *++match && *++scan == *++match &&
- scan < strend);
-
- Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
-
- len = MAX_MATCH - (int)(strend - scan);
-
- if (len < MIN_MATCH) return MIN_MATCH - 1;
-
- s->match_start = cur_match;
- return (uInt)len <= s->lookahead ? (uInt)len : s->lookahead;
-}
-
-#ifdef DEBUG
-/* ===========================================================================
- * Check that the match at match_start is indeed a match.
- */
-local void check_match(s, start, match, length)
- deflate_state *s;
- IPos start, match;
- int length;
-{
- /* check that the match is indeed a match */
- if (zmemcmp(s->window + match,
- s->window + start, length) != EQUAL) {
- fprintf(stderr, " start %u, match %u, length %d\n",
- start, match, length);
- do {
- fprintf(stderr, "%c%c", s->window[match++], s->window[start++]);
- } while (--length != 0);
- z_error("invalid match");
- }
- if (z_verbose > 1) {
- fprintf(stderr,"\\[%d,%d]", start-match, length);
- do { putc(s->window[start++], stderr); } while (--length != 0);
- }
-}
-#else
-# define check_match(s, start, match, length)
-#endif /* DEBUG */
-
-/* ===========================================================================
- * Fill the window when the lookahead becomes insufficient.
- * Updates strstart and lookahead.
- *
- * IN assertion: lookahead < MIN_LOOKAHEAD
- * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
- * At least one byte has been read, or avail_in == 0; reads are
- * performed for at least two bytes (required for the zip translate_eol
- * option -- not supported here).
- */
-local void fill_window(s)
- deflate_state *s;
-{
- register unsigned n, m;
- register Posf *p;
- unsigned more; /* Amount of free space at the end of the window. */
- uInt wsize = s->w_size;
-
- do {
- more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
-
- /* Deal with !@#$% 64K limit: */
- if (sizeof(int) <= 2) {
- if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
- more = wsize;
-
- } else if (more == (unsigned)(-1)) {
- /* Very unlikely, but possible on 16 bit machine if
- * strstart == 0 && lookahead == 1 (input done a byte at time)
- */
- more--;
- }
- }
-
- /* If the window is almost full and there is insufficient lookahead,
- * move the upper half to the lower one to make room in the upper half.
- */
- if (s->strstart >= wsize+MAX_DIST(s)) {
-
- zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
- s->match_start -= wsize;
- s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
- s->block_start -= (long) wsize;
-
- /* Slide the hash table (could be avoided with 32 bit values
- at the expense of memory usage). We slide even when level == 0
- to keep the hash table consistent if we switch back to level > 0
- later. (Using level 0 permanently is not an optimal usage of
- zlib, so we don't care about this pathological case.)
- */
- /* %%% avoid this when Z_RLE */
- n = s->hash_size;
- p = &s->head[n];
- do {
- m = *--p;
- *p = (Pos)(m >= wsize ? m-wsize : NIL);
- } while (--n);
-
- n = wsize;
-#ifndef FASTEST
- p = &s->prev[n];
- do {
- m = *--p;
- *p = (Pos)(m >= wsize ? m-wsize : NIL);
- /* If n is not on any hash chain, prev[n] is garbage but
- * its value will never be used.
- */
- } while (--n);
-#endif
- more += wsize;
- }
- if (s->strm->avail_in == 0) return;
-
- /* If there was no sliding:
- * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
- * more == window_size - lookahead - strstart
- * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
- * => more >= window_size - 2*WSIZE + 2
- * In the BIG_MEM or MMAP case (not yet supported),
- * window_size == input_size + MIN_LOOKAHEAD &&
- * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
- * Otherwise, window_size == 2*WSIZE so more >= 2.
- * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
- */
- Assert(more >= 2, "more < 2");
-
- n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
- s->lookahead += n;
-
- /* Initialize the hash value now that we have some input: */
- if (s->lookahead >= MIN_MATCH) {
- s->ins_h = s->window[s->strstart];
- UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
-#if MIN_MATCH != 3
- Call UPDATE_HASH() MIN_MATCH-3 more times
-#endif
- }
- /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
- * but this is not important since only literal bytes will be emitted.
- */
-
- } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-}
-
-/* ===========================================================================
- * Flush the current block, with given end-of-file flag.
- * IN assertion: strstart is set to the end of the current match.
- */
-#define FLUSH_BLOCK_ONLY(s, eof) { \
- _tr_flush_block(s, (s->block_start >= 0L ? \
- (charf *)&s->window[(unsigned)s->block_start] : \
- (charf *)Z_NULL), \
- (ulg)((long)s->strstart - s->block_start), \
- (eof)); \
- s->block_start = s->strstart; \
- flush_pending(s->strm); \
- Tracev((stderr,"[FLUSH]")); \
-}
-
-/* Same but force premature exit if necessary. */
-#define FLUSH_BLOCK(s, eof) { \
- FLUSH_BLOCK_ONLY(s, eof); \
- if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \
-}
-
-/* ===========================================================================
- * Copy without compression as much as possible from the input stream, return
- * the current block state.
- * This function does not insert new strings in the dictionary since
- * uncompressible data is probably not useful. This function is used
- * only for the level=0 compression option.
- * NOTE: this function should be optimized to avoid extra copying from
- * window to pending_buf.
- */
-local block_state deflate_stored(s, flush)
- deflate_state *s;
- int flush;
-{
- /* Stored blocks are limited to 0xffff bytes, pending_buf is limited
- * to pending_buf_size, and each stored block has a 5 byte header:
- */
- ulg max_block_size = 0xffff;
- ulg max_start;
-
- if (max_block_size > s->pending_buf_size - 5) {
- max_block_size = s->pending_buf_size - 5;
- }
-
- /* Copy as much as possible from input to output: */
- for (;;) {
- /* Fill the window as much as possible: */
- if (s->lookahead <= 1) {
-
- Assert(s->strstart < s->w_size+MAX_DIST(s) ||
- s->block_start >= (long)s->w_size, "slide too late");
-
- fill_window(s);
- if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more;
-
- if (s->lookahead == 0) break; /* flush the current block */
- }
- Assert(s->block_start >= 0L, "block gone");
-
- s->strstart += s->lookahead;
- s->lookahead = 0;
-
- /* Emit a stored block if pending_buf will be full: */
- max_start = s->block_start + max_block_size;
- if (s->strstart == 0 || (ulg)s->strstart >= max_start) {
- /* strstart == 0 is possible when wraparound on 16-bit machine */
- s->lookahead = (uInt)(s->strstart - max_start);
- s->strstart = (uInt)max_start;
- FLUSH_BLOCK(s, 0);
- }
- /* Flush if we may have to slide, otherwise block_start may become
- * negative and the data will be gone:
- */
- if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) {
- FLUSH_BLOCK(s, 0);
- }
- }
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
-}
-
-/* ===========================================================================
- * Compress as much as possible from the input stream, return the current
- * block state.
- * This function does not perform lazy evaluation of matches and inserts
- * new strings in the dictionary only for unmatched strings or for short
- * matches. It is used only for the fast compression options.
- */
-local block_state deflate_fast(s, flush)
- deflate_state *s;
- int flush;
-{
- IPos hash_head = NIL; /* head of the hash chain */
- int bflush; /* set if current block must be flushed */
-
- for (;;) {
- /* Make sure that we always have enough lookahead, except
- * at the end of the input file. We need MAX_MATCH bytes
- * for the next match, plus MIN_MATCH bytes to insert the
- * string following the next match.
- */
- if (s->lookahead < MIN_LOOKAHEAD) {
- fill_window(s);
- if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
- return need_more;
- }
- if (s->lookahead == 0) break; /* flush the current block */
- }
-
- /* Insert the string window[strstart .. strstart+2] in the
- * dictionary, and set hash_head to the head of the hash chain:
- */
- if (s->lookahead >= MIN_MATCH) {
- INSERT_STRING(s, s->strstart, hash_head);
- }
-
- /* Find the longest match, discarding those <= prev_length.
- * At this point we have always match_length < MIN_MATCH
- */
- if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) {
- /* To simplify the code, we prevent matches with the string
- * of window index 0 (in particular we have to avoid a match
- * of the string with itself at the start of the input file).
- */
-#ifdef FASTEST
- if ((s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) ||
- (s->strategy == Z_RLE && s->strstart - hash_head == 1)) {
- s->match_length = longest_match_fast (s, hash_head);
- }
-#else
- if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) {
- s->match_length = longest_match (s, hash_head);
- } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) {
- s->match_length = longest_match_fast (s, hash_head);
- }
-#endif
- /* longest_match() or longest_match_fast() sets match_start */
- }
- if (s->match_length >= MIN_MATCH) {
- check_match(s, s->strstart, s->match_start, s->match_length);
-
- _tr_tally_dist(s, s->strstart - s->match_start,
- s->match_length - MIN_MATCH, bflush);
-
- s->lookahead -= s->match_length;
-
- /* Insert new strings in the hash table only if the match length
- * is not too large. This saves time but degrades compression.
- */
-#ifndef FASTEST
- if (s->match_length <= s->max_insert_length &&
- s->lookahead >= MIN_MATCH) {
- s->match_length--; /* string at strstart already in table */
- do {
- s->strstart++;
- INSERT_STRING(s, s->strstart, hash_head);
- /* strstart never exceeds WSIZE-MAX_MATCH, so there are
- * always MIN_MATCH bytes ahead.
- */
- } while (--s->match_length != 0);
- s->strstart++;
- } else
-#endif
- {
- s->strstart += s->match_length;
- s->match_length = 0;
- s->ins_h = s->window[s->strstart];
- UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
-#if MIN_MATCH != 3
- Call UPDATE_HASH() MIN_MATCH-3 more times
-#endif
- /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not
- * matter since it will be recomputed at next deflate call.
- */
- }
- } else {
- /* No match, output a literal byte */
- Tracevv((stderr,"%c", s->window[s->strstart]));
- _tr_tally_lit (s, s->window[s->strstart], bflush);
- s->lookahead--;
- s->strstart++;
- }
- if (bflush) FLUSH_BLOCK(s, 0);
- }
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
-}
-
-#ifndef FASTEST
-/* ===========================================================================
- * Same as above, but achieves better compression. We use a lazy
- * evaluation for matches: a match is finally adopted only if there is
- * no better match at the next window position.
- */
-local block_state deflate_slow(s, flush)
- deflate_state *s;
- int flush;
-{
- IPos hash_head = NIL; /* head of hash chain */
- int bflush; /* set if current block must be flushed */
-
- /* Process the input block. */
- for (;;) {
- /* Make sure that we always have enough lookahead, except
- * at the end of the input file. We need MAX_MATCH bytes
- * for the next match, plus MIN_MATCH bytes to insert the
- * string following the next match.
- */
- if (s->lookahead < MIN_LOOKAHEAD) {
- fill_window(s);
- if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
- return need_more;
- }
- if (s->lookahead == 0) break; /* flush the current block */
- }
-
- /* Insert the string window[strstart .. strstart+2] in the
- * dictionary, and set hash_head to the head of the hash chain:
- */
- if (s->lookahead >= MIN_MATCH) {
- INSERT_STRING(s, s->strstart, hash_head);
- }
-
- /* Find the longest match, discarding those <= prev_length.
- */
- s->prev_length = s->match_length, s->prev_match = s->match_start;
- s->match_length = MIN_MATCH-1;
-
- if (hash_head != NIL && s->prev_length < s->max_lazy_match &&
- s->strstart - hash_head <= MAX_DIST(s)) {
- /* To simplify the code, we prevent matches with the string
- * of window index 0 (in particular we have to avoid a match
- * of the string with itself at the start of the input file).
- */
- if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) {
- s->match_length = longest_match (s, hash_head);
- } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) {
- s->match_length = longest_match_fast (s, hash_head);
- }
- /* longest_match() or longest_match_fast() sets match_start */
-
- if (s->match_length <= 5 && (s->strategy == Z_FILTERED
-#if TOO_FAR <= 32767
- || (s->match_length == MIN_MATCH &&
- s->strstart - s->match_start > TOO_FAR)
-#endif
- )) {
-
- /* If prev_match is also MIN_MATCH, match_start is garbage
- * but we will ignore the current match anyway.
- */
- s->match_length = MIN_MATCH-1;
- }
- }
- /* If there was a match at the previous step and the current
- * match is not better, output the previous match:
- */
- if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) {
- uInt max_insert = s->strstart + s->lookahead - MIN_MATCH;
- /* Do not insert strings in hash table beyond this. */
-
- check_match(s, s->strstart-1, s->prev_match, s->prev_length);
-
- _tr_tally_dist(s, s->strstart -1 - s->prev_match,
- s->prev_length - MIN_MATCH, bflush);
-
- /* Insert in hash table all strings up to the end of the match.
- * strstart-1 and strstart are already inserted. If there is not
- * enough lookahead, the last two strings are not inserted in
- * the hash table.
- */
- s->lookahead -= s->prev_length-1;
- s->prev_length -= 2;
- do {
- if (++s->strstart <= max_insert) {
- INSERT_STRING(s, s->strstart, hash_head);
- }
- } while (--s->prev_length != 0);
- s->match_available = 0;
- s->match_length = MIN_MATCH-1;
- s->strstart++;
-
- if (bflush) FLUSH_BLOCK(s, 0);
-
- } else if (s->match_available) {
- /* If there was no match at the previous position, output a
- * single literal. If there was a match but the current match
- * is longer, truncate the previous match to a single literal.
- */
- Tracevv((stderr,"%c", s->window[s->strstart-1]));
- _tr_tally_lit(s, s->window[s->strstart-1], bflush);
- if (bflush) {
- FLUSH_BLOCK_ONLY(s, 0);
- }
- s->strstart++;
- s->lookahead--;
- if (s->strm->avail_out == 0) return need_more;
- } else {
- /* There is no previous match to compare with, wait for
- * the next step to decide.
- */
- s->match_available = 1;
- s->strstart++;
- s->lookahead--;
- }
- }
- Assert (flush != Z_NO_FLUSH, "no flush?");
- if (s->match_available) {
- Tracevv((stderr,"%c", s->window[s->strstart-1]));
- _tr_tally_lit(s, s->window[s->strstart-1], bflush);
- s->match_available = 0;
- }
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
-}
-#endif /* FASTEST */
-
-#if 0
-/* ===========================================================================
- * For Z_RLE, simply look for runs of bytes, generate matches only of distance
- * one. Do not maintain a hash table. (It will be regenerated if this run of
- * deflate switches away from Z_RLE.)
- */
-local block_state deflate_rle(s, flush)
- deflate_state *s;
- int flush;
-{
- int bflush; /* set if current block must be flushed */
- uInt run; /* length of run */
- uInt max; /* maximum length of run */
- uInt prev; /* byte at distance one to match */
- Bytef *scan; /* scan for end of run */
-
- for (;;) {
- /* Make sure that we always have enough lookahead, except
- * at the end of the input file. We need MAX_MATCH bytes
- * for the longest encodable run.
- */
- if (s->lookahead < MAX_MATCH) {
- fill_window(s);
- if (s->lookahead < MAX_MATCH && flush == Z_NO_FLUSH) {
- return need_more;
- }
- if (s->lookahead == 0) break; /* flush the current block */
- }
-
- /* See how many times the previous byte repeats */
- run = 0;
- if (s->strstart > 0) { /* if there is a previous byte, that is */
- max = s->lookahead < MAX_MATCH ? s->lookahead : MAX_MATCH;
- scan = s->window + s->strstart - 1;
- prev = *scan++;
- do {
- if (*scan++ != prev)
- break;
- } while (++run < max);
- }
-
- /* Emit match if have run of MIN_MATCH or longer, else emit literal */
- if (run >= MIN_MATCH) {
- check_match(s, s->strstart, s->strstart - 1, run);
- _tr_tally_dist(s, 1, run - MIN_MATCH, bflush);
- s->lookahead -= run;
- s->strstart += run;
- } else {
- /* No match, output a literal byte */
- Tracevv((stderr,"%c", s->window[s->strstart]));
- _tr_tally_lit (s, s->window[s->strstart], bflush);
- s->lookahead--;
- s->strstart++;
- }
- if (bflush) FLUSH_BLOCK(s, 0);
- }
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
-}
-#endif
diff --git a/sys/contrib/opensolaris/uts/common/zmod/deflate.h b/sys/contrib/opensolaris/uts/common/zmod/deflate.h
deleted file mode 100644
index d01a3c1..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/deflate.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/* deflate.h -- internal compression state
- * Copyright (C) 1995-2004 Jean-loup Gailly
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* WARNING: this file should *not* be used by applications. It is
- part of the implementation of the compression library and is
- subject to change. Applications should only use zlib.h.
- */
-
-#ifndef _DEFLATE_H
-#define _DEFLATE_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "zutil.h"
-
-/* define NO_GZIP when compiling if you want to disable gzip header and
- trailer creation by deflate(). NO_GZIP would be used to avoid linking in
- the crc code when it is not needed. For shared libraries, gzip encoding
- should be left enabled. */
-#ifndef NO_GZIP
-# define GZIP
-#endif
-
-/* ===========================================================================
- * Internal compression state.
- */
-
-#define LENGTH_CODES 29
-/* number of length codes, not counting the special END_BLOCK code */
-
-#define LITERALS 256
-/* number of literal bytes 0..255 */
-
-#define L_CODES (LITERALS+1+LENGTH_CODES)
-/* number of Literal or Length codes, including the END_BLOCK code */
-
-#define D_CODES 30
-/* number of distance codes */
-
-#define BL_CODES 19
-/* number of codes used to transfer the bit lengths */
-
-#define HEAP_SIZE (2*L_CODES+1)
-/* maximum heap size */
-
-#define MAX_BITS 15
-/* All codes must not exceed MAX_BITS bits */
-
-#define INIT_STATE 42
-#define EXTRA_STATE 69
-#define NAME_STATE 73
-#define COMMENT_STATE 91
-#define HCRC_STATE 103
-#define BUSY_STATE 113
-#define FINISH_STATE 666
-/* Stream status */
-
-
-/* Data structure describing a single value and its code string. */
-typedef struct ct_data_s {
- union {
- ush freq; /* frequency count */
- ush code; /* bit string */
- } fc;
- union {
- ush dad; /* father node in Huffman tree */
- ush len; /* length of bit string */
- } dl;
-} FAR ct_data;
-
-#define Freq fc.freq
-#define Code fc.code
-#define Dad dl.dad
-#define Len dl.len
-
-typedef struct static_tree_desc_s static_tree_desc;
-
-typedef struct tree_desc_s {
- ct_data *dyn_tree; /* the dynamic tree */
- int max_code; /* largest code with non zero frequency */
- static_tree_desc *stat_desc; /* the corresponding static tree */
-} FAR tree_desc;
-
-typedef ush Pos;
-typedef Pos FAR Posf;
-typedef unsigned IPos;
-
-/* A Pos is an index in the character window. We use short instead of int to
- * save space in the various tables. IPos is used only for parameter passing.
- */
-
-typedef struct internal_state {
- z_streamp strm; /* pointer back to this zlib stream */
- int status; /* as the name implies */
- Bytef *pending_buf; /* output still pending */
- ulg pending_buf_size; /* size of pending_buf */
- Bytef *pending_out; /* next pending byte to output to the stream */
- uInt pending; /* nb of bytes in the pending buffer */
- int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
- gz_headerp gzhead; /* gzip header information to write */
- uInt gzindex; /* where in extra, name, or comment */
- Byte method; /* STORED (for zip only) or DEFLATED */
- int last_flush; /* value of flush param for previous deflate call */
-
- /* used by deflate.c: */
-
- uInt w_size; /* LZ77 window size (32K by default) */
- uInt w_bits; /* log2(w_size) (8..16) */
- uInt w_mask; /* w_size - 1 */
-
- Bytef *window;
- /* Sliding window. Input bytes are read into the second half of the window,
- * and move to the first half later to keep a dictionary of at least wSize
- * bytes. With this organization, matches are limited to a distance of
- * wSize-MAX_MATCH bytes, but this ensures that IO is always
- * performed with a length multiple of the block size. Also, it limits
- * the window size to 64K, which is quite useful on MSDOS.
- * To do: use the user input buffer as sliding window.
- */
-
- ulg window_size;
- /* Actual size of window: 2*wSize, except when the user input buffer
- * is directly used as sliding window.
- */
-
- Posf *prev;
- /* Link to older string with same hash index. To limit the size of this
- * array to 64K, this link is maintained only for the last 32K strings.
- * An index in this array is thus a window index modulo 32K.
- */
-
- Posf *head; /* Heads of the hash chains or NIL. */
-
- uInt ins_h; /* hash index of string to be inserted */
- uInt hash_size; /* number of elements in hash table */
- uInt hash_bits; /* log2(hash_size) */
- uInt hash_mask; /* hash_size-1 */
-
- uInt hash_shift;
- /* Number of bits by which ins_h must be shifted at each input
- * step. It must be such that after MIN_MATCH steps, the oldest
- * byte no longer takes part in the hash key, that is:
- * hash_shift * MIN_MATCH >= hash_bits
- */
-
- long block_start;
- /* Window position at the beginning of the current output block. Gets
- * negative when the window is moved backwards.
- */
-
- uInt match_length; /* length of best match */
- IPos prev_match; /* previous match */
- int match_available; /* set if previous match exists */
- uInt strstart; /* start of string to insert */
- uInt match_start; /* start of matching string */
- uInt lookahead; /* number of valid bytes ahead in window */
-
- uInt prev_length;
- /* Length of the best match at previous step. Matches not greater than this
- * are discarded. This is used in the lazy match evaluation.
- */
-
- uInt max_chain_length;
- /* To speed up deflation, hash chains are never searched beyond this
- * length. A higher limit improves compression ratio but degrades the
- * speed.
- */
-
- uInt max_lazy_match;
- /* Attempt to find a better match only when the current match is strictly
- * smaller than this value. This mechanism is used only for compression
- * levels >= 4.
- */
-# define max_insert_length max_lazy_match
- /* Insert new strings in the hash table only if the match length is not
- * greater than this length. This saves time but degrades compression.
- * max_insert_length is used only for compression levels <= 3.
- */
-
- int level; /* compression level (1..9) */
- int strategy; /* favor or force Huffman coding*/
-
- uInt good_match;
- /* Use a faster search when the previous match is longer than this */
-
- int nice_match; /* Stop searching when current match exceeds this */
-
- /* used by trees.c: */
- /* Didn't use ct_data typedef below to supress compiler warning */
- struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
- struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
- struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
-
- struct tree_desc_s l_desc; /* desc. for literal tree */
- struct tree_desc_s d_desc; /* desc. for distance tree */
- struct tree_desc_s bl_desc; /* desc. for bit length tree */
-
- ush bl_count[MAX_BITS+1];
- /* number of codes at each bit length for an optimal tree */
-
- int heap[2*L_CODES+1]; /* heap used to build the Huffman trees */
- int heap_len; /* number of elements in the heap */
- int heap_max; /* element of largest frequency */
- /* The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
- * The same heap array is used to build all trees.
- */
-
- uch depth[2*L_CODES+1];
- /* Depth of each subtree used as tie breaker for trees of equal frequency
- */
-
- uchf *l_buf; /* buffer for literals or lengths */
-
- uInt lit_bufsize;
- /* Size of match buffer for literals/lengths. There are 4 reasons for
- * limiting lit_bufsize to 64K:
- * - frequencies can be kept in 16 bit counters
- * - if compression is not successful for the first block, all input
- * data is still in the window so we can still emit a stored block even
- * when input comes from standard input. (This can also be done for
- * all blocks if lit_bufsize is not greater than 32K.)
- * - if compression is not successful for a file smaller than 64K, we can
- * even emit a stored file instead of a stored block (saving 5 bytes).
- * This is applicable only for zip (not gzip or zlib).
- * - creating new Huffman trees less frequently may not provide fast
- * adaptation to changes in the input data statistics. (Take for
- * example a binary file with poorly compressible code followed by
- * a highly compressible string table.) Smaller buffer sizes give
- * fast adaptation but have of course the overhead of transmitting
- * trees more frequently.
- * - I can't count above 4
- */
-
- uInt last_lit; /* running index in l_buf */
-
- ushf *d_buf;
- /* Buffer for distances. To simplify the code, d_buf and l_buf have
- * the same number of elements. To use different lengths, an extra flag
- * array would be necessary.
- */
-
- ulg opt_len; /* bit length of current block with optimal trees */
- ulg static_len; /* bit length of current block with static trees */
- uInt matches; /* number of string matches in current block */
- int last_eob_len; /* bit length of EOB code for last block */
-
-#ifdef DEBUG
- ulg compressed_len; /* total bit length of compressed file mod 2^32 */
- ulg bits_sent; /* bit length of compressed data sent mod 2^32 */
-#endif
-
- ush bi_buf;
- /* Output buffer. bits are inserted starting at the bottom (least
- * significant bits).
- */
- int bi_valid;
- /* Number of valid bits in bi_buf. All bits above the last valid bit
- * are always zero.
- */
-
-} FAR deflate_state;
-
-/* Output a byte on the stream.
- * IN assertion: there is enough room in pending_buf.
- */
-#define put_byte(s, c) {s->pending_buf[s->pending++] = (c);}
-
-
-#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
-/* Minimum amount of lookahead, except at the end of the input file.
- * See deflate.c for comments about the MIN_MATCH+1.
- */
-
-#define MAX_DIST(s) ((s)->w_size-MIN_LOOKAHEAD)
-/* In order to simplify the code, particularly on 16 bit machines, match
- * distances are limited to MAX_DIST instead of WSIZE.
- */
-
- /* in trees.c */
-void _tr_init OF((deflate_state *s));
-int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
-void _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len,
- int eof));
-void _tr_align OF((deflate_state *s));
-void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len,
- int eof));
-
-#define d_code(dist) \
- ((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
-/* Mapping from a distance to a distance code. dist is the distance - 1 and
- * must not have side effects. _dist_code[256] and _dist_code[257] are never
- * used.
- */
-
-#ifndef DEBUG
-/* Inline versions of _tr_tally for speed: */
-
-#if defined(GEN_TREES_H) || !defined(STDC)
- extern uch _length_code[];
- extern uch _dist_code[];
-#else
- extern const uch _length_code[];
- extern const uch _dist_code[];
-#endif
-
-# define _tr_tally_lit(s, c, flush) \
- { uch cc = (c); \
- s->d_buf[s->last_lit] = 0; \
- s->l_buf[s->last_lit++] = cc; \
- s->dyn_ltree[cc].Freq++; \
- flush = (s->last_lit == s->lit_bufsize-1); \
- }
-# define _tr_tally_dist(s, distance, length, flush) \
- { uch len = (length); \
- ush dist = (distance); \
- s->d_buf[s->last_lit] = dist; \
- s->l_buf[s->last_lit++] = len; \
- dist--; \
- s->dyn_ltree[_length_code[len]+LITERALS+1].Freq++; \
- s->dyn_dtree[d_code(dist)].Freq++; \
- flush = (s->last_lit == s->lit_bufsize-1); \
- }
-#else
-# define _tr_tally_lit(s, c, flush) flush = _tr_tally(s, 0, c)
-# define _tr_tally_dist(s, distance, length, flush) \
- flush = _tr_tally(s, distance, length)
-#endif
-
-#endif /* _DEFLATE_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffast.c b/sys/contrib/opensolaris/uts/common/zmod/inffast.c
deleted file mode 100644
index a6dcf3f..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inffast.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/* inffast.c -- fast decoding
- * Copyright (C) 1995-2004 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "zutil.h"
-#include "inftrees.h"
-#include "inflate.h"
-#include "inffast.h"
-
-#ifndef ASMINF
-
-/* Allow machine dependent optimization for post-increment or pre-increment.
- Based on testing to date,
- Pre-increment preferred for:
- - PowerPC G3 (Adler)
- - MIPS R5000 (Randers-Pehrson)
- Post-increment preferred for:
- - none
- No measurable difference:
- - Pentium III (Anderson)
- - M68060 (Nikl)
- */
-#ifdef POSTINC
-# define OFF 0
-# define PUP(a) *(a)++
-#else
-# define OFF 1
-# define PUP(a) *++(a)
-#endif
-
-/*
- Decode literal, length, and distance codes and write out the resulting
- literal and match bytes until either not enough input or output is
- available, an end-of-block is encountered, or a data error is encountered.
- When large enough input and output buffers are supplied to inflate(), for
- example, a 16K input buffer and a 64K output buffer, more than 95% of the
- inflate execution time is spent in this routine.
-
- Entry assumptions:
-
- state->mode == LEN
- strm->avail_in >= 6
- strm->avail_out >= 258
- start >= strm->avail_out
- state->bits < 8
-
- On return, state->mode is one of:
-
- LEN -- ran out of enough output space or enough available input
- TYPE -- reached end of block code, inflate() to interpret next block
- BAD -- error in block data
-
- Notes:
-
- - The maximum input bits used by a length/distance pair is 15 bits for the
- length code, 5 bits for the length extra, 15 bits for the distance code,
- and 13 bits for the distance extra. This totals 48 bits, or six bytes.
- Therefore if strm->avail_in >= 6, then there is enough input to avoid
- checking for available input while decoding.
-
- - The maximum bytes that a single length/distance pair can output is 258
- bytes, which is the maximum length that can be coded. inflate_fast()
- requires strm->avail_out >= 258 for each loop to avoid checking for
- output space.
- */
-void inflate_fast(strm, start)
-z_streamp strm;
-unsigned start; /* inflate()'s starting value for strm->avail_out */
-{
- struct inflate_state FAR *state;
- unsigned char FAR *in; /* local strm->next_in */
- unsigned char FAR *last; /* while in < last, enough input available */
- unsigned char FAR *out; /* local strm->next_out */
- unsigned char FAR *beg; /* inflate()'s initial strm->next_out */
- unsigned char FAR *end; /* while out < end, enough space available */
-#ifdef INFLATE_STRICT
- unsigned dmax; /* maximum distance from zlib header */
-#endif
- unsigned wsize; /* window size or zero if not using window */
- unsigned whave; /* valid bytes in the window */
- unsigned write; /* window write index */
- unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
- unsigned long hold; /* local strm->hold */
- unsigned bits; /* local strm->bits */
- code const FAR *lcode; /* local strm->lencode */
- code const FAR *dcode; /* local strm->distcode */
- unsigned lmask; /* mask for first level of length codes */
- unsigned dmask; /* mask for first level of distance codes */
- code this; /* retrieved table entry */
- unsigned op; /* code bits, operation, extra bits, or */
- /* window position, window bytes to copy */
- unsigned len; /* match length, unused bytes */
- unsigned dist; /* match distance */
- unsigned char FAR *from; /* where to copy match from */
-
- /* copy state to local variables */
- state = (struct inflate_state FAR *)strm->state;
- in = strm->next_in - OFF;
- last = in + (strm->avail_in - 5);
- out = strm->next_out - OFF;
- beg = out - (start - strm->avail_out);
- end = out + (strm->avail_out - 257);
-#ifdef INFLATE_STRICT
- dmax = state->dmax;
-#endif
- wsize = state->wsize;
- whave = state->whave;
- write = state->write;
- window = state->window;
- hold = state->hold;
- bits = state->bits;
- lcode = state->lencode;
- dcode = state->distcode;
- lmask = (1U << state->lenbits) - 1;
- dmask = (1U << state->distbits) - 1;
-
- /* decode literals and length/distances until end-of-block or not enough
- input data or output space */
- do {
- if (bits < 15) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
- this = lcode[hold & lmask];
- dolen:
- op = (unsigned)(this.bits);
- hold >>= op;
- bits -= op;
- op = (unsigned)(this.op);
- if (op == 0) { /* literal */
- Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
- "inflate: literal '%c'\n" :
- "inflate: literal 0x%02x\n", this.val));
- PUP(out) = (unsigned char)(this.val);
- }
- else if (op & 16) { /* length base */
- len = (unsigned)(this.val);
- op &= 15; /* number of extra bits */
- if (op) {
- if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
- len += (unsigned)hold & ((1U << op) - 1);
- hold >>= op;
- bits -= op;
- }
- Tracevv((stderr, "inflate: length %u\n", len));
- if (bits < 15) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
- this = dcode[hold & dmask];
- dodist:
- op = (unsigned)(this.bits);
- hold >>= op;
- bits -= op;
- op = (unsigned)(this.op);
- if (op & 16) { /* distance base */
- dist = (unsigned)(this.val);
- op &= 15; /* number of extra bits */
- if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- if (bits < op) {
- hold += (unsigned long)(PUP(in)) << bits;
- bits += 8;
- }
- }
- dist += (unsigned)hold & ((1U << op) - 1);
-#ifdef INFLATE_STRICT
- if (dist > dmax) {
- strm->msg = (char *)"invalid distance too far back";
- state->mode = BAD;
- break;
- }
-#endif
- hold >>= op;
- bits -= op;
- Tracevv((stderr, "inflate: distance %u\n", dist));
- op = (unsigned)(out - beg); /* max distance in output */
- if (dist > op) { /* see if copy from window */
- op = dist - op; /* distance back in window */
- if (op > whave) {
- strm->msg = (char *)"invalid distance too far back";
- state->mode = BAD;
- break;
- }
- from = window - OFF;
- if (write == 0) { /* very common case */
- from += wsize - op;
- if (op < len) { /* some from window */
- len -= op;
- do {
- PUP(out) = PUP(from);
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
- else if (write < op) { /* wrap around window */
- from += wsize + write - op;
- op -= write;
- if (op < len) { /* some from end of window */
- len -= op;
- do {
- PUP(out) = PUP(from);
- } while (--op);
- from = window - OFF;
- if (write < len) { /* some from start of window */
- op = write;
- len -= op;
- do {
- PUP(out) = PUP(from);
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
- }
- else { /* contiguous in window */
- from += write - op;
- if (op < len) { /* some from window */
- len -= op;
- do {
- PUP(out) = PUP(from);
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
- while (len > 2) {
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- len -= 3;
- }
- if (len) {
- PUP(out) = PUP(from);
- if (len > 1)
- PUP(out) = PUP(from);
- }
- }
- else {
- from = out - dist; /* copy direct from output */
- do { /* minimum length is three */
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- PUP(out) = PUP(from);
- len -= 3;
- } while (len > 2);
- if (len) {
- PUP(out) = PUP(from);
- if (len > 1)
- PUP(out) = PUP(from);
- }
- }
- }
- else if ((op & 64) == 0) { /* 2nd level distance code */
- this = dcode[this.val + (hold & ((1U << op) - 1))];
- goto dodist;
- }
- else {
- strm->msg = (char *)"invalid distance code";
- state->mode = BAD;
- break;
- }
- }
- else if ((op & 64) == 0) { /* 2nd level length code */
- this = lcode[this.val + (hold & ((1U << op) - 1))];
- goto dolen;
- }
- else if (op & 32) { /* end-of-block */
- Tracevv((stderr, "inflate: end of block\n"));
- state->mode = TYPE;
- break;
- }
- else {
- strm->msg = (char *)"invalid literal/length code";
- state->mode = BAD;
- break;
- }
- } while (in < last && out < end);
-
- /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
- len = bits >> 3;
- in -= len;
- bits -= len << 3;
- hold &= (1U << bits) - 1;
-
- /* update state and return */
- strm->next_in = in + OFF;
- strm->next_out = out + OFF;
- strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
- strm->avail_out = (unsigned)(out < end ?
- 257 + (end - out) : 257 - (out - end));
- state->hold = hold;
- state->bits = bits;
- return;
-}
-
-/*
- inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
- - Using bit fields for code structure
- - Different op definition to avoid & for extra bits (do & for table bits)
- - Three separate decoding do-loops for direct, window, and write == 0
- - Special case for distance > 1 copies to do overlapped load and store copy
- - Explicit branch predictions (based on measured branch probabilities)
- - Deferring match copy and interspersed it with decoding subsequent codes
- - Swapping literal/length else
- - Swapping window/direct else
- - Larger unrolled copy loops (three is about right)
- - Moving len -= 3 statement into middle of loop
- */
-
-#endif /* !ASMINF */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffast.h b/sys/contrib/opensolaris/uts/common/zmod/inffast.h
deleted file mode 100644
index 2d214ef..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inffast.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* inffast.h -- header to use inffast.c
- * Copyright (C) 1995-2003 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/* WARNING: this file should *not* be used by applications. It is
- part of the implementation of the compression library and is
- subject to change. Applications should only use zlib.h.
- */
-
-void inflate_fast OF((z_streamp strm, unsigned start));
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inffixed.h b/sys/contrib/opensolaris/uts/common/zmod/inffixed.h
deleted file mode 100644
index ed55df8..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inffixed.h
+++ /dev/null
@@ -1,96 +0,0 @@
- /* inffixed.h -- table for decoding fixed codes
- * Generated automatically by makefixed().
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
- /* WARNING: this file should *not* be used by applications. It
- is part of the implementation of the compression library and
- is subject to change. Applications should only use zlib.h.
- */
-
- static const code lenfix[512] = {
- {96,7,0},{0,8,80},{0,8,16},{20,8,115},{18,7,31},{0,8,112},{0,8,48},
- {0,9,192},{16,7,10},{0,8,96},{0,8,32},{0,9,160},{0,8,0},{0,8,128},
- {0,8,64},{0,9,224},{16,7,6},{0,8,88},{0,8,24},{0,9,144},{19,7,59},
- {0,8,120},{0,8,56},{0,9,208},{17,7,17},{0,8,104},{0,8,40},{0,9,176},
- {0,8,8},{0,8,136},{0,8,72},{0,9,240},{16,7,4},{0,8,84},{0,8,20},
- {21,8,227},{19,7,43},{0,8,116},{0,8,52},{0,9,200},{17,7,13},{0,8,100},
- {0,8,36},{0,9,168},{0,8,4},{0,8,132},{0,8,68},{0,9,232},{16,7,8},
- {0,8,92},{0,8,28},{0,9,152},{20,7,83},{0,8,124},{0,8,60},{0,9,216},
- {18,7,23},{0,8,108},{0,8,44},{0,9,184},{0,8,12},{0,8,140},{0,8,76},
- {0,9,248},{16,7,3},{0,8,82},{0,8,18},{21,8,163},{19,7,35},{0,8,114},
- {0,8,50},{0,9,196},{17,7,11},{0,8,98},{0,8,34},{0,9,164},{0,8,2},
- {0,8,130},{0,8,66},{0,9,228},{16,7,7},{0,8,90},{0,8,26},{0,9,148},
- {20,7,67},{0,8,122},{0,8,58},{0,9,212},{18,7,19},{0,8,106},{0,8,42},
- {0,9,180},{0,8,10},{0,8,138},{0,8,74},{0,9,244},{16,7,5},{0,8,86},
- {0,8,22},{64,8,0},{19,7,51},{0,8,118},{0,8,54},{0,9,204},{17,7,15},
- {0,8,102},{0,8,38},{0,9,172},{0,8,6},{0,8,134},{0,8,70},{0,9,236},
- {16,7,9},{0,8,94},{0,8,30},{0,9,156},{20,7,99},{0,8,126},{0,8,62},
- {0,9,220},{18,7,27},{0,8,110},{0,8,46},{0,9,188},{0,8,14},{0,8,142},
- {0,8,78},{0,9,252},{96,7,0},{0,8,81},{0,8,17},{21,8,131},{18,7,31},
- {0,8,113},{0,8,49},{0,9,194},{16,7,10},{0,8,97},{0,8,33},{0,9,162},
- {0,8,1},{0,8,129},{0,8,65},{0,9,226},{16,7,6},{0,8,89},{0,8,25},
- {0,9,146},{19,7,59},{0,8,121},{0,8,57},{0,9,210},{17,7,17},{0,8,105},
- {0,8,41},{0,9,178},{0,8,9},{0,8,137},{0,8,73},{0,9,242},{16,7,4},
- {0,8,85},{0,8,21},{16,8,258},{19,7,43},{0,8,117},{0,8,53},{0,9,202},
- {17,7,13},{0,8,101},{0,8,37},{0,9,170},{0,8,5},{0,8,133},{0,8,69},
- {0,9,234},{16,7,8},{0,8,93},{0,8,29},{0,9,154},{20,7,83},{0,8,125},
- {0,8,61},{0,9,218},{18,7,23},{0,8,109},{0,8,45},{0,9,186},{0,8,13},
- {0,8,141},{0,8,77},{0,9,250},{16,7,3},{0,8,83},{0,8,19},{21,8,195},
- {19,7,35},{0,8,115},{0,8,51},{0,9,198},{17,7,11},{0,8,99},{0,8,35},
- {0,9,166},{0,8,3},{0,8,131},{0,8,67},{0,9,230},{16,7,7},{0,8,91},
- {0,8,27},{0,9,150},{20,7,67},{0,8,123},{0,8,59},{0,9,214},{18,7,19},
- {0,8,107},{0,8,43},{0,9,182},{0,8,11},{0,8,139},{0,8,75},{0,9,246},
- {16,7,5},{0,8,87},{0,8,23},{64,8,0},{19,7,51},{0,8,119},{0,8,55},
- {0,9,206},{17,7,15},{0,8,103},{0,8,39},{0,9,174},{0,8,7},{0,8,135},
- {0,8,71},{0,9,238},{16,7,9},{0,8,95},{0,8,31},{0,9,158},{20,7,99},
- {0,8,127},{0,8,63},{0,9,222},{18,7,27},{0,8,111},{0,8,47},{0,9,190},
- {0,8,15},{0,8,143},{0,8,79},{0,9,254},{96,7,0},{0,8,80},{0,8,16},
- {20,8,115},{18,7,31},{0,8,112},{0,8,48},{0,9,193},{16,7,10},{0,8,96},
- {0,8,32},{0,9,161},{0,8,0},{0,8,128},{0,8,64},{0,9,225},{16,7,6},
- {0,8,88},{0,8,24},{0,9,145},{19,7,59},{0,8,120},{0,8,56},{0,9,209},
- {17,7,17},{0,8,104},{0,8,40},{0,9,177},{0,8,8},{0,8,136},{0,8,72},
- {0,9,241},{16,7,4},{0,8,84},{0,8,20},{21,8,227},{19,7,43},{0,8,116},
- {0,8,52},{0,9,201},{17,7,13},{0,8,100},{0,8,36},{0,9,169},{0,8,4},
- {0,8,132},{0,8,68},{0,9,233},{16,7,8},{0,8,92},{0,8,28},{0,9,153},
- {20,7,83},{0,8,124},{0,8,60},{0,9,217},{18,7,23},{0,8,108},{0,8,44},
- {0,9,185},{0,8,12},{0,8,140},{0,8,76},{0,9,249},{16,7,3},{0,8,82},
- {0,8,18},{21,8,163},{19,7,35},{0,8,114},{0,8,50},{0,9,197},{17,7,11},
- {0,8,98},{0,8,34},{0,9,165},{0,8,2},{0,8,130},{0,8,66},{0,9,229},
- {16,7,7},{0,8,90},{0,8,26},{0,9,149},{20,7,67},{0,8,122},{0,8,58},
- {0,9,213},{18,7,19},{0,8,106},{0,8,42},{0,9,181},{0,8,10},{0,8,138},
- {0,8,74},{0,9,245},{16,7,5},{0,8,86},{0,8,22},{64,8,0},{19,7,51},
- {0,8,118},{0,8,54},{0,9,205},{17,7,15},{0,8,102},{0,8,38},{0,9,173},
- {0,8,6},{0,8,134},{0,8,70},{0,9,237},{16,7,9},{0,8,94},{0,8,30},
- {0,9,157},{20,7,99},{0,8,126},{0,8,62},{0,9,221},{18,7,27},{0,8,110},
- {0,8,46},{0,9,189},{0,8,14},{0,8,142},{0,8,78},{0,9,253},{96,7,0},
- {0,8,81},{0,8,17},{21,8,131},{18,7,31},{0,8,113},{0,8,49},{0,9,195},
- {16,7,10},{0,8,97},{0,8,33},{0,9,163},{0,8,1},{0,8,129},{0,8,65},
- {0,9,227},{16,7,6},{0,8,89},{0,8,25},{0,9,147},{19,7,59},{0,8,121},
- {0,8,57},{0,9,211},{17,7,17},{0,8,105},{0,8,41},{0,9,179},{0,8,9},
- {0,8,137},{0,8,73},{0,9,243},{16,7,4},{0,8,85},{0,8,21},{16,8,258},
- {19,7,43},{0,8,117},{0,8,53},{0,9,203},{17,7,13},{0,8,101},{0,8,37},
- {0,9,171},{0,8,5},{0,8,133},{0,8,69},{0,9,235},{16,7,8},{0,8,93},
- {0,8,29},{0,9,155},{20,7,83},{0,8,125},{0,8,61},{0,9,219},{18,7,23},
- {0,8,109},{0,8,45},{0,9,187},{0,8,13},{0,8,141},{0,8,77},{0,9,251},
- {16,7,3},{0,8,83},{0,8,19},{21,8,195},{19,7,35},{0,8,115},{0,8,51},
- {0,9,199},{17,7,11},{0,8,99},{0,8,35},{0,9,167},{0,8,3},{0,8,131},
- {0,8,67},{0,9,231},{16,7,7},{0,8,91},{0,8,27},{0,9,151},{20,7,67},
- {0,8,123},{0,8,59},{0,9,215},{18,7,19},{0,8,107},{0,8,43},{0,9,183},
- {0,8,11},{0,8,139},{0,8,75},{0,9,247},{16,7,5},{0,8,87},{0,8,23},
- {64,8,0},{19,7,51},{0,8,119},{0,8,55},{0,9,207},{17,7,15},{0,8,103},
- {0,8,39},{0,9,175},{0,8,7},{0,8,135},{0,8,71},{0,9,239},{16,7,9},
- {0,8,95},{0,8,31},{0,9,159},{20,7,99},{0,8,127},{0,8,63},{0,9,223},
- {18,7,27},{0,8,111},{0,8,47},{0,9,191},{0,8,15},{0,8,143},{0,8,79},
- {0,9,255}
- };
-
- static const code distfix[32] = {
- {16,5,1},{23,5,257},{19,5,17},{27,5,4097},{17,5,5},{25,5,1025},
- {21,5,65},{29,5,16385},{16,5,3},{24,5,513},{20,5,33},{28,5,8193},
- {18,5,9},{26,5,2049},{22,5,129},{64,5,0},{16,5,2},{23,5,385},
- {19,5,25},{27,5,6145},{17,5,7},{25,5,1537},{21,5,97},{29,5,24577},
- {16,5,4},{24,5,769},{20,5,49},{28,5,12289},{18,5,13},{26,5,3073},
- {22,5,193},{64,5,0}
- };
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inflate.c b/sys/contrib/opensolaris/uts/common/zmod/inflate.c
deleted file mode 100644
index 023e7a1..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inflate.c
+++ /dev/null
@@ -1,1395 +0,0 @@
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* inflate.c -- zlib decompression
- * Copyright (C) 1995-2005 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * Change history:
- *
- * 1.2.beta0 24 Nov 2002
- * - First version -- complete rewrite of inflate to simplify code, avoid
- * creation of window when not needed, minimize use of window when it is
- * needed, make inffast.c even faster, implement gzip decoding, and to
- * improve code readability and style over the previous zlib inflate code
- *
- * 1.2.beta1 25 Nov 2002
- * - Use pointers for available input and output checking in inffast.c
- * - Remove input and output counters in inffast.c
- * - Change inffast.c entry and loop from avail_in >= 7 to >= 6
- * - Remove unnecessary second byte pull from length extra in inffast.c
- * - Unroll direct copy to three copies per loop in inffast.c
- *
- * 1.2.beta2 4 Dec 2002
- * - Change external routine names to reduce potential conflicts
- * - Correct filename to inffixed.h for fixed tables in inflate.c
- * - Make hbuf[] unsigned char to match parameter type in inflate.c
- * - Change strm->next_out[-state->offset] to *(strm->next_out - state->offset)
- * to avoid negation problem on Alphas (64 bit) in inflate.c
- *
- * 1.2.beta3 22 Dec 2002
- * - Add comments on state->bits assertion in inffast.c
- * - Add comments on op field in inftrees.h
- * - Fix bug in reuse of allocated window after inflateReset()
- * - Remove bit fields--back to byte structure for speed
- * - Remove distance extra == 0 check in inflate_fast()--only helps for lengths
- * - Change post-increments to pre-increments in inflate_fast(), PPC biased?
- * - Add compile time option, POSTINC, to use post-increments instead (Intel?)
- * - Make MATCH copy in inflate() much faster for when inflate_fast() not used
- * - Use local copies of stream next and avail values, as well as local bit
- * buffer and bit count in inflate()--for speed when inflate_fast() not used
- *
- * 1.2.beta4 1 Jan 2003
- * - Split ptr - 257 statements in inflate_table() to avoid compiler warnings
- * - Move a comment on output buffer sizes from inffast.c to inflate.c
- * - Add comments in inffast.c to introduce the inflate_fast() routine
- * - Rearrange window copies in inflate_fast() for speed and simplification
- * - Unroll last copy for window match in inflate_fast()
- * - Use local copies of window variables in inflate_fast() for speed
- * - Pull out common write == 0 case for speed in inflate_fast()
- * - Make op and len in inflate_fast() unsigned for consistency
- * - Add FAR to lcode and dcode declarations in inflate_fast()
- * - Simplified bad distance check in inflate_fast()
- * - Added inflateBackInit(), inflateBack(), and inflateBackEnd() in new
- * source file infback.c to provide a call-back interface to inflate for
- * programs like gzip and unzip -- uses window as output buffer to avoid
- * window copying
- *
- * 1.2.beta5 1 Jan 2003
- * - Improved inflateBack() interface to allow the caller to provide initial
- * input in strm.
- * - Fixed stored blocks bug in inflateBack()
- *
- * 1.2.beta6 4 Jan 2003
- * - Added comments in inffast.c on effectiveness of POSTINC
- * - Typecasting all around to reduce compiler warnings
- * - Changed loops from while (1) or do {} while (1) to for (;;), again to
- * make compilers happy
- * - Changed type of window in inflateBackInit() to unsigned char *
- *
- * 1.2.beta7 27 Jan 2003
- * - Changed many types to unsigned or unsigned short to avoid warnings
- * - Added inflateCopy() function
- *
- * 1.2.0 9 Mar 2003
- * - Changed inflateBack() interface to provide separate opaque descriptors
- * for the in() and out() functions
- * - Changed inflateBack() argument and in_func typedef to swap the length
- * and buffer address return values for the input function
- * - Check next_in and next_out for Z_NULL on entry to inflate()
- *
- * The history for versions after 1.2.0 are in ChangeLog in zlib distribution.
- */
-
-#include "zutil.h"
-#include "inftrees.h"
-#include "inflate.h"
-#include "inffast.h"
-
-#ifdef MAKEFIXED
-# ifndef BUILDFIXED
-# define BUILDFIXED
-# endif
-#endif
-
-/* function prototypes */
-local void fixedtables OF((struct inflate_state FAR *state));
-local int updatewindow OF((z_streamp strm, unsigned out));
-#ifdef BUILDFIXED
- void makefixed OF((void));
-#endif
-local unsigned syncsearch OF((unsigned FAR *have, unsigned char FAR *buf,
- unsigned len));
-
-int ZEXPORT inflateReset(strm)
-z_streamp strm;
-{
- struct inflate_state FAR *state;
-
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- strm->total_in = strm->total_out = state->total = 0;
- strm->msg = Z_NULL;
- strm->adler = 1; /* to support ill-conceived Java test suite */
- state->mode = HEAD;
- state->last = 0;
- state->havedict = 0;
- state->dmax = 32768U;
- state->head = Z_NULL;
- state->wsize = 0;
- state->whave = 0;
- state->write = 0;
- state->hold = 0;
- state->bits = 0;
- state->lencode = state->distcode = state->next = state->codes;
- Tracev((stderr, "inflate: reset\n"));
- return Z_OK;
-}
-
-int ZEXPORT inflatePrime(strm, bits, value)
-z_streamp strm;
-int bits;
-int value;
-{
- struct inflate_state FAR *state;
-
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR;
- value &= (1L << bits) - 1;
- state->hold += value << state->bits;
- state->bits += bits;
- return Z_OK;
-}
-
-int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size)
-z_streamp strm;
-int windowBits;
-const char *version;
-int stream_size;
-{
- struct inflate_state FAR *state;
-
- if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
- stream_size != (int)(sizeof(z_stream)))
- return Z_VERSION_ERROR;
- if (strm == Z_NULL) return Z_STREAM_ERROR;
- strm->msg = Z_NULL; /* in case we return an error */
- if (strm->zalloc == (alloc_func)0) {
- strm->zalloc = zcalloc;
- strm->opaque = (voidpf)0;
- }
- if (strm->zfree == (free_func)0) strm->zfree = zcfree;
- state = (struct inflate_state FAR *)
- ZALLOC(strm, 1, sizeof(struct inflate_state));
- if (state == Z_NULL) return Z_MEM_ERROR;
- Tracev((stderr, "inflate: allocated\n"));
- strm->state = (struct internal_state FAR *)state;
- if (windowBits < 0) {
- state->wrap = 0;
- windowBits = -windowBits;
- }
- else {
- state->wrap = (windowBits >> 4) + 1;
-#ifdef GUNZIP
- if (windowBits < 48) windowBits &= 15;
-#endif
- }
- if (windowBits < 8 || windowBits > 15) {
- ZFREE(strm, state);
- strm->state = Z_NULL;
- return Z_STREAM_ERROR;
- }
- state->wbits = (unsigned)windowBits;
- state->window = Z_NULL;
- return inflateReset(strm);
-}
-
-int ZEXPORT inflateInit_(strm, version, stream_size)
-z_streamp strm;
-const char *version;
-int stream_size;
-{
- return inflateInit2_(strm, DEF_WBITS, version, stream_size);
-}
-
-/*
- Return state with length and distance decoding tables and index sizes set to
- fixed code decoding. Normally this returns fixed tables from inffixed.h.
- If BUILDFIXED is defined, then instead this routine builds the tables the
- first time it's called, and returns those tables the first time and
- thereafter. This reduces the size of the code by about 2K bytes, in
- exchange for a little execution time. However, BUILDFIXED should not be
- used for threaded applications, since the rewriting of the tables and virgin
- may not be thread-safe.
- */
-local void fixedtables(state)
-struct inflate_state FAR *state;
-{
-#ifdef BUILDFIXED
- static int virgin = 1;
- static code *lenfix, *distfix;
- static code fixed[544];
-
- /* build fixed huffman tables if first call (may not be thread safe) */
- if (virgin) {
- unsigned sym, bits;
- static code *next;
-
- /* literal/length table */
- sym = 0;
- while (sym < 144) state->lens[sym++] = 8;
- while (sym < 256) state->lens[sym++] = 9;
- while (sym < 280) state->lens[sym++] = 7;
- while (sym < 288) state->lens[sym++] = 8;
- next = fixed;
- lenfix = next;
- bits = 9;
- inflate_table(LENS, state->lens, 288, &(next), &(bits), state->work);
-
- /* distance table */
- sym = 0;
- while (sym < 32) state->lens[sym++] = 5;
- distfix = next;
- bits = 5;
- inflate_table(DISTS, state->lens, 32, &(next), &(bits), state->work);
-
- /* do this just once */
- virgin = 0;
- }
-#else /* !BUILDFIXED */
-# include "inffixed.h"
-#endif /* BUILDFIXED */
- state->lencode = lenfix;
- state->lenbits = 9;
- state->distcode = distfix;
- state->distbits = 5;
-}
-
-#ifdef MAKEFIXED
-#include <stdio.h>
-
-/*
- Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also
- defines BUILDFIXED, so the tables are built on the fly. makefixed() writes
- those tables to stdout, which would be piped to inffixed.h. A small program
- can simply call makefixed to do this:
-
- void makefixed(void);
-
- int main(void)
- {
- makefixed();
- return 0;
- }
-
- Then that can be linked with zlib built with MAKEFIXED defined and run:
-
- a.out > inffixed.h
- */
-void makefixed()
-{
- unsigned low, size;
- struct inflate_state state;
-
- fixedtables(&state);
- puts(" /* inffixed.h -- table for decoding fixed codes");
- puts(" * Generated automatically by makefixed().");
- puts(" */");
- puts("");
- puts(" /* WARNING: this file should *not* be used by applications.");
- puts(" It is part of the implementation of this library and is");
- puts(" subject to change. Applications should only use zlib.h.");
- puts(" */");
- puts("");
- size = 1U << 9;
- printf(" static const code lenfix[%u] = {", size);
- low = 0;
- for (;;) {
- if ((low % 7) == 0) printf("\n ");
- printf("{%u,%u,%d}", state.lencode[low].op, state.lencode[low].bits,
- state.lencode[low].val);
- if (++low == size) break;
- putchar(',');
- }
- puts("\n };");
- size = 1U << 5;
- printf("\n static const code distfix[%u] = {", size);
- low = 0;
- for (;;) {
- if ((low % 6) == 0) printf("\n ");
- printf("{%u,%u,%d}", state.distcode[low].op, state.distcode[low].bits,
- state.distcode[low].val);
- if (++low == size) break;
- putchar(',');
- }
- puts("\n };");
-}
-#endif /* MAKEFIXED */
-
-/*
- Update the window with the last wsize (normally 32K) bytes written before
- returning. If window does not exist yet, create it. This is only called
- when a window is already in use, or when output has been written during this
- inflate call, but the end of the deflate stream has not been reached yet.
- It is also called to create a window for dictionary data when a dictionary
- is loaded.
-
- Providing output buffers larger than 32K to inflate() should provide a speed
- advantage, since only the last 32K of output is copied to the sliding window
- upon return from inflate(), and since all distances after the first 32K of
- output will fall in the output data, making match copies simpler and faster.
- The advantage may be dependent on the size of the processor's data caches.
- */
-local int updatewindow(strm, out)
-z_streamp strm;
-unsigned out;
-{
- struct inflate_state FAR *state;
- unsigned copy, dist;
-
- state = (struct inflate_state FAR *)strm->state;
-
- /* if it hasn't been done already, allocate space for the window */
- if (state->window == Z_NULL) {
- state->window = (unsigned char FAR *)
- ZALLOC(strm, 1U << state->wbits,
- sizeof(unsigned char));
- if (state->window == Z_NULL) return 1;
- }
-
- /* if window not in use yet, initialize */
- if (state->wsize == 0) {
- state->wsize = 1U << state->wbits;
- state->write = 0;
- state->whave = 0;
- }
-
- /* copy state->wsize or less output bytes into the circular window */
- copy = out - strm->avail_out;
- if (copy >= state->wsize) {
- zmemcpy(state->window, strm->next_out - state->wsize, state->wsize);
- state->write = 0;
- state->whave = state->wsize;
- }
- else {
- dist = state->wsize - state->write;
- if (dist > copy) dist = copy;
- zmemcpy(state->window + state->write, strm->next_out - copy, dist);
- copy -= dist;
- if (copy) {
- zmemcpy(state->window, strm->next_out - copy, copy);
- state->write = copy;
- state->whave = state->wsize;
- }
- else {
- state->write += dist;
- if (state->write == state->wsize) state->write = 0;
- if (state->whave < state->wsize) state->whave += dist;
- }
- }
- return 0;
-}
-
-/* Macros for inflate(): */
-
-/* check function to use adler32() for zlib or crc32() for gzip */
-#ifdef GUNZIP
-# define UPDATE(check, buf, len) \
- (state->flags ? crc32(check, buf, len) : adler32(check, buf, len))
-#else
-# define UPDATE(check, buf, len) adler32(check, buf, len)
-#endif
-
-/* check macros for header crc */
-#ifdef GUNZIP
-# define CRC2(check, word) \
- do { \
- hbuf[0] = (unsigned char)(word); \
- hbuf[1] = (unsigned char)((word) >> 8); \
- check = crc32(check, hbuf, 2); \
- } while (0)
-
-# define CRC4(check, word) \
- do { \
- hbuf[0] = (unsigned char)(word); \
- hbuf[1] = (unsigned char)((word) >> 8); \
- hbuf[2] = (unsigned char)((word) >> 16); \
- hbuf[3] = (unsigned char)((word) >> 24); \
- check = crc32(check, hbuf, 4); \
- } while (0)
-#endif
-
-/* Load registers with state in inflate() for speed */
-#define LOAD() \
- do { \
- put = strm->next_out; \
- left = strm->avail_out; \
- next = strm->next_in; \
- have = strm->avail_in; \
- hold = state->hold; \
- bits = state->bits; \
- } while (0)
-
-/* Restore state from registers in inflate() */
-#define RESTORE() \
- do { \
- strm->next_out = put; \
- strm->avail_out = left; \
- strm->next_in = next; \
- strm->avail_in = have; \
- state->hold = hold; \
- state->bits = bits; \
- } while (0)
-
-/* Clear the input bit accumulator */
-#define INITBITS() \
- do { \
- hold = 0; \
- bits = 0; \
- } while (0)
-
-/* Get a byte of input into the bit accumulator, or return from inflate()
- if there is no input available. */
-#define PULLBYTE() \
- do { \
- if (have == 0) goto inf_leave; \
- have--; \
- hold += (unsigned long)(*next++) << bits; \
- bits += 8; \
- } while (0)
-
-/* Assure that there are at least n bits in the bit accumulator. If there is
- not enough available input to do that, then return from inflate(). */
-#define NEEDBITS(n) \
- do { \
- while (bits < (unsigned)(n)) \
- PULLBYTE(); \
- } while (0)
-
-/* Return the low n bits of the bit accumulator (n < 16) */
-#define BITS(n) \
- ((unsigned)hold & ((1U << (n)) - 1))
-
-/* Remove n bits from the bit accumulator */
-#define DROPBITS(n) \
- do { \
- hold >>= (n); \
- bits -= (unsigned)(n); \
- } while (0)
-
-/* Remove zero to seven bits as needed to go to a byte boundary */
-#define BYTEBITS() \
- do { \
- hold >>= bits & 7; \
- bits -= bits & 7; \
- } while (0)
-
-/* Reverse the bytes in a 32-bit value */
-#define REVERSE(q) \
- ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
- (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
-
-/*
- inflate() uses a state machine to process as much input data and generate as
- much output data as possible before returning. The state machine is
- structured roughly as follows:
-
- for (;;) switch (state) {
- ...
- case STATEn:
- if (not enough input data or output space to make progress)
- return;
- ... make progress ...
- state = STATEm;
- break;
- ...
- }
-
- so when inflate() is called again, the same case is attempted again, and
- if the appropriate resources are provided, the machine proceeds to the
- next state. The NEEDBITS() macro is usually the way the state evaluates
- whether it can proceed or should return. NEEDBITS() does the return if
- the requested bits are not available. The typical use of the BITS macros
- is:
-
- NEEDBITS(n);
- ... do something with BITS(n) ...
- DROPBITS(n);
-
- where NEEDBITS(n) either returns from inflate() if there isn't enough
- input left to load n bits into the accumulator, or it continues. BITS(n)
- gives the low n bits in the accumulator. When done, DROPBITS(n) drops
- the low n bits off the accumulator. INITBITS() clears the accumulator
- and sets the number of available bits to zero. BYTEBITS() discards just
- enough bits to put the accumulator on a byte boundary. After BYTEBITS()
- and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
-
- NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
- if there is no input available. The decoding of variable length codes uses
- PULLBYTE() directly in order to pull just enough bytes to decode the next
- code, and no more.
-
- Some states loop until they get enough input, making sure that enough
- state information is maintained to continue the loop where it left off
- if NEEDBITS() returns in the loop. For example, want, need, and keep
- would all have to actually be part of the saved state in case NEEDBITS()
- returns:
-
- case STATEw:
- while (want < need) {
- NEEDBITS(n);
- keep[want++] = BITS(n);
- DROPBITS(n);
- }
- state = STATEx;
- case STATEx:
-
- As shown above, if the next state is also the next case, then the break
- is omitted.
-
- A state may also return if there is not enough output space available to
- complete that state. Those states are copying stored data, writing a
- literal byte, and copying a matching string.
-
- When returning, a "goto inf_leave" is used to update the total counters,
- update the check value, and determine whether any progress has been made
- during that inflate() call in order to return the proper return code.
- Progress is defined as a change in either strm->avail_in or strm->avail_out.
- When there is a window, goto inf_leave will update the window with the last
- output written. If a goto inf_leave occurs in the middle of decompression
- and there is no window currently, goto inf_leave will create one and copy
- output to the window for the next call of inflate().
-
- In this implementation, the flush parameter of inflate() only affects the
- return code (per zlib.h). inflate() always writes as much as possible to
- strm->next_out, given the space available and the provided input--the effect
- documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers
- the allocation of and copying into a sliding window until necessary, which
- provides the effect documented in zlib.h for Z_FINISH when the entire input
- stream available. So the only thing the flush parameter actually does is:
- when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it
- will return Z_BUF_ERROR if it has not reached the end of the stream.
- */
-
-int ZEXPORT inflate(strm, flush)
-z_streamp strm;
-int flush;
-{
- struct inflate_state FAR *state;
- unsigned char FAR *next; /* next input */
- unsigned char FAR *put; /* next output */
- unsigned have, left; /* available input and output */
- unsigned long hold; /* bit buffer */
- unsigned bits; /* bits in bit buffer */
- unsigned in, out; /* save starting available input and output */
- unsigned copy; /* number of stored or match bytes to copy */
- unsigned char FAR *from; /* where to copy match bytes from */
- code this; /* current decoding table entry */
- code last; /* parent table entry */
- unsigned len; /* length to copy for repeats, bits to drop */
- int ret; /* return code */
-#ifdef GUNZIP
- unsigned char hbuf[4]; /* buffer for gzip header crc calculation */
-#endif
- static const unsigned short order[19] = /* permutation of code lengths */
- {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
-
- if (strm == Z_NULL || strm->state == Z_NULL || strm->next_out == Z_NULL ||
- (strm->next_in == Z_NULL && strm->avail_in != 0))
- return Z_STREAM_ERROR;
-
- state = (struct inflate_state FAR *)strm->state;
- if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */
- LOAD();
- in = have;
- out = left;
- ret = Z_OK;
- for (;;)
- switch (state->mode) {
- case HEAD:
- if (state->wrap == 0) {
- state->mode = TYPEDO;
- break;
- }
- NEEDBITS(16);
-#ifdef GUNZIP
- if ((state->wrap & 2) && hold == 0x8b1f) { /* gzip header */
- state->check = crc32(0L, Z_NULL, 0);
- CRC2(state->check, hold);
- INITBITS();
- state->mode = FLAGS;
- break;
- }
- state->flags = 0; /* expect zlib header */
- if (state->head != Z_NULL)
- state->head->done = -1;
- if (!(state->wrap & 1) || /* check if zlib header allowed */
-#else
- if (
-#endif
- ((BITS(8) << 8) + (hold >> 8)) % 31) {
- strm->msg = (char *)"incorrect header check";
- state->mode = BAD;
- break;
- }
- if (BITS(4) != Z_DEFLATED) {
- strm->msg = (char *)"unknown compression method";
- state->mode = BAD;
- break;
- }
- DROPBITS(4);
- len = BITS(4) + 8;
- if (len > state->wbits) {
- strm->msg = (char *)"invalid window size";
- state->mode = BAD;
- break;
- }
- state->dmax = 1U << len;
- Tracev((stderr, "inflate: zlib header ok\n"));
- strm->adler = state->check = adler32(0L, Z_NULL, 0);
- state->mode = hold & 0x200 ? DICTID : TYPE;
- INITBITS();
- break;
-#ifdef GUNZIP
- case FLAGS:
- NEEDBITS(16);
- state->flags = (int)(hold);
- if ((state->flags & 0xff) != Z_DEFLATED) {
- strm->msg = (char *)"unknown compression method";
- state->mode = BAD;
- break;
- }
- if (state->flags & 0xe000) {
- strm->msg = (char *)"unknown header flags set";
- state->mode = BAD;
- break;
- }
- if (state->head != Z_NULL)
- state->head->text = (int)((hold >> 8) & 1);
- if (state->flags & 0x0200) CRC2(state->check, hold);
- INITBITS();
- state->mode = TIME;
- /*FALLTHRU*/
- case TIME:
- NEEDBITS(32);
- if (state->head != Z_NULL)
- state->head->time = hold;
- if (state->flags & 0x0200) CRC4(state->check, hold);
- INITBITS();
- state->mode = OS;
- /*FALLTHRU*/
- case OS:
- NEEDBITS(16);
- if (state->head != Z_NULL) {
- state->head->xflags = (int)(hold & 0xff);
- state->head->os = (int)(hold >> 8);
- }
- if (state->flags & 0x0200) CRC2(state->check, hold);
- INITBITS();
- state->mode = EXLEN;
- /*FALLTHRU*/
- case EXLEN:
- if (state->flags & 0x0400) {
- NEEDBITS(16);
- state->length = (unsigned)(hold);
- if (state->head != Z_NULL)
- state->head->extra_len = (unsigned)hold;
- if (state->flags & 0x0200) CRC2(state->check, hold);
- INITBITS();
- }
- else if (state->head != Z_NULL)
- state->head->extra = Z_NULL;
- state->mode = EXTRA;
- /*FALLTHRU*/
- case EXTRA:
- if (state->flags & 0x0400) {
- copy = state->length;
- if (copy > have) copy = have;
- if (copy) {
- if (state->head != Z_NULL &&
- state->head->extra != Z_NULL) {
- len = state->head->extra_len - state->length;
- zmemcpy(state->head->extra + len, next,
- len + copy > state->head->extra_max ?
- state->head->extra_max - len : copy);
- }
- if (state->flags & 0x0200)
- state->check = crc32(state->check, next, copy);
- have -= copy;
- next += copy;
- state->length -= copy;
- }
- if (state->length) goto inf_leave;
- }
- state->length = 0;
- state->mode = NAME;
- /*FALLTHRU*/
- case NAME:
- if (state->flags & 0x0800) {
- if (have == 0) goto inf_leave;
- copy = 0;
- do {
- len = (unsigned)(next[copy++]);
- if (state->head != Z_NULL &&
- state->head->name != Z_NULL &&
- state->length < state->head->name_max)
- state->head->name[state->length++] = len;
- } while (len && copy < have);
- if (state->flags & 0x0200)
- state->check = crc32(state->check, next, copy);
- have -= copy;
- next += copy;
- if (len) goto inf_leave;
- }
- else if (state->head != Z_NULL)
- state->head->name = Z_NULL;
- state->length = 0;
- state->mode = COMMENT;
- /*FALLTHRU*/
- case COMMENT:
- if (state->flags & 0x1000) {
- if (have == 0) goto inf_leave;
- copy = 0;
- do {
- len = (unsigned)(next[copy++]);
- if (state->head != Z_NULL &&
- state->head->comment != Z_NULL &&
- state->length < state->head->comm_max)
- state->head->comment[state->length++] = len;
- } while (len && copy < have);
- if (state->flags & 0x0200)
- state->check = crc32(state->check, next, copy);
- have -= copy;
- next += copy;
- if (len) goto inf_leave;
- }
- else if (state->head != Z_NULL)
- state->head->comment = Z_NULL;
- state->mode = HCRC;
- /*FALLTHRU*/
- case HCRC:
- if (state->flags & 0x0200) {
- NEEDBITS(16);
- if (hold != (state->check & 0xffff)) {
- strm->msg = (char *)"header crc mismatch";
- state->mode = BAD;
- break;
- }
- INITBITS();
- }
- if (state->head != Z_NULL) {
- state->head->hcrc = (int)((state->flags >> 9) & 1);
- state->head->done = 1;
- }
- strm->adler = state->check = crc32(0L, Z_NULL, 0);
- state->mode = TYPE;
- break;
-#endif
- case DICTID:
- NEEDBITS(32);
- strm->adler = state->check = REVERSE(hold);
- INITBITS();
- state->mode = DICT;
- /*FALLTHRU*/
- case DICT:
- if (state->havedict == 0) {
- RESTORE();
- return Z_NEED_DICT;
- }
- strm->adler = state->check = adler32(0L, Z_NULL, 0);
- state->mode = TYPE;
- /*FALLTHRU*/
- case TYPE:
- if (flush == Z_BLOCK) goto inf_leave;
- /*FALLTHRU*/
- case TYPEDO:
- if (state->last) {
- BYTEBITS();
- state->mode = CHECK;
- break;
- }
- NEEDBITS(3);
- state->last = BITS(1);
- DROPBITS(1);
- switch (BITS(2)) {
- case 0: /* stored block */
- Tracev((stderr, "inflate: stored block%s\n",
- state->last ? " (last)" : ""));
- state->mode = STORED;
- break;
- case 1: /* fixed block */
- fixedtables(state);
- Tracev((stderr, "inflate: fixed codes block%s\n",
- state->last ? " (last)" : ""));
- state->mode = LEN; /* decode codes */
- break;
- case 2: /* dynamic block */
- Tracev((stderr, "inflate: dynamic codes block%s\n",
- state->last ? " (last)" : ""));
- state->mode = TABLE;
- break;
- case 3:
- strm->msg = (char *)"invalid block type";
- state->mode = BAD;
- }
- DROPBITS(2);
- break;
- case STORED:
- BYTEBITS(); /* go to byte boundary */
- NEEDBITS(32);
- if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
- strm->msg = (char *)"invalid stored block lengths";
- state->mode = BAD;
- break;
- }
- state->length = (unsigned)hold & 0xffff;
- Tracev((stderr, "inflate: stored length %u\n",
- state->length));
- INITBITS();
- state->mode = COPY;
- /*FALLTHRU*/
- case COPY:
- copy = state->length;
- if (copy) {
- if (copy > have) copy = have;
- if (copy > left) copy = left;
- if (copy == 0) goto inf_leave;
- zmemcpy(put, next, copy);
- have -= copy;
- next += copy;
- left -= copy;
- put += copy;
- state->length -= copy;
- break;
- }
- Tracev((stderr, "inflate: stored end\n"));
- state->mode = TYPE;
- break;
- case TABLE:
- NEEDBITS(14);
- state->nlen = BITS(5) + 257;
- DROPBITS(5);
- state->ndist = BITS(5) + 1;
- DROPBITS(5);
- state->ncode = BITS(4) + 4;
- DROPBITS(4);
-#ifndef PKZIP_BUG_WORKAROUND
- if (state->nlen > 286 || state->ndist > 30) {
- strm->msg = (char *)"too many length or distance symbols";
- state->mode = BAD;
- break;
- }
-#endif
- Tracev((stderr, "inflate: table sizes ok\n"));
- state->have = 0;
- state->mode = LENLENS;
- /*FALLTHRU*/
- case LENLENS:
- while (state->have < state->ncode) {
- NEEDBITS(3);
- state->lens[order[state->have++]] = (unsigned short)BITS(3);
- DROPBITS(3);
- }
- while (state->have < 19)
- state->lens[order[state->have++]] = 0;
- state->next = state->codes;
- state->lencode = (code const FAR *)(state->next);
- state->lenbits = 7;
- ret = inflate_table(CODES, state->lens, 19, &(state->next),
- &(state->lenbits), state->work);
- if (ret) {
- strm->msg = (char *)"invalid code lengths set";
- state->mode = BAD;
- break;
- }
- Tracev((stderr, "inflate: code lengths ok\n"));
- state->have = 0;
- state->mode = CODELENS;
- /*FALLTHRU*/
- case CODELENS:
- while (state->have < state->nlen + state->ndist) {
- for (;;) {
- this = state->lencode[BITS(state->lenbits)];
- if ((unsigned)(this.bits) <= bits) break;
- PULLBYTE();
- }
- if (this.val < 16) {
- NEEDBITS(this.bits);
- DROPBITS(this.bits);
- state->lens[state->have++] = this.val;
- }
- else {
- if (this.val == 16) {
- NEEDBITS(this.bits + 2);
- DROPBITS(this.bits);
- if (state->have == 0) {
- strm->msg = (char *)"invalid bit length repeat";
- state->mode = BAD;
- break;
- }
- len = state->lens[state->have - 1];
- copy = 3 + BITS(2);
- DROPBITS(2);
- }
- else if (this.val == 17) {
- NEEDBITS(this.bits + 3);
- DROPBITS(this.bits);
- len = 0;
- copy = 3 + BITS(3);
- DROPBITS(3);
- }
- else {
- NEEDBITS(this.bits + 7);
- DROPBITS(this.bits);
- len = 0;
- copy = 11 + BITS(7);
- DROPBITS(7);
- }
- if (state->have + copy > state->nlen + state->ndist) {
- strm->msg = (char *)"invalid bit length repeat";
- state->mode = BAD;
- break;
- }
- while (copy--)
- state->lens[state->have++] = (unsigned short)len;
- }
- }
-
- /* handle error breaks in while */
- if (state->mode == BAD) break;
-
- /* build code tables */
- state->next = state->codes;
- state->lencode = (code const FAR *)(state->next);
- state->lenbits = 9;
- ret = inflate_table(LENS, state->lens, state->nlen, &(state->next),
- &(state->lenbits), state->work);
- if (ret) {
- strm->msg = (char *)"invalid literal/lengths set";
- state->mode = BAD;
- break;
- }
- state->distcode = (code const FAR *)(state->next);
- state->distbits = 6;
- ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist,
- &(state->next), &(state->distbits), state->work);
- if (ret) {
- strm->msg = (char *)"invalid distances set";
- state->mode = BAD;
- break;
- }
- Tracev((stderr, "inflate: codes ok\n"));
- state->mode = LEN;
- /*FALLTHRU*/
- case LEN:
- if (have >= 6 && left >= 258) {
- RESTORE();
- inflate_fast(strm, out);
- LOAD();
- break;
- }
- for (;;) {
- this = state->lencode[BITS(state->lenbits)];
- if ((unsigned)(this.bits) <= bits) break;
- PULLBYTE();
- }
- if (this.op && (this.op & 0xf0) == 0) {
- last = this;
- for (;;) {
- this = state->lencode[last.val +
- (BITS(last.bits + last.op) >> last.bits)];
- if ((unsigned)(last.bits + this.bits) <= bits) break;
- PULLBYTE();
- }
- DROPBITS(last.bits);
- }
- DROPBITS(this.bits);
- state->length = (unsigned)this.val;
- if ((int)(this.op) == 0) {
- Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
- "inflate: literal '%c'\n" :
- "inflate: literal 0x%02x\n", this.val));
- state->mode = LIT;
- break;
- }
- if (this.op & 32) {
- Tracevv((stderr, "inflate: end of block\n"));
- state->mode = TYPE;
- break;
- }
- if (this.op & 64) {
- strm->msg = (char *)"invalid literal/length code";
- state->mode = BAD;
- break;
- }
- state->extra = (unsigned)(this.op) & 15;
- state->mode = LENEXT;
- /*FALLTHRU*/
- case LENEXT:
- if (state->extra) {
- NEEDBITS(state->extra);
- state->length += BITS(state->extra);
- DROPBITS(state->extra);
- }
- Tracevv((stderr, "inflate: length %u\n", state->length));
- state->mode = DIST;
- /*FALLTHRU*/
- case DIST:
- for (;;) {
- this = state->distcode[BITS(state->distbits)];
- if ((unsigned)(this.bits) <= bits) break;
- PULLBYTE();
- }
- if ((this.op & 0xf0) == 0) {
- last = this;
- for (;;) {
- this = state->distcode[last.val +
- (BITS(last.bits + last.op) >> last.bits)];
- if ((unsigned)(last.bits + this.bits) <= bits) break;
- PULLBYTE();
- }
- DROPBITS(last.bits);
- }
- DROPBITS(this.bits);
- if (this.op & 64) {
- strm->msg = (char *)"invalid distance code";
- state->mode = BAD;
- break;
- }
- state->offset = (unsigned)this.val;
- state->extra = (unsigned)(this.op) & 15;
- state->mode = DISTEXT;
- /*FALLTHRU*/
- case DISTEXT:
- if (state->extra) {
- NEEDBITS(state->extra);
- state->offset += BITS(state->extra);
- DROPBITS(state->extra);
- }
-#ifdef INFLATE_STRICT
- if (state->offset > state->dmax) {
- strm->msg = (char *)"invalid distance too far back";
- state->mode = BAD;
- break;
- }
-#endif
- if (state->offset > state->whave + out - left) {
- strm->msg = (char *)"invalid distance too far back";
- state->mode = BAD;
- break;
- }
- Tracevv((stderr, "inflate: distance %u\n", state->offset));
- state->mode = MATCH;
- /*FALLTHRU*/
- case MATCH:
- if (left == 0) goto inf_leave;
- copy = out - left;
- if (state->offset > copy) { /* copy from window */
- copy = state->offset - copy;
- if (copy > state->write) {
- copy -= state->write;
- from = state->window + (state->wsize - copy);
- }
- else
- from = state->window + (state->write - copy);
- if (copy > state->length) copy = state->length;
- }
- else { /* copy from output */
- from = put - state->offset;
- copy = state->length;
- }
- if (copy > left) copy = left;
- left -= copy;
- state->length -= copy;
- do {
- *put++ = *from++;
- } while (--copy);
- if (state->length == 0) state->mode = LEN;
- break;
- case LIT:
- if (left == 0) goto inf_leave;
- *put++ = (unsigned char)(state->length);
- left--;
- state->mode = LEN;
- break;
- case CHECK:
- if (state->wrap) {
- NEEDBITS(32);
- out -= left;
- strm->total_out += out;
- state->total += out;
- if (out)
- strm->adler = state->check =
- UPDATE(state->check, put - out, out);
- out = left;
- if ((
-#ifdef GUNZIP
- state->flags ? hold :
-#endif
- REVERSE(hold)) != state->check) {
- strm->msg = (char *)"incorrect data check";
- state->mode = BAD;
- break;
- }
- INITBITS();
- Tracev((stderr, "inflate: check matches trailer\n"));
- }
-#ifdef GUNZIP
- state->mode = LENGTH;
- /*FALLTHRU*/
- case LENGTH:
- if (state->wrap && state->flags) {
- NEEDBITS(32);
- if (hold != (state->total & 0xffffffffUL)) {
- strm->msg = (char *)"incorrect length check";
- state->mode = BAD;
- break;
- }
- INITBITS();
- Tracev((stderr, "inflate: length matches trailer\n"));
- }
-#endif
- state->mode = DONE;
- /*FALLTHRU*/
- case DONE:
- ret = Z_STREAM_END;
- goto inf_leave;
- case BAD:
- ret = Z_DATA_ERROR;
- goto inf_leave;
- case MEM:
- return Z_MEM_ERROR;
- case SYNC:
- default:
- return Z_STREAM_ERROR;
- }
-
- /*
- Return from inflate(), updating the total counts and the check value.
- If there was no progress during the inflate() call, return a buffer
- error. Call updatewindow() to create and/or update the window state.
- Note: a memory error from inflate() is non-recoverable.
- */
- inf_leave:
- RESTORE();
- if (state->wsize || (state->mode < CHECK && out != strm->avail_out))
- if (updatewindow(strm, out)) {
- state->mode = MEM;
- return Z_MEM_ERROR;
- }
- in -= strm->avail_in;
- out -= strm->avail_out;
- strm->total_in += in;
- strm->total_out += out;
- state->total += out;
- if (state->wrap && out)
- strm->adler = state->check =
- UPDATE(state->check, strm->next_out - out, out);
- strm->data_type = state->bits + (state->last ? 64 : 0) +
- (state->mode == TYPE ? 128 : 0);
- if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
- ret = Z_BUF_ERROR;
- return ret;
-}
-
-int ZEXPORT inflateEnd(strm)
-z_streamp strm;
-{
- struct inflate_state FAR *state;
- if (strm == Z_NULL || strm->state == Z_NULL || strm->zfree == (free_func)0)
- return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- if (state->window != Z_NULL) ZFREE(strm, state->window);
- ZFREE(strm, strm->state);
- strm->state = Z_NULL;
- Tracev((stderr, "inflate: end\n"));
- return Z_OK;
-}
-
-int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength)
-z_streamp strm;
-const Bytef *dictionary;
-uInt dictLength;
-{
- struct inflate_state FAR *state;
- unsigned long id;
-
- /* check state */
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- if (state->wrap != 0 && state->mode != DICT)
- return Z_STREAM_ERROR;
-
- /* check for correct dictionary id */
- if (state->mode == DICT) {
- id = adler32(0L, Z_NULL, 0);
- id = adler32(id, dictionary, dictLength);
- if (id != state->check)
- return Z_DATA_ERROR;
- }
-
- /* copy dictionary to window */
- if (updatewindow(strm, strm->avail_out)) {
- state->mode = MEM;
- return Z_MEM_ERROR;
- }
- if (dictLength > state->wsize) {
- zmemcpy(state->window, dictionary + dictLength - state->wsize,
- state->wsize);
- state->whave = state->wsize;
- }
- else {
- zmemcpy(state->window + state->wsize - dictLength, dictionary,
- dictLength);
- state->whave = dictLength;
- }
- state->havedict = 1;
- Tracev((stderr, "inflate: dictionary set\n"));
- return Z_OK;
-}
-
-int ZEXPORT inflateGetHeader(strm, head)
-z_streamp strm;
-gz_headerp head;
-{
- struct inflate_state FAR *state;
-
- /* check state */
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- if ((state->wrap & 2) == 0) return Z_STREAM_ERROR;
-
- /* save header structure */
- state->head = head;
- head->done = 0;
- return Z_OK;
-}
-
-/*
- Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found
- or when out of input. When called, *have is the number of pattern bytes
- found in order so far, in 0..3. On return *have is updated to the new
- state. If on return *have equals four, then the pattern was found and the
- return value is how many bytes were read including the last byte of the
- pattern. If *have is less than four, then the pattern has not been found
- yet and the return value is len. In the latter case, syncsearch() can be
- called again with more data and the *have state. *have is initialized to
- zero for the first call.
- */
-local unsigned syncsearch(have, buf, len)
-unsigned FAR *have;
-unsigned char FAR *buf;
-unsigned len;
-{
- unsigned got;
- unsigned next;
-
- got = *have;
- next = 0;
- while (next < len && got < 4) {
- if ((int)(buf[next]) == (got < 2 ? 0 : 0xff))
- got++;
- else if (buf[next])
- got = 0;
- else
- got = 4 - got;
- next++;
- }
- *have = got;
- return next;
-}
-
-int ZEXPORT inflateSync(strm)
-z_streamp strm;
-{
- unsigned len; /* number of bytes to look at or looked at */
- unsigned long in, out; /* temporary to save total_in and total_out */
- unsigned char buf[4]; /* to restore bit buffer to byte string */
- struct inflate_state FAR *state;
-
- /* check parameters */
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR;
-
- /* if first time, start search in bit buffer */
- if (state->mode != SYNC) {
- state->mode = SYNC;
- state->hold <<= state->bits & 7;
- state->bits -= state->bits & 7;
- len = 0;
- while (state->bits >= 8) {
- buf[len++] = (unsigned char)(state->hold);
- state->hold >>= 8;
- state->bits -= 8;
- }
- state->have = 0;
- (void) syncsearch(&(state->have), buf, len);
- }
-
- /* search available input */
- len = syncsearch(&(state->have), strm->next_in, strm->avail_in);
- strm->avail_in -= len;
- strm->next_in += len;
- strm->total_in += len;
-
- /* return no joy or set up to restart inflate() on a new block */
- if (state->have != 4) return Z_DATA_ERROR;
- in = strm->total_in; out = strm->total_out;
- (void) inflateReset(strm);
- strm->total_in = in; strm->total_out = out;
- state->mode = TYPE;
- return Z_OK;
-}
-
-/*
- Returns true if inflate is currently at the end of a block generated by
- Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
- implementation to provide an additional safety check. PPP uses
- Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
- block. When decompressing, PPP checks that at the end of input packet,
- inflate is waiting for these length bytes.
- */
-int ZEXPORT inflateSyncPoint(strm)
-z_streamp strm;
-{
- struct inflate_state FAR *state;
-
- if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)strm->state;
- return state->mode == STORED && state->bits == 0;
-}
-
-int ZEXPORT inflateCopy(dest, source)
-z_streamp dest;
-z_streamp source;
-{
- struct inflate_state FAR *state;
- struct inflate_state FAR *copy;
- unsigned char FAR *window;
- unsigned wsize;
-
- /* check input */
- if (dest == Z_NULL || source == Z_NULL || source->state == Z_NULL ||
- source->zalloc == (alloc_func)0 || source->zfree == (free_func)0)
- return Z_STREAM_ERROR;
- state = (struct inflate_state FAR *)source->state;
-
- /* allocate space */
- copy = (struct inflate_state FAR *)
- ZALLOC(source, 1, sizeof(struct inflate_state));
- if (copy == Z_NULL) return Z_MEM_ERROR;
- window = Z_NULL;
- if (state->window != Z_NULL) {
- window = (unsigned char FAR *)
- ZALLOC(source, 1U << state->wbits, sizeof(unsigned char));
- if (window == Z_NULL) {
- ZFREE(source, copy);
- return Z_MEM_ERROR;
- }
- }
-
- /* copy state */
- zmemcpy(dest, source, sizeof(z_stream));
- zmemcpy(copy, state, sizeof(struct inflate_state));
- if (state->lencode >= state->codes &&
- state->lencode <= state->codes + ENOUGH - 1) {
- copy->lencode = copy->codes + (state->lencode - state->codes);
- copy->distcode = copy->codes + (state->distcode - state->codes);
- }
- copy->next = copy->codes + (state->next - state->codes);
- if (window != Z_NULL) {
- wsize = 1U << state->wbits;
- zmemcpy(window, state->window, wsize);
- }
- copy->window = window;
- dest->state = (struct internal_state FAR *)copy;
- return Z_OK;
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inflate.h b/sys/contrib/opensolaris/uts/common/zmod/inflate.h
deleted file mode 100644
index 4d28b22..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inflate.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* inflate.h -- internal inflate state definition
- * Copyright (C) 1995-2004 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/* WARNING: this file should *not* be used by applications. It is
- part of the implementation of the compression library and is
- subject to change. Applications should only use zlib.h.
- */
-
-/* define NO_GZIP when compiling if you want to disable gzip header and
- trailer decoding by inflate(). NO_GZIP would be used to avoid linking in
- the crc code when it is not needed. For shared libraries, gzip decoding
- should be left enabled. */
-#ifndef NO_GZIP
-# define GUNZIP
-#endif
-
-/* Possible inflate modes between inflate() calls */
-typedef enum {
- HEAD, /* i: waiting for magic header */
- FLAGS, /* i: waiting for method and flags (gzip) */
- TIME, /* i: waiting for modification time (gzip) */
- OS, /* i: waiting for extra flags and operating system (gzip) */
- EXLEN, /* i: waiting for extra length (gzip) */
- EXTRA, /* i: waiting for extra bytes (gzip) */
- NAME, /* i: waiting for end of file name (gzip) */
- COMMENT, /* i: waiting for end of comment (gzip) */
- HCRC, /* i: waiting for header crc (gzip) */
- DICTID, /* i: waiting for dictionary check value */
- DICT, /* waiting for inflateSetDictionary() call */
- TYPE, /* i: waiting for type bits, including last-flag bit */
- TYPEDO, /* i: same, but skip check to exit inflate on new block */
- STORED, /* i: waiting for stored size (length and complement) */
- COPY, /* i/o: waiting for input or output to copy stored block */
- TABLE, /* i: waiting for dynamic block table lengths */
- LENLENS, /* i: waiting for code length code lengths */
- CODELENS, /* i: waiting for length/lit and distance code lengths */
- LEN, /* i: waiting for length/lit code */
- LENEXT, /* i: waiting for length extra bits */
- DIST, /* i: waiting for distance code */
- DISTEXT, /* i: waiting for distance extra bits */
- MATCH, /* o: waiting for output space to copy string */
- LIT, /* o: waiting for output space to write literal */
- CHECK, /* i: waiting for 32-bit check value */
- LENGTH, /* i: waiting for 32-bit length (gzip) */
- DONE, /* finished check, done -- remain here until reset */
- BAD, /* got a data error -- remain here until reset */
- MEM, /* got an inflate() memory error -- remain here until reset */
- SYNC /* looking for synchronization bytes to restart inflate() */
-} inflate_mode;
-
-/*
- State transitions between above modes -
-
- (most modes can go to the BAD or MEM mode -- not shown for clarity)
-
- Process header:
- HEAD -> (gzip) or (zlib)
- (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME
- NAME -> COMMENT -> HCRC -> TYPE
- (zlib) -> DICTID or TYPE
- DICTID -> DICT -> TYPE
- Read deflate blocks:
- TYPE -> STORED or TABLE or LEN or CHECK
- STORED -> COPY -> TYPE
- TABLE -> LENLENS -> CODELENS -> LEN
- Read deflate codes:
- LEN -> LENEXT or LIT or TYPE
- LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
- LIT -> LEN
- Process trailer:
- CHECK -> LENGTH -> DONE
- */
-
-/* state maintained between inflate() calls. Approximately 7K bytes. */
-struct inflate_state {
- inflate_mode mode; /* current inflate mode */
- int last; /* true if processing last block */
- int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
- int havedict; /* true if dictionary provided */
- int flags; /* gzip header method and flags (0 if zlib) */
- unsigned dmax; /* zlib header max distance (INFLATE_STRICT) */
- unsigned long check; /* protected copy of check value */
- unsigned long total; /* protected copy of output count */
- gz_headerp head; /* where to save gzip header information */
- /* sliding window */
- unsigned wbits; /* log base 2 of requested window size */
- unsigned wsize; /* window size or zero if not using window */
- unsigned whave; /* valid bytes in the window */
- unsigned write; /* window write index */
- unsigned char FAR *window; /* allocated sliding window, if needed */
- /* bit accumulator */
- unsigned long hold; /* input bit accumulator */
- unsigned bits; /* number of bits in "in" */
- /* for string and stored block copying */
- unsigned length; /* literal or length of data to copy */
- unsigned offset; /* distance back to copy string from */
- /* for table and code decoding */
- unsigned extra; /* extra bits needed */
- /* fixed and dynamic code tables */
- code const FAR *lencode; /* starting table for length/literal codes */
- code const FAR *distcode; /* starting table for distance codes */
- unsigned lenbits; /* index bits for lencode */
- unsigned distbits; /* index bits for distcode */
- /* dynamic table building */
- unsigned ncode; /* number of code length code lengths */
- unsigned nlen; /* number of length code lengths */
- unsigned ndist; /* number of distance code lengths */
- unsigned have; /* number of code lengths in lens[] */
- code FAR *next; /* next available space in codes[] */
- unsigned short lens[320]; /* temporary storage for code lengths */
- unsigned short work[288]; /* work area for code table building */
- code codes[ENOUGH]; /* space for code tables */
-};
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inftrees.c b/sys/contrib/opensolaris/uts/common/zmod/inftrees.c
deleted file mode 100644
index 2d37167..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inftrees.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/* inftrees.c -- generate Huffman trees for efficient decoding
- * Copyright (C) 1995-2005 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "zutil.h"
-#include "inftrees.h"
-
-#define MAXBITS 15
-
-static const char inflate_copyright[] =
- " inflate 1.2.3 Copyright 1995-2005 Mark Adler ";
-/*
- If you use the zlib library in a product, an acknowledgment is welcome
- in the documentation of your product. If for some reason you cannot
- include such an acknowledgment, I would appreciate that you keep this
- copyright string in the executable of your product.
- */
-
-/*
- Build a set of tables to decode the provided canonical Huffman code.
- The code lengths are lens[0..codes-1]. The result starts at *table,
- whose indices are 0..2^bits-1. work is a writable array of at least
- lens shorts, which is used as a work area. type is the type of code
- to be generated, CODES, LENS, or DISTS. On return, zero is success,
- -1 is an invalid code, and +1 means that ENOUGH isn't enough. table
- on return points to the next available entry's address. bits is the
- requested root table index bits, and on return it is the actual root
- table index bits. It will differ if the request is greater than the
- longest code or if it is less than the shortest code.
- */
-int inflate_table(type, lens, codes, table, bits, work)
-codetype type;
-unsigned short FAR *lens;
-unsigned codes;
-code FAR * FAR *table;
-unsigned FAR *bits;
-unsigned short FAR *work;
-{
- unsigned len; /* a code's length in bits */
- unsigned sym; /* index of code symbols */
- unsigned min, max; /* minimum and maximum code lengths */
- unsigned root; /* number of index bits for root table */
- unsigned curr; /* number of index bits for current table */
- unsigned drop; /* code bits to drop for sub-table */
- int left; /* number of prefix codes available */
- unsigned used; /* code entries in table used */
- unsigned huff; /* Huffman code */
- unsigned incr; /* for incrementing code, index */
- unsigned fill; /* index for replicating entries */
- unsigned low; /* low bits for current root entry */
- unsigned mask; /* mask for low root bits */
- code this; /* table entry for duplication */
- code FAR *next; /* next available space in table */
- const unsigned short FAR *base; /* base value table to use */
- const unsigned short FAR *extra; /* extra bits table to use */
- int end; /* use base and extra for symbol > end */
- unsigned short count[MAXBITS+1]; /* number of codes of each length */
- unsigned short offs[MAXBITS+1]; /* offsets in table for each length */
- static const unsigned short lbase[31] = { /* Length codes 257..285 base */
- 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
- 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
- static const unsigned short lext[31] = { /* Length codes 257..285 extra */
- 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
- 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196};
- static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
- 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
- 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
- 8193, 12289, 16385, 24577, 0, 0};
- static const unsigned short dext[32] = { /* Distance codes 0..29 extra */
- 16, 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
- 23, 23, 24, 24, 25, 25, 26, 26, 27, 27,
- 28, 28, 29, 29, 64, 64};
-
- /*
- Process a set of code lengths to create a canonical Huffman code. The
- code lengths are lens[0..codes-1]. Each length corresponds to the
- symbols 0..codes-1. The Huffman code is generated by first sorting the
- symbols by length from short to long, and retaining the symbol order
- for codes with equal lengths. Then the code starts with all zero bits
- for the first code of the shortest length, and the codes are integer
- increments for the same length, and zeros are appended as the length
- increases. For the deflate format, these bits are stored backwards
- from their more natural integer increment ordering, and so when the
- decoding tables are built in the large loop below, the integer codes
- are incremented backwards.
-
- This routine assumes, but does not check, that all of the entries in
- lens[] are in the range 0..MAXBITS. The caller must assure this.
- 1..MAXBITS is interpreted as that code length. zero means that that
- symbol does not occur in this code.
-
- The codes are sorted by computing a count of codes for each length,
- creating from that a table of starting indices for each length in the
- sorted table, and then entering the symbols in order in the sorted
- table. The sorted table is work[], with that space being provided by
- the caller.
-
- The length counts are used for other purposes as well, i.e. finding
- the minimum and maximum length codes, determining if there are any
- codes at all, checking for a valid set of lengths, and looking ahead
- at length counts to determine sub-table sizes when building the
- decoding tables.
- */
-
- /* accumulate lengths for codes (assumes lens[] all in 0..MAXBITS) */
- for (len = 0; len <= MAXBITS; len++)
- count[len] = 0;
- for (sym = 0; sym < codes; sym++)
- count[lens[sym]]++;
-
- /* bound code lengths, force root to be within code lengths */
- root = *bits;
- for (max = MAXBITS; max >= 1; max--)
- if (count[max] != 0) break;
- if (root > max) root = max;
- if (max == 0) { /* no symbols to code at all */
- this.op = (unsigned char)64; /* invalid code marker */
- this.bits = (unsigned char)1;
- this.val = (unsigned short)0;
- *(*table)++ = this; /* make a table to force an error */
- *(*table)++ = this;
- *bits = 1;
- return 0; /* no symbols, but wait for decoding to report error */
- }
- for (min = 1; min <= MAXBITS; min++)
- if (count[min] != 0) break;
- if (root < min) root = min;
-
- /* check for an over-subscribed or incomplete set of lengths */
- left = 1;
- for (len = 1; len <= MAXBITS; len++) {
- left <<= 1;
- left -= count[len];
- if (left < 0) return -1; /* over-subscribed */
- }
- if (left > 0 && (type == CODES || max != 1))
- return -1; /* incomplete set */
-
- /* generate offsets into symbol table for each length for sorting */
- offs[1] = 0;
- for (len = 1; len < MAXBITS; len++)
- offs[len + 1] = offs[len] + count[len];
-
- /* sort symbols by length, by symbol order within each length */
- for (sym = 0; sym < codes; sym++)
- if (lens[sym] != 0) work[offs[lens[sym]]++] = (unsigned short)sym;
-
- /*
- Create and fill in decoding tables. In this loop, the table being
- filled is at next and has curr index bits. The code being used is huff
- with length len. That code is converted to an index by dropping drop
- bits off of the bottom. For codes where len is less than drop + curr,
- those top drop + curr - len bits are incremented through all values to
- fill the table with replicated entries.
-
- root is the number of index bits for the root table. When len exceeds
- root, sub-tables are created pointed to by the root entry with an index
- of the low root bits of huff. This is saved in low to check for when a
- new sub-table should be started. drop is zero when the root table is
- being filled, and drop is root when sub-tables are being filled.
-
- When a new sub-table is needed, it is necessary to look ahead in the
- code lengths to determine what size sub-table is needed. The length
- counts are used for this, and so count[] is decremented as codes are
- entered in the tables.
-
- used keeps track of how many table entries have been allocated from the
- provided *table space. It is checked when a LENS table is being made
- against the space in *table, ENOUGH, minus the maximum space needed by
- the worst case distance code, MAXD. This should never happen, but the
- sufficiency of ENOUGH has not been proven exhaustively, hence the check.
- This assumes that when type == LENS, bits == 9.
-
- sym increments through all symbols, and the loop terminates when
- all codes of length max, i.e. all codes, have been processed. This
- routine permits incomplete codes, so another loop after this one fills
- in the rest of the decoding tables with invalid code markers.
- */
-
- /* set up for code type */
- switch (type) {
- case CODES:
- base = extra = work; /* dummy value--not used */
- end = 19;
- break;
- case LENS:
- base = lbase;
- base -= 257;
- extra = lext;
- extra -= 257;
- end = 256;
- break;
- default: /* DISTS */
- base = dbase;
- extra = dext;
- end = -1;
- }
-
- /* initialize state for loop */
- huff = 0; /* starting code */
- sym = 0; /* starting code symbol */
- len = min; /* starting code length */
- next = *table; /* current table to fill in */
- curr = root; /* current table index bits */
- drop = 0; /* current bits to drop from code for index */
- low = (unsigned)(-1); /* trigger new sub-table when len > root */
- used = 1U << root; /* use root table entries */
- mask = used - 1; /* mask for comparing low */
-
- /* check available table space */
- if (type == LENS && used >= ENOUGH - MAXD)
- return 1;
-
- /* process all codes and make table entries */
- for (;;) {
- /* create table entry */
- this.bits = (unsigned char)(len - drop);
- if ((int)(work[sym]) < end) {
- this.op = (unsigned char)0;
- this.val = work[sym];
- }
- else if ((int)(work[sym]) > end) {
- this.op = (unsigned char)(extra[work[sym]]);
- this.val = base[work[sym]];
- }
- else {
- this.op = (unsigned char)(32 + 64); /* end of block */
- this.val = 0;
- }
-
- /* replicate for those indices with low len bits equal to huff */
- incr = 1U << (len - drop);
- fill = 1U << curr;
- min = fill; /* save offset to next table */
- do {
- fill -= incr;
- next[(huff >> drop) + fill] = this;
- } while (fill != 0);
-
- /* backwards increment the len-bit code huff */
- incr = 1U << (len - 1);
- while (huff & incr)
- incr >>= 1;
- if (incr != 0) {
- huff &= incr - 1;
- huff += incr;
- }
- else
- huff = 0;
-
- /* go to next symbol, update count, len */
- sym++;
- if (--(count[len]) == 0) {
- if (len == max) break;
- len = lens[work[sym]];
- }
-
- /* create new sub-table if needed */
- if (len > root && (huff & mask) != low) {
- /* if first time, transition to sub-tables */
- if (drop == 0)
- drop = root;
-
- /* increment past last table */
- next += min; /* here min is 1 << curr */
-
- /* determine length of next table */
- curr = len - drop;
- left = (int)(1 << curr);
- while (curr + drop < max) {
- left -= count[curr + drop];
- if (left <= 0) break;
- curr++;
- left <<= 1;
- }
-
- /* check for enough space */
- used += 1U << curr;
- if (type == LENS && used >= ENOUGH - MAXD)
- return 1;
-
- /* point entry in root table to sub-table */
- low = huff & mask;
- (*table)[low].op = (unsigned char)curr;
- (*table)[low].bits = (unsigned char)root;
- (*table)[low].val = (unsigned short)(next - *table);
- }
- }
-
- /*
- Fill in rest of table for incomplete codes. This loop is similar to the
- loop above in incrementing huff for table indices. It is assumed that
- len is equal to curr + drop, so there is no loop needed to increment
- through high index bits. When the current sub-table is filled, the loop
- drops back to the root table to fill in any remaining entries there.
- */
- this.op = (unsigned char)64; /* invalid code marker */
- this.bits = (unsigned char)(len - drop);
- this.val = (unsigned short)0;
- while (huff != 0) {
- /* when done with sub-table, drop back to root table */
- if (drop != 0 && (huff & mask) != low) {
- drop = 0;
- len = root;
- next = *table;
- this.bits = (unsigned char)len;
- }
-
- /* put invalid code marker in table */
- next[huff >> drop] = this;
-
- /* backwards increment the len-bit code huff */
- incr = 1U << (len - 1);
- while (huff & incr)
- incr >>= 1;
- if (incr != 0) {
- huff &= incr - 1;
- huff += incr;
- }
- else
- huff = 0;
- }
-
- /* set return parameters */
- *table += used;
- *bits = root;
- return 0;
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/inftrees.h b/sys/contrib/opensolaris/uts/common/zmod/inftrees.h
deleted file mode 100644
index 546e8c0..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/inftrees.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* inftrees.h -- header to use inftrees.c
- * Copyright (C) 1995-2005 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/* WARNING: this file should *not* be used by applications. It is
- part of the implementation of the compression library and is
- subject to change. Applications should only use zlib.h.
- */
-
-/* Structure for decoding tables. Each entry provides either the
- information needed to do the operation requested by the code that
- indexed that table entry, or it provides a pointer to another
- table that indexes more bits of the code. op indicates whether
- the entry is a pointer to another table, a literal, a length or
- distance, an end-of-block, or an invalid code. For a table
- pointer, the low four bits of op is the number of index bits of
- that table. For a length or distance, the low four bits of op
- is the number of extra bits to get after the code. bits is
- the number of bits in this code or part of the code to drop off
- of the bit buffer. val is the actual byte to output in the case
- of a literal, the base length or distance, or the offset from
- the current table to the next table. Each entry is four bytes. */
-typedef struct {
- unsigned char op; /* operation, extra bits, table bits */
- unsigned char bits; /* bits in this part of the code */
- unsigned short val; /* offset in table or code value */
-} code;
-
-/* op values as set by inflate_table():
- 00000000 - literal
- 0000tttt - table link, tttt != 0 is the number of table index bits
- 0001eeee - length or distance, eeee is the number of extra bits
- 01100000 - end of block
- 01000000 - invalid code
- */
-
-/* Maximum size of dynamic tree. The maximum found in a long but non-
- exhaustive search was 1444 code structures (852 for length/literals
- and 592 for distances, the latter actually the result of an
- exhaustive search). The true maximum is not known, but the value
- below is more than safe. */
-#define ENOUGH 2048
-#define MAXD 592
-
-/* Type of code to build for inftable() */
-typedef enum {
- CODES,
- LENS,
- DISTS
-} codetype;
-
-extern int inflate_table OF((codetype type, unsigned short FAR *lens,
- unsigned codes, code FAR * FAR *table,
- unsigned FAR *bits, unsigned short FAR *work));
diff --git a/sys/contrib/opensolaris/uts/common/zmod/trees.c b/sys/contrib/opensolaris/uts/common/zmod/trees.c
deleted file mode 100644
index ce0cebc..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/trees.c
+++ /dev/null
@@ -1,1219 +0,0 @@
-/* trees.c -- output deflated data using Huffman coding
- * Copyright (C) 1995-2005 Jean-loup Gailly
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-/*
- * ALGORITHM
- *
- * The "deflation" process uses several Huffman trees. The more
- * common source values are represented by shorter bit sequences.
- *
- * Each code tree is stored in a compressed form which is itself
- * a Huffman encoding of the lengths of all the code strings (in
- * ascending order by source values). The actual code strings are
- * reconstructed from the lengths in the inflate process, as described
- * in the deflate specification.
- *
- * REFERENCES
- *
- * Deutsch, L.P.,"'Deflate' Compressed Data Format Specification".
- * Available in ftp.uu.net:/pub/archiving/zip/doc/deflate-1.1.doc
- *
- * Storer, James A.
- * Data Compression: Methods and Theory, pp. 49-50.
- * Computer Science Press, 1988. ISBN 0-7167-8156-5.
- *
- * Sedgewick, R.
- * Algorithms, p290.
- * Addison-Wesley, 1983. ISBN 0-201-06672-6.
- */
-
-/* #define GEN_TREES_H */
-
-#include "deflate.h"
-
-#ifdef DEBUG
-# include <ctype.h>
-#endif
-
-/* ===========================================================================
- * Constants
- */
-
-#define MAX_BL_BITS 7
-/* Bit length codes must not exceed MAX_BL_BITS bits */
-
-#define END_BLOCK 256
-/* end of block literal code */
-
-#define REP_3_6 16
-/* repeat previous bit length 3-6 times (2 bits of repeat count) */
-
-#define REPZ_3_10 17
-/* repeat a zero length 3-10 times (3 bits of repeat count) */
-
-#define REPZ_11_138 18
-/* repeat a zero length 11-138 times (7 bits of repeat count) */
-
-local const int extra_lbits[LENGTH_CODES] /* extra bits for each length code */
- = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0};
-
-local const int extra_dbits[D_CODES] /* extra bits for each distance code */
- = {0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
-
-local const int extra_blbits[BL_CODES]/* extra bits for each bit length code */
- = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7};
-
-local const uch bl_order[BL_CODES]
- = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
-/* The lengths of the bit length codes are sent in order of decreasing
- * probability, to avoid transmitting the lengths for unused bit length codes.
- */
-
-#define Buf_size (8 * 2*sizeof(char))
-/* Number of bits used within bi_buf. (bi_buf might be implemented on
- * more than 16 bits on some systems.)
- */
-
-/* ===========================================================================
- * Local data. These are initialized only once.
- */
-
-#define DIST_CODE_LEN 512 /* see definition of array dist_code below */
-
-#if defined(GEN_TREES_H) || !defined(STDC)
-/* non ANSI compilers may not accept trees.h */
-
-local ct_data static_ltree[L_CODES+2];
-/* The static literal tree. Since the bit lengths are imposed, there is no
- * need for the L_CODES extra codes used during heap construction. However
- * The codes 286 and 287 are needed to build a canonical tree (see _tr_init
- * below).
- */
-
-local ct_data static_dtree[D_CODES];
-/* The static distance tree. (Actually a trivial tree since all codes use
- * 5 bits.)
- */
-
-uch _dist_code[DIST_CODE_LEN];
-/* Distance codes. The first 256 values correspond to the distances
- * 3 .. 258, the last 256 values correspond to the top 8 bits of
- * the 15 bit distances.
- */
-
-uch _length_code[MAX_MATCH-MIN_MATCH+1];
-/* length code for each normalized match length (0 == MIN_MATCH) */
-
-local int base_length[LENGTH_CODES];
-/* First normalized length for each code (0 = MIN_MATCH) */
-
-local int base_dist[D_CODES];
-/* First normalized distance for each code (0 = distance of 1) */
-
-#else
-# include "trees.h"
-#endif /* GEN_TREES_H */
-
-struct static_tree_desc_s {
- const ct_data *static_tree; /* static tree or NULL */
- const intf *extra_bits; /* extra bits for each code or NULL */
- int extra_base; /* base index for extra_bits */
- int elems; /* max number of elements in the tree */
- int max_length; /* max bit length for the codes */
-};
-
-local static_tree_desc static_l_desc =
-{static_ltree, extra_lbits, LITERALS+1, L_CODES, MAX_BITS};
-
-local static_tree_desc static_d_desc =
-{static_dtree, extra_dbits, 0, D_CODES, MAX_BITS};
-
-local static_tree_desc static_bl_desc =
-{(const ct_data *)0, extra_blbits, 0, BL_CODES, MAX_BL_BITS};
-
-/* ===========================================================================
- * Local (static) routines in this file.
- */
-
-local void tr_static_init OF((void));
-local void init_block OF((deflate_state *s));
-local void pqdownheap OF((deflate_state *s, ct_data *tree, int k));
-local void gen_bitlen OF((deflate_state *s, tree_desc *desc));
-local void gen_codes OF((ct_data *tree, int max_code, ushf *bl_count));
-local void build_tree OF((deflate_state *s, tree_desc *desc));
-local void scan_tree OF((deflate_state *s, ct_data *tree, int max_code));
-local void send_tree OF((deflate_state *s, ct_data *tree, int max_code));
-local int build_bl_tree OF((deflate_state *s));
-local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes,
- int blcodes));
-local void compress_block OF((deflate_state *s, ct_data *ltree,
- ct_data *dtree));
-local void set_data_type OF((deflate_state *s));
-local unsigned bi_reverse OF((unsigned value, int length));
-local void bi_windup OF((deflate_state *s));
-local void bi_flush OF((deflate_state *s));
-local void copy_block OF((deflate_state *s, charf *buf, unsigned len,
- int header));
-
-#ifdef GEN_TREES_H
-local void gen_trees_header OF((void));
-#endif
-
-#ifndef DEBUG
-# define send_code(s, c, tree) send_bits(s, tree[c].Code, tree[c].Len)
- /* Send a code of the given tree. c and tree must not have side effects */
-
-#else /* DEBUG */
-# define send_code(s, c, tree) \
- { if (z_verbose>2) fprintf(stderr,"\ncd %3d ",(c)); \
- send_bits(s, tree[c].Code, tree[c].Len); }
-#endif
-
-/* ===========================================================================
- * Output a short LSB first on the stream.
- * IN assertion: there is enough room in pendingBuf.
- */
-#define put_short(s, w) { \
- put_byte(s, (uch)((w) & 0xff)); \
- put_byte(s, (uch)((ush)(w) >> 8)); \
-}
-
-/* ===========================================================================
- * Send a value on a given number of bits.
- * IN assertion: length <= 16 and value fits in length bits.
- */
-#ifdef DEBUG
-local void send_bits OF((deflate_state *s, int value, int length));
-
-local void send_bits(s, value, length)
- deflate_state *s;
- int value; /* value to send */
- int length; /* number of bits */
-{
- Tracevv((stderr," l %2d v %4x ", length, value));
- Assert(length > 0 && length <= 15, "invalid length");
- s->bits_sent += (ulg)length;
-
- /* If not enough room in bi_buf, use (valid) bits from bi_buf and
- * (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
- * unused bits in value.
- */
- if (s->bi_valid > (int)Buf_size - length) {
- s->bi_buf |= (value << s->bi_valid);
- put_short(s, s->bi_buf);
- s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
- s->bi_valid += length - Buf_size;
- } else {
- s->bi_buf |= value << s->bi_valid;
- s->bi_valid += length;
- }
-}
-#else /* !DEBUG */
-
-#define send_bits(s, value, length) \
-{ int len = length;\
- if (s->bi_valid > (int)Buf_size - len) {\
- int val = value;\
- s->bi_buf |= (val << s->bi_valid);\
- put_short(s, s->bi_buf);\
- s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
- s->bi_valid += len - Buf_size;\
- } else {\
- s->bi_buf |= (value) << s->bi_valid;\
- s->bi_valid += len;\
- }\
-}
-#endif /* DEBUG */
-
-
-/* the arguments must not have side effects */
-
-/* ===========================================================================
- * Initialize the various 'constant' tables.
- */
-local void tr_static_init()
-{
-#if defined(GEN_TREES_H) || !defined(STDC)
- static int static_init_done = 0;
- int n; /* iterates over tree elements */
- int bits; /* bit counter */
- int length; /* length value */
- int code; /* code value */
- int dist; /* distance index */
- ush bl_count[MAX_BITS+1];
- /* number of codes at each bit length for an optimal tree */
-
- if (static_init_done) return;
-
- /* For some embedded targets, global variables are not initialized: */
- static_l_desc.static_tree = static_ltree;
- static_l_desc.extra_bits = extra_lbits;
- static_d_desc.static_tree = static_dtree;
- static_d_desc.extra_bits = extra_dbits;
- static_bl_desc.extra_bits = extra_blbits;
-
- /* Initialize the mapping length (0..255) -> length code (0..28) */
- length = 0;
- for (code = 0; code < LENGTH_CODES-1; code++) {
- base_length[code] = length;
- for (n = 0; n < (1<<extra_lbits[code]); n++) {
- _length_code[length++] = (uch)code;
- }
- }
- Assert (length == 256, "tr_static_init: length != 256");
- /* Note that the length 255 (match length 258) can be represented
- * in two different ways: code 284 + 5 bits or code 285, so we
- * overwrite length_code[255] to use the best encoding:
- */
- _length_code[length-1] = (uch)code;
-
- /* Initialize the mapping dist (0..32K) -> dist code (0..29) */
- dist = 0;
- for (code = 0 ; code < 16; code++) {
- base_dist[code] = dist;
- for (n = 0; n < (1<<extra_dbits[code]); n++) {
- _dist_code[dist++] = (uch)code;
- }
- }
- Assert (dist == 256, "tr_static_init: dist != 256");
- dist >>= 7; /* from now on, all distances are divided by 128 */
- for ( ; code < D_CODES; code++) {
- base_dist[code] = dist << 7;
- for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
- _dist_code[256 + dist++] = (uch)code;
- }
- }
- Assert (dist == 256, "tr_static_init: 256+dist != 512");
-
- /* Construct the codes of the static literal tree */
- for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
- n = 0;
- while (n <= 143) static_ltree[n++].Len = 8, bl_count[8]++;
- while (n <= 255) static_ltree[n++].Len = 9, bl_count[9]++;
- while (n <= 279) static_ltree[n++].Len = 7, bl_count[7]++;
- while (n <= 287) static_ltree[n++].Len = 8, bl_count[8]++;
- /* Codes 286 and 287 do not exist, but we must include them in the
- * tree construction to get a canonical Huffman tree (longest code
- * all ones)
- */
- gen_codes((ct_data *)static_ltree, L_CODES+1, bl_count);
-
- /* The static distance tree is trivial: */
- for (n = 0; n < D_CODES; n++) {
- static_dtree[n].Len = 5;
- static_dtree[n].Code = bi_reverse((unsigned)n, 5);
- }
- static_init_done = 1;
-
-# ifdef GEN_TREES_H
- gen_trees_header();
-# endif
-#endif /* defined(GEN_TREES_H) || !defined(STDC) */
-}
-
-/* ===========================================================================
- * Genererate the file trees.h describing the static trees.
- */
-#ifdef GEN_TREES_H
-# ifndef DEBUG
-# include <stdio.h>
-# endif
-
-# define SEPARATOR(i, last, width) \
- ((i) == (last)? "\n};\n\n" : \
- ((i) % (width) == (width)-1 ? ",\n" : ", "))
-
-void gen_trees_header()
-{
- FILE *header = fopen("trees.h", "w");
- int i;
-
- Assert (header != NULL, "Can't open trees.h");
- fprintf(header,
- "/* header created automatically with -DGEN_TREES_H */\n\n");
-
- fprintf(header, "local const ct_data static_ltree[L_CODES+2] = {\n");
- for (i = 0; i < L_CODES+2; i++) {
- fprintf(header, "{{%3u},{%3u}}%s", static_ltree[i].Code,
- static_ltree[i].Len, SEPARATOR(i, L_CODES+1, 5));
- }
-
- fprintf(header, "local const ct_data static_dtree[D_CODES] = {\n");
- for (i = 0; i < D_CODES; i++) {
- fprintf(header, "{{%2u},{%2u}}%s", static_dtree[i].Code,
- static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5));
- }
-
- fprintf(header, "const uch _dist_code[DIST_CODE_LEN] = {\n");
- for (i = 0; i < DIST_CODE_LEN; i++) {
- fprintf(header, "%2u%s", _dist_code[i],
- SEPARATOR(i, DIST_CODE_LEN-1, 20));
- }
-
- fprintf(header, "const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {\n");
- for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) {
- fprintf(header, "%2u%s", _length_code[i],
- SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20));
- }
-
- fprintf(header, "local const int base_length[LENGTH_CODES] = {\n");
- for (i = 0; i < LENGTH_CODES; i++) {
- fprintf(header, "%1u%s", base_length[i],
- SEPARATOR(i, LENGTH_CODES-1, 20));
- }
-
- fprintf(header, "local const int base_dist[D_CODES] = {\n");
- for (i = 0; i < D_CODES; i++) {
- fprintf(header, "%5u%s", base_dist[i],
- SEPARATOR(i, D_CODES-1, 10));
- }
-
- fclose(header);
-}
-#endif /* GEN_TREES_H */
-
-/* ===========================================================================
- * Initialize the tree data structures for a new zlib stream.
- */
-void _tr_init(s)
- deflate_state *s;
-{
- tr_static_init();
-
- s->l_desc.dyn_tree = s->dyn_ltree;
- s->l_desc.stat_desc = &static_l_desc;
-
- s->d_desc.dyn_tree = s->dyn_dtree;
- s->d_desc.stat_desc = &static_d_desc;
-
- s->bl_desc.dyn_tree = s->bl_tree;
- s->bl_desc.stat_desc = &static_bl_desc;
-
- s->bi_buf = 0;
- s->bi_valid = 0;
- s->last_eob_len = 8; /* enough lookahead for inflate */
-#ifdef DEBUG
- s->compressed_len = 0L;
- s->bits_sent = 0L;
-#endif
-
- /* Initialize the first block of the first file: */
- init_block(s);
-}
-
-/* ===========================================================================
- * Initialize a new block.
- */
-local void init_block(s)
- deflate_state *s;
-{
- int n; /* iterates over tree elements */
-
- /* Initialize the trees. */
- for (n = 0; n < L_CODES; n++) s->dyn_ltree[n].Freq = 0;
- for (n = 0; n < D_CODES; n++) s->dyn_dtree[n].Freq = 0;
- for (n = 0; n < BL_CODES; n++) s->bl_tree[n].Freq = 0;
-
- s->dyn_ltree[END_BLOCK].Freq = 1;
- s->opt_len = s->static_len = 0L;
- s->last_lit = s->matches = 0;
-}
-
-#define SMALLEST 1
-/* Index within the heap array of least frequent node in the Huffman tree */
-
-
-/* ===========================================================================
- * Remove the smallest element from the heap and recreate the heap with
- * one less element. Updates heap and heap_len.
- */
-#define pqremove(s, tree, top) \
-{\
- top = s->heap[SMALLEST]; \
- s->heap[SMALLEST] = s->heap[s->heap_len--]; \
- pqdownheap(s, tree, SMALLEST); \
-}
-
-/* ===========================================================================
- * Compares to subtrees, using the tree depth as tie breaker when
- * the subtrees have equal frequency. This minimizes the worst case length.
- */
-#define smaller(tree, n, m, depth) \
- (tree[n].Freq < tree[m].Freq || \
- (tree[n].Freq == tree[m].Freq && depth[n] <= depth[m]))
-
-/* ===========================================================================
- * Restore the heap property by moving down the tree starting at node k,
- * exchanging a node with the smallest of its two sons if necessary, stopping
- * when the heap property is re-established (each father smaller than its
- * two sons).
- */
-local void pqdownheap(s, tree, k)
- deflate_state *s;
- ct_data *tree; /* the tree to restore */
- int k; /* node to move down */
-{
- int v = s->heap[k];
- int j = k << 1; /* left son of k */
- while (j <= s->heap_len) {
- /* Set j to the smallest of the two sons: */
- if (j < s->heap_len &&
- smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
- j++;
- }
- /* Exit if v is smaller than both sons */
- if (smaller(tree, v, s->heap[j], s->depth)) break;
-
- /* Exchange v with the smallest son */
- s->heap[k] = s->heap[j]; k = j;
-
- /* And continue down the tree, setting j to the left son of k */
- j <<= 1;
- }
- s->heap[k] = v;
-}
-
-/* ===========================================================================
- * Compute the optimal bit lengths for a tree and update the total bit length
- * for the current block.
- * IN assertion: the fields freq and dad are set, heap[heap_max] and
- * above are the tree nodes sorted by increasing frequency.
- * OUT assertions: the field len is set to the optimal bit length, the
- * array bl_count contains the frequencies for each bit length.
- * The length opt_len is updated; static_len is also updated if stree is
- * not null.
- */
-local void gen_bitlen(s, desc)
- deflate_state *s;
- tree_desc *desc; /* the tree descriptor */
-{
- ct_data *tree = desc->dyn_tree;
- int max_code = desc->max_code;
- const ct_data *stree = desc->stat_desc->static_tree;
- const intf *extra = desc->stat_desc->extra_bits;
- int base = desc->stat_desc->extra_base;
- int max_length = desc->stat_desc->max_length;
- int h; /* heap index */
- int n, m; /* iterate over the tree elements */
- int bits; /* bit length */
- int xbits; /* extra bits */
- ush f; /* frequency */
- int overflow = 0; /* number of elements with bit length too large */
-
- for (bits = 0; bits <= MAX_BITS; bits++) s->bl_count[bits] = 0;
-
- /* In a first pass, compute the optimal bit lengths (which may
- * overflow in the case of the bit length tree).
- */
- tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
-
- for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
- n = s->heap[h];
- bits = tree[tree[n].Dad].Len + 1;
- if (bits > max_length) bits = max_length, overflow++;
- tree[n].Len = (ush)bits;
- /* We overwrite tree[n].Dad which is no longer needed */
-
- if (n > max_code) continue; /* not a leaf node */
-
- s->bl_count[bits]++;
- xbits = 0;
- if (n >= base) xbits = extra[n-base];
- f = tree[n].Freq;
- s->opt_len += (ulg)f * (bits + xbits);
- if (stree) s->static_len += (ulg)f * (stree[n].Len + xbits);
- }
- if (overflow == 0) return;
-
- Trace((stderr,"\nbit length overflow\n"));
- /* This happens for example on obj2 and pic of the Calgary corpus */
-
- /* Find the first bit length which could increase: */
- do {
- bits = max_length-1;
- while (s->bl_count[bits] == 0) bits--;
- s->bl_count[bits]--; /* move one leaf down the tree */
- s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
- s->bl_count[max_length]--;
- /* The brother of the overflow item also moves one step up,
- * but this does not affect bl_count[max_length]
- */
- overflow -= 2;
- } while (overflow > 0);
-
- /* Now recompute all bit lengths, scanning in increasing frequency.
- * h is still equal to HEAP_SIZE. (It is simpler to reconstruct all
- * lengths instead of fixing only the wrong ones. This idea is taken
- * from 'ar' written by Haruhiko Okumura.)
- */
- for (bits = max_length; bits != 0; bits--) {
- n = s->bl_count[bits];
- while (n != 0) {
- m = s->heap[--h];
- if (m > max_code) continue;
- if ((unsigned) tree[m].Len != (unsigned) bits) {
- Trace((stderr,"code %d bits %d->%d\n", m, tree[m].Len, bits));
- s->opt_len += ((long)bits - (long)tree[m].Len)
- *(long)tree[m].Freq;
- tree[m].Len = (ush)bits;
- }
- n--;
- }
- }
-}
-
-/* ===========================================================================
- * Generate the codes for a given tree and bit counts (which need not be
- * optimal).
- * IN assertion: the array bl_count contains the bit length statistics for
- * the given tree and the field len is set for all tree elements.
- * OUT assertion: the field code is set for all tree elements of non
- * zero code length.
- */
-local void gen_codes (tree, max_code, bl_count)
- ct_data *tree; /* the tree to decorate */
- int max_code; /* largest code with non zero frequency */
- ushf *bl_count; /* number of codes at each bit length */
-{
- ush next_code[MAX_BITS+1]; /* next code value for each bit length */
- ush code = 0; /* running code value */
- int bits; /* bit index */
- int n; /* code index */
-
- /* The distribution counts are first used to generate the code values
- * without bit reversal.
- */
- for (bits = 1; bits <= MAX_BITS; bits++) {
- next_code[bits] = code = (code + bl_count[bits-1]) << 1;
- }
- /* Check that the bit counts in bl_count are consistent. The last code
- * must be all ones.
- */
- Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
- "inconsistent bit counts");
- Tracev((stderr,"\ngen_codes: max_code %d ", max_code));
-
- for (n = 0; n <= max_code; n++) {
- int len = tree[n].Len;
- if (len == 0) continue;
- /* Now reverse the bits */
- tree[n].Code = bi_reverse(next_code[len]++, len);
-
- Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
- n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
- }
-}
-
-/* ===========================================================================
- * Construct one Huffman tree and assigns the code bit strings and lengths.
- * Update the total bit length for the current block.
- * IN assertion: the field freq is set for all tree elements.
- * OUT assertions: the fields len and code are set to the optimal bit length
- * and corresponding code. The length opt_len is updated; static_len is
- * also updated if stree is not null. The field max_code is set.
- */
-local void build_tree(s, desc)
- deflate_state *s;
- tree_desc *desc; /* the tree descriptor */
-{
- ct_data *tree = desc->dyn_tree;
- const ct_data *stree = desc->stat_desc->static_tree;
- int elems = desc->stat_desc->elems;
- int n, m; /* iterate over heap elements */
- int max_code = -1; /* largest code with non zero frequency */
- int node; /* new node being created */
-
- /* Construct the initial heap, with least frequent element in
- * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
- * heap[0] is not used.
- */
- s->heap_len = 0, s->heap_max = HEAP_SIZE;
-
- for (n = 0; n < elems; n++) {
- if (tree[n].Freq != 0) {
- s->heap[++(s->heap_len)] = max_code = n;
- s->depth[n] = 0;
- } else {
- tree[n].Len = 0;
- }
- }
-
- /* The pkzip format requires that at least one distance code exists,
- * and that at least one bit should be sent even if there is only one
- * possible code. So to avoid special checks later on we force at least
- * two codes of non zero frequency.
- */
- while (s->heap_len < 2) {
- node = s->heap[++(s->heap_len)] = (max_code < 2 ? ++max_code : 0);
- tree[node].Freq = 1;
- s->depth[node] = 0;
- s->opt_len--; if (stree) s->static_len -= stree[node].Len;
- /* node is 0 or 1 so it does not have extra bits */
- }
- desc->max_code = max_code;
-
- /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
- * establish sub-heaps of increasing lengths:
- */
- for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
-
- /* Construct the Huffman tree by repeatedly combining the least two
- * frequent nodes.
- */
- node = elems; /* next internal node of the tree */
- do {
- pqremove(s, tree, n); /* n = node of least frequency */
- m = s->heap[SMALLEST]; /* m = node of next least frequency */
-
- s->heap[--(s->heap_max)] = n; /* keep the nodes sorted by frequency */
- s->heap[--(s->heap_max)] = m;
-
- /* Create a new node father of n and m */
- tree[node].Freq = tree[n].Freq + tree[m].Freq;
- s->depth[node] = (uch)((s->depth[n] >= s->depth[m] ?
- s->depth[n] : s->depth[m]) + 1);
- tree[n].Dad = tree[m].Dad = (ush)node;
-#ifdef DUMP_BL_TREE
- if (tree == s->bl_tree) {
- fprintf(stderr,"\nnode %d(%d), sons %d(%d) %d(%d)",
- node, tree[node].Freq, n, tree[n].Freq, m, tree[m].Freq);
- }
-#endif
- /* and insert the new node in the heap */
- s->heap[SMALLEST] = node++;
- pqdownheap(s, tree, SMALLEST);
-
- } while (s->heap_len >= 2);
-
- s->heap[--(s->heap_max)] = s->heap[SMALLEST];
-
- /* At this point, the fields freq and dad are set. We can now
- * generate the bit lengths.
- */
- gen_bitlen(s, (tree_desc *)desc);
-
- /* The field len is now set, we can generate the bit codes */
- gen_codes ((ct_data *)tree, max_code, s->bl_count);
-}
-
-/* ===========================================================================
- * Scan a literal or distance tree to determine the frequencies of the codes
- * in the bit length tree.
- */
-local void scan_tree (s, tree, max_code)
- deflate_state *s;
- ct_data *tree; /* the tree to be scanned */
- int max_code; /* and its largest code of non zero frequency */
-{
- int n; /* iterates over all tree elements */
- int prevlen = -1; /* last emitted length */
- int curlen; /* length of current code */
- int nextlen = tree[0].Len; /* length of next code */
- int count = 0; /* repeat count of the current code */
- int max_count = 7; /* max repeat count */
- int min_count = 4; /* min repeat count */
-
- if (nextlen == 0) max_count = 138, min_count = 3;
- tree[max_code+1].Len = (ush)0xffff; /* guard */
-
- for (n = 0; n <= max_code; n++) {
- curlen = nextlen; nextlen = tree[n+1].Len;
- if (++count < max_count && curlen == nextlen) {
- continue;
- } else if (count < min_count) {
- s->bl_tree[curlen].Freq += count;
- } else if (curlen != 0) {
- if (curlen != prevlen) s->bl_tree[curlen].Freq++;
- s->bl_tree[REP_3_6].Freq++;
- } else if (count <= 10) {
- s->bl_tree[REPZ_3_10].Freq++;
- } else {
- s->bl_tree[REPZ_11_138].Freq++;
- }
- count = 0; prevlen = curlen;
- if (nextlen == 0) {
- max_count = 138, min_count = 3;
- } else if (curlen == nextlen) {
- max_count = 6, min_count = 3;
- } else {
- max_count = 7, min_count = 4;
- }
- }
-}
-
-/* ===========================================================================
- * Send a literal or distance tree in compressed form, using the codes in
- * bl_tree.
- */
-local void send_tree (s, tree, max_code)
- deflate_state *s;
- ct_data *tree; /* the tree to be scanned */
- int max_code; /* and its largest code of non zero frequency */
-{
- int n; /* iterates over all tree elements */
- int prevlen = -1; /* last emitted length */
- int curlen; /* length of current code */
- int nextlen = tree[0].Len; /* length of next code */
- int count = 0; /* repeat count of the current code */
- int max_count = 7; /* max repeat count */
- int min_count = 4; /* min repeat count */
-
- /* tree[max_code+1].Len = -1; */ /* guard already set */
- if (nextlen == 0) max_count = 138, min_count = 3;
-
- for (n = 0; n <= max_code; n++) {
- curlen = nextlen; nextlen = tree[n+1].Len;
- if (++count < max_count && curlen == nextlen) {
- continue;
- } else if (count < min_count) {
- do { send_code(s, curlen, s->bl_tree); } while (--count != 0);
-
- } else if (curlen != 0) {
- if (curlen != prevlen) {
- send_code(s, curlen, s->bl_tree); count--;
- }
- Assert(count >= 3 && count <= 6, " 3_6?");
- send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
-
- } else if (count <= 10) {
- send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
-
- } else {
- send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
- }
- count = 0; prevlen = curlen;
- if (nextlen == 0) {
- max_count = 138, min_count = 3;
- } else if (curlen == nextlen) {
- max_count = 6, min_count = 3;
- } else {
- max_count = 7, min_count = 4;
- }
- }
-}
-
-/* ===========================================================================
- * Construct the Huffman tree for the bit lengths and return the index in
- * bl_order of the last bit length code to send.
- */
-local int build_bl_tree(s)
- deflate_state *s;
-{
- int max_blindex; /* index of last bit length code of non zero freq */
-
- /* Determine the bit length frequencies for literal and distance trees */
- scan_tree(s, (ct_data *)s->dyn_ltree, s->l_desc.max_code);
- scan_tree(s, (ct_data *)s->dyn_dtree, s->d_desc.max_code);
-
- /* Build the bit length tree: */
- build_tree(s, (tree_desc *)(&(s->bl_desc)));
- /* opt_len now includes the length of the tree representations, except
- * the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
- */
-
- /* Determine the number of bit length codes to send. The pkzip format
- * requires that at least 4 bit length codes be sent. (appnote.txt says
- * 3 but the actual value used is 4.)
- */
- for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) {
- if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
- }
- /* Update opt_len to include the bit length tree and counts */
- s->opt_len += 3*(max_blindex+1) + 5+5+4;
- Tracev((stderr, "\ndyn trees: dyn %ld, stat %ld",
- s->opt_len, s->static_len));
-
- return max_blindex;
-}
-
-/* ===========================================================================
- * Send the header for a block using dynamic Huffman trees: the counts, the
- * lengths of the bit length codes, the literal tree and the distance tree.
- * IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
- */
-local void send_all_trees(s, lcodes, dcodes, blcodes)
- deflate_state *s;
- int lcodes, dcodes, blcodes; /* number of codes for each tree */
-{
- int rank; /* index in bl_order */
-
- Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
- Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
- "too many codes");
- Tracev((stderr, "\nbl counts: "));
- send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
- send_bits(s, dcodes-1, 5);
- send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */
- for (rank = 0; rank < blcodes; rank++) {
- Tracev((stderr, "\nbl code %2d ", bl_order[rank]));
- send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
- }
- Tracev((stderr, "\nbl tree: sent %ld", s->bits_sent));
-
- send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
- Tracev((stderr, "\nlit tree: sent %ld", s->bits_sent));
-
- send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
- Tracev((stderr, "\ndist tree: sent %ld", s->bits_sent));
-}
-
-/* ===========================================================================
- * Send a stored block
- */
-void _tr_stored_block(s, buf, stored_len, eof)
- deflate_state *s;
- charf *buf; /* input block */
- ulg stored_len; /* length of input block */
- int eof; /* true if this is the last block for a file */
-{
- send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */
-#ifdef DEBUG
- s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
- s->compressed_len += (stored_len + 4) << 3;
-#endif
- copy_block(s, buf, (unsigned)stored_len, 1); /* with header */
-}
-
-/* ===========================================================================
- * Send one empty static block to give enough lookahead for inflate.
- * This takes 10 bits, of which 7 may remain in the bit buffer.
- * The current inflate code requires 9 bits of lookahead. If the
- * last two codes for the previous block (real code plus EOB) were coded
- * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode
- * the last real code. In this case we send two empty static blocks instead
- * of one. (There are no problems if the previous block is stored or fixed.)
- * To simplify the code, we assume the worst case of last real code encoded
- * on one bit only.
- */
-void _tr_align(s)
- deflate_state *s;
-{
- send_bits(s, STATIC_TREES<<1, 3);
- send_code(s, END_BLOCK, static_ltree);
-#ifdef DEBUG
- s->compressed_len += 10L; /* 3 for block type, 7 for EOB */
-#endif
- bi_flush(s);
- /* Of the 10 bits for the empty block, we have already sent
- * (10 - bi_valid) bits. The lookahead for the last real code (before
- * the EOB of the previous block) was thus at least one plus the length
- * of the EOB plus what we have just sent of the empty static block.
- */
- if (1 + s->last_eob_len + 10 - s->bi_valid < 9) {
- send_bits(s, STATIC_TREES<<1, 3);
- send_code(s, END_BLOCK, static_ltree);
-#ifdef DEBUG
- s->compressed_len += 10L;
-#endif
- bi_flush(s);
- }
- s->last_eob_len = 7;
-}
-
-/* ===========================================================================
- * Determine the best encoding for the current block: dynamic trees, static
- * trees or store, and output the encoded block to the zip file.
- */
-void _tr_flush_block(s, buf, stored_len, eof)
- deflate_state *s;
- charf *buf; /* input block, or NULL if too old */
- ulg stored_len; /* length of input block */
- int eof; /* true if this is the last block for a file */
-{
- ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
- int max_blindex = 0; /* index of last bit length code of non zero freq */
-
- /* Build the Huffman trees unless a stored block is forced */
- if (s->level > 0) {
-
- /* Check if the file is binary or text */
- if (stored_len > 0 && s->strm->data_type == Z_UNKNOWN)
- set_data_type(s);
-
- /* Construct the literal and distance trees */
- build_tree(s, (tree_desc *)(&(s->l_desc)));
- Tracev((stderr, "\nlit data: dyn %ld, stat %ld", s->opt_len,
- s->static_len));
-
- build_tree(s, (tree_desc *)(&(s->d_desc)));
- Tracev((stderr, "\ndist data: dyn %ld, stat %ld", s->opt_len,
- s->static_len));
- /* At this point, opt_len and static_len are the total bit lengths of
- * the compressed block data, excluding the tree representations.
- */
-
- /* Build the bit length tree for the above two trees, and get the index
- * in bl_order of the last bit length code to send.
- */
- max_blindex = build_bl_tree(s);
-
- /* Determine the best encoding. Compute the block lengths in bytes. */
- opt_lenb = (s->opt_len+3+7)>>3;
- static_lenb = (s->static_len+3+7)>>3;
-
- Tracev((stderr, "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
- opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
- s->last_lit));
-
- if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
-
- } else {
- Assert(buf != (char*)0, "lost buf");
- opt_lenb = static_lenb = stored_len + 5; /* force a stored block */
- }
-
-#ifdef FORCE_STORED
- if (buf != (char*)0) { /* force stored block */
-#else
- if (stored_len+4 <= opt_lenb && buf != (char*)0) {
- /* 4: two words for the lengths */
-#endif
- /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
- * Otherwise we can't have processed more than WSIZE input bytes since
- * the last block flush, because compression would have been
- * successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
- * transform a block into a stored block.
- */
- _tr_stored_block(s, buf, stored_len, eof);
-
-#ifdef FORCE_STATIC
- } else if (static_lenb >= 0) { /* force static trees */
-#else
- } else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
-#endif
- send_bits(s, (STATIC_TREES<<1)+eof, 3);
- compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree);
-#ifdef DEBUG
- s->compressed_len += 3 + s->static_len;
-#endif
- } else {
- send_bits(s, (DYN_TREES<<1)+eof, 3);
- send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
- max_blindex+1);
- compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree);
-#ifdef DEBUG
- s->compressed_len += 3 + s->opt_len;
-#endif
- }
- Assert (s->compressed_len == s->bits_sent, "bad compressed size");
- /* The above check is made mod 2^32, for files larger than 512 MB
- * and uLong implemented on 32 bits.
- */
- init_block(s);
-
- if (eof) {
- bi_windup(s);
-#ifdef DEBUG
- s->compressed_len += 7; /* align on byte boundary */
-#endif
- }
- Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3,
- s->compressed_len-7*eof));
-}
-
-/* ===========================================================================
- * Save the match info and tally the frequency counts. Return true if
- * the current block must be flushed.
- */
-int _tr_tally (s, dist, lc)
- deflate_state *s;
- unsigned dist; /* distance of matched string */
- unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */
-{
- s->d_buf[s->last_lit] = (ush)dist;
- s->l_buf[s->last_lit++] = (uch)lc;
- if (dist == 0) {
- /* lc is the unmatched char */
- s->dyn_ltree[lc].Freq++;
- } else {
- s->matches++;
- /* Here, lc is the match length - MIN_MATCH */
- dist--; /* dist = match distance - 1 */
- Assert((ush)dist < (ush)MAX_DIST(s) &&
- (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
- (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match");
-
- s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
- s->dyn_dtree[d_code(dist)].Freq++;
- }
-
-#ifdef TRUNCATE_BLOCK
- /* Try to guess if it is profitable to stop the current block here */
- if ((s->last_lit & 0x1fff) == 0 && s->level > 2) {
- /* Compute an upper bound for the compressed length */
- ulg out_length = (ulg)s->last_lit*8L;
- ulg in_length = (ulg)((long)s->strstart - s->block_start);
- int dcode;
- for (dcode = 0; dcode < D_CODES; dcode++) {
- out_length += (ulg)s->dyn_dtree[dcode].Freq *
- (5L+extra_dbits[dcode]);
- }
- out_length >>= 3;
- Tracev((stderr,"\nlast_lit %u, in %ld, out ~%ld(%ld%%) ",
- s->last_lit, in_length, out_length,
- 100L - out_length*100L/in_length));
- if (s->matches < s->last_lit/2 && out_length < in_length/2) return 1;
- }
-#endif
- return (s->last_lit == s->lit_bufsize-1);
- /* We avoid equality with lit_bufsize because of wraparound at 64K
- * on 16 bit machines and because stored blocks are restricted to
- * 64K-1 bytes.
- */
-}
-
-/* ===========================================================================
- * Send the block data compressed using the given Huffman trees
- */
-local void compress_block(s, ltree, dtree)
- deflate_state *s;
- ct_data *ltree; /* literal tree */
- ct_data *dtree; /* distance tree */
-{
- unsigned dist; /* distance of matched string */
- int lc; /* match length or unmatched char (if dist == 0) */
- unsigned lx = 0; /* running index in l_buf */
- unsigned code; /* the code to send */
- int extra; /* number of extra bits to send */
-
- if (s->last_lit != 0) do {
- dist = s->d_buf[lx];
- lc = s->l_buf[lx++];
- if (dist == 0) {
- send_code(s, lc, ltree); /* send a literal byte */
- Tracecv(isgraph(lc), (stderr," '%c' ", lc));
- } else {
- /* Here, lc is the match length - MIN_MATCH */
- code = _length_code[lc];
- send_code(s, code+LITERALS+1, ltree); /* send the length code */
- extra = extra_lbits[code];
- if (extra != 0) {
- lc -= base_length[code];
- send_bits(s, lc, extra); /* send the extra length bits */
- }
- dist--; /* dist is now the match distance - 1 */
- code = d_code(dist);
- Assert (code < D_CODES, "bad d_code");
-
- send_code(s, code, dtree); /* send the distance code */
- extra = extra_dbits[code];
- if (extra != 0) {
- dist -= base_dist[code];
- send_bits(s, dist, extra); /* send the extra distance bits */
- }
- } /* literal or match pair ? */
-
- /* Check that the overlay between pending_buf and d_buf+l_buf is ok: */
- Assert((uInt)(s->pending) < s->lit_bufsize + 2*lx,
- "pendingBuf overflow");
-
- } while (lx < s->last_lit);
-
- send_code(s, END_BLOCK, ltree);
- s->last_eob_len = ltree[END_BLOCK].Len;
-}
-
-/* ===========================================================================
- * Set the data type to BINARY or TEXT, using a crude approximation:
- * set it to Z_TEXT if all symbols are either printable characters (33 to 255)
- * or white spaces (9 to 13, or 32); or set it to Z_BINARY otherwise.
- * IN assertion: the fields Freq of dyn_ltree are set.
- */
-local void set_data_type(s)
- deflate_state *s;
-{
- int n;
-
- for (n = 0; n < 9; n++)
- if (s->dyn_ltree[n].Freq != 0)
- break;
- if (n == 9)
- for (n = 14; n < 32; n++)
- if (s->dyn_ltree[n].Freq != 0)
- break;
- s->strm->data_type = (n == 32) ? Z_TEXT : Z_BINARY;
-}
-
-/* ===========================================================================
- * Reverse the first len bits of a code, using straightforward code (a faster
- * method would use a table)
- * IN assertion: 1 <= len <= 15
- */
-local unsigned bi_reverse(code, len)
- unsigned code; /* the value to invert */
- int len; /* its bit length */
-{
- register unsigned res = 0;
- do {
- res |= code & 1;
- code >>= 1, res <<= 1;
- } while (--len > 0);
- return res >> 1;
-}
-
-/* ===========================================================================
- * Flush the bit buffer, keeping at most 7 bits in it.
- */
-local void bi_flush(s)
- deflate_state *s;
-{
- if (s->bi_valid == 16) {
- put_short(s, s->bi_buf);
- s->bi_buf = 0;
- s->bi_valid = 0;
- } else if (s->bi_valid >= 8) {
- put_byte(s, (Byte)s->bi_buf);
- s->bi_buf >>= 8;
- s->bi_valid -= 8;
- }
-}
-
-/* ===========================================================================
- * Flush the bit buffer and align the output on a byte boundary
- */
-local void bi_windup(s)
- deflate_state *s;
-{
- if (s->bi_valid > 8) {
- put_short(s, s->bi_buf);
- } else if (s->bi_valid > 0) {
- put_byte(s, (Byte)s->bi_buf);
- }
- s->bi_buf = 0;
- s->bi_valid = 0;
-#ifdef DEBUG
- s->bits_sent = (s->bits_sent+7) & ~7;
-#endif
-}
-
-/* ===========================================================================
- * Copy a stored block, storing first the length and its
- * one's complement if requested.
- */
-local void copy_block(s, buf, len, header)
- deflate_state *s;
- charf *buf; /* the input data */
- unsigned len; /* its length */
- int header; /* true if block header must be written */
-{
- bi_windup(s); /* align on byte boundary */
- s->last_eob_len = 8; /* enough lookahead for inflate */
-
- if (header) {
- put_short(s, (ush)len);
- put_short(s, (ush)~len);
-#ifdef DEBUG
- s->bits_sent += 2*16;
-#endif
- }
-#ifdef DEBUG
- s->bits_sent += (ulg)len<<3;
-#endif
- while (len--) {
- put_byte(s, *buf++);
- }
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zconf.h b/sys/contrib/opensolaris/uts/common/zmod/zconf.h
deleted file mode 100644
index ccce7b2..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/zconf.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ZCONF_H
-#define _ZCONF_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/types.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * We don't want to turn on zlib's debugging.
- */
-#undef DEBUG
-
-/*
- * We define our own memory allocation and deallocation routines that use kmem.
- */
-#define MY_ZCALLOC
-
-/*
- * We don't define HAVE_MEMCPY here, but do in zutil.c, and implement our
- * our versions of zmemcpy(), zmemzero(), and zmemcmp().
- */
-
-/*
- * We have a sufficiently capable compiler as to not need zlib's compiler hack.
- */
-#define NO_DUMMY_DECL
-
-#define compressBound(len) (len + (len >> 12) + (len >> 14) + 11)
-
-#define z_off_t off_t
-#define OF(p) p
-#define ZEXTERN extern
-#define ZEXPORT
-#define ZEXPORTVA
-#define FAR
-
-#define deflateInit_ z_deflateInit_
-#define deflate z_deflate
-#define deflateEnd z_deflateEnd
-#define inflateInit_ z_inflateInit_
-#define inflate z_inflate
-#define inflateEnd z_inflateEnd
-#define deflateInit2_ z_deflateInit2_
-#define deflateSetDictionary z_deflateSetDictionary
-#define deflateCopy z_deflateCopy
-#define deflateReset z_deflateReset
-#define deflateParams z_deflateParams
-#define deflateBound z_deflateBound
-#define deflatePrime z_deflatePrime
-#define inflateInit2_ z_inflateInit2_
-#define inflateSetDictionary z_inflateSetDictionary
-#define inflateSync z_inflateSync
-#define inflateSyncPoint z_inflateSyncPoint
-#define inflateCopy z_inflateCopy
-#define inflateReset z_inflateReset
-#define inflateBack z_inflateBack
-#define inflateBackEnd z_inflateBackEnd
-#define compress zz_compress
-#define compress2 zz_compress2
-#define uncompress zz_uncompress
-#define adler32 z_adler32
-#define crc32 z_crc32
-#define get_crc_table z_get_crc_table
-#define zError z_zError
-
-#define MAX_MEM_LEVEL 9
-#define MAX_WBITS 15
-
-typedef unsigned char Byte;
-typedef unsigned int uInt;
-typedef unsigned long uLong;
-typedef Byte Bytef;
-typedef char charf;
-typedef int intf;
-typedef uInt uIntf;
-typedef uLong uLongf;
-typedef void *voidpc;
-typedef void *voidpf;
-typedef void *voidp;
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZCONF_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zlib.h b/sys/contrib/opensolaris/uts/common/zmod/zlib.h
deleted file mode 100644
index 9b971a0..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/zlib.h
+++ /dev/null
@@ -1,1359 +0,0 @@
-/* zlib.h -- interface of the 'zlib' general purpose compression library
- version 1.2.3, July 18th, 2005
-
- Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
-
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any damages
- arising from the use of this software.
-
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it
- freely, subject to the following restrictions:
-
- 1. The origin of this software must not be misrepresented; you must not
- claim that you wrote the original software. If you use this software
- in a product, an acknowledgment in the product documentation would be
- appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
-
- Jean-loup Gailly Mark Adler
- jloup@gzip.org madler@alumni.caltech.edu
-
-
- The data format used by the zlib library is described by RFCs (Request for
- Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
- (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
-*/
-
-#ifndef _ZLIB_H
-#define _ZLIB_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "zconf.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ZLIB_VERSION "1.2.3"
-#define ZLIB_VERNUM 0x1230
-
-/*
- The 'zlib' compression library provides in-memory compression and
- decompression functions, including integrity checks of the uncompressed
- data. This version of the library supports only one compression method
- (deflation) but other algorithms will be added later and will have the same
- stream interface.
-
- Compression can be done in a single step if the buffers are large
- enough (for example if an input file is mmap'ed), or can be done by
- repeated calls of the compression function. In the latter case, the
- application must provide more input and/or consume the output
- (providing more output space) before each call.
-
- The compressed data format used by default by the in-memory functions is
- the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
- around a deflate stream, which is itself documented in RFC 1951.
-
- The library also supports reading and writing files in gzip (.gz) format
- with an interface similar to that of stdio using the functions that start
- with "gz". The gzip format is different from the zlib format. gzip is a
- gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
-
- This library can optionally read and write gzip streams in memory as well.
-
- The zlib format was designed to be compact and fast for use in memory
- and on communications channels. The gzip format was designed for single-
- file compression on file systems, has a larger header than zlib to maintain
- directory information, and uses a different, slower check method than zlib.
-
- The library does not install any signal handler. The decoder checks
- the consistency of the compressed data, so the library should never
- crash even in case of corrupted input.
-*/
-
-typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
-typedef void (*free_func) OF((voidpf opaque, voidpf address));
-
-struct internal_state;
-
-typedef struct z_stream_s {
- Bytef *next_in; /* next input byte */
- uInt avail_in; /* number of bytes available at next_in */
- uLong total_in; /* total nb of input bytes read so far */
-
- Bytef *next_out; /* next output byte should be put there */
- uInt avail_out; /* remaining free space at next_out */
- uLong total_out; /* total nb of bytes output so far */
-
- char *msg; /* last error message, NULL if no error */
- struct internal_state FAR *state; /* not visible by applications */
-
- alloc_func zalloc; /* used to allocate the internal state */
- free_func zfree; /* used to free the internal state */
- voidpf opaque; /* private data object passed to zalloc and zfree */
-
- int data_type; /* best guess about the data type: binary or text */
- uLong adler; /* adler32 value of the uncompressed data */
- uLong reserved; /* reserved for future use */
-} z_stream;
-
-typedef z_stream FAR *z_streamp;
-
-/*
- gzip header information passed to and from zlib routines. See RFC 1952
- for more details on the meanings of these fields.
-*/
-typedef struct gz_header_s {
- int text; /* true if compressed data believed to be text */
- uLong time; /* modification time */
- int xflags; /* extra flags (not used when writing a gzip file) */
- int os; /* operating system */
- Bytef *extra; /* pointer to extra field or Z_NULL if none */
- uInt extra_len; /* extra field length (valid if extra != Z_NULL) */
- uInt extra_max; /* space at extra (only when reading header) */
- Bytef *name; /* pointer to zero-terminated file name or Z_NULL */
- uInt name_max; /* space at name (only when reading header) */
- Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */
- uInt comm_max; /* space at comment (only when reading header) */
- int hcrc; /* true if there was or will be a header crc */
- int done; /* true when done reading gzip header (not used
- when writing a gzip file) */
-} gz_header;
-
-typedef gz_header FAR *gz_headerp;
-
-/*
- The application must update next_in and avail_in when avail_in has
- dropped to zero. It must update next_out and avail_out when avail_out
- has dropped to zero. The application must initialize zalloc, zfree and
- opaque before calling the init function. All other fields are set by the
- compression library and must not be updated by the application.
-
- The opaque value provided by the application will be passed as the first
- parameter for calls of zalloc and zfree. This can be useful for custom
- memory management. The compression library attaches no meaning to the
- opaque value.
-
- zalloc must return Z_NULL if there is not enough memory for the object.
- If zlib is used in a multi-threaded application, zalloc and zfree must be
- thread safe.
-
- On 16-bit systems, the functions zalloc and zfree must be able to allocate
- exactly 65536 bytes, but will not be required to allocate more than this
- if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
- pointers returned by zalloc for objects of exactly 65536 bytes *must*
- have their offset normalized to zero. The default allocation function
- provided by this library ensures this (see zutil.c). To reduce memory
- requirements and avoid any allocation of 64K objects, at the expense of
- compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
-
- The fields total_in and total_out can be used for statistics or
- progress reports. After compression, total_in holds the total size of
- the uncompressed data and may be saved for use in the decompressor
- (particularly if the decompressor wants to decompress everything in
- a single step).
-*/
-
- /* constants */
-
-#define Z_NO_FLUSH 0
-#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
-#define Z_SYNC_FLUSH 2
-#define Z_FULL_FLUSH 3
-#define Z_FINISH 4
-#define Z_BLOCK 5
-/* Allowed flush values; see deflate() and inflate() below for details */
-
-#define Z_OK 0
-#define Z_STREAM_END 1
-#define Z_NEED_DICT 2
-#define Z_ERRNO (-1)
-#define Z_STREAM_ERROR (-2)
-#define Z_DATA_ERROR (-3)
-#define Z_MEM_ERROR (-4)
-#define Z_BUF_ERROR (-5)
-#define Z_VERSION_ERROR (-6)
-/* Return codes for the compression/decompression functions. Negative
- * values are errors, positive values are used for special but normal events.
- */
-
-#define Z_NO_COMPRESSION 0
-#define Z_BEST_SPEED 1
-#define Z_BEST_COMPRESSION 9
-#define Z_DEFAULT_COMPRESSION (-1)
-/* compression levels */
-
-#define Z_FILTERED 1
-#define Z_HUFFMAN_ONLY 2
-#define Z_RLE 3
-#define Z_FIXED 4
-#define Z_DEFAULT_STRATEGY 0
-/* compression strategy; see deflateInit2() below for details */
-
-#define Z_BINARY 0
-#define Z_TEXT 1
-#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */
-#define Z_UNKNOWN 2
-/* Possible values of the data_type field (though see inflate()) */
-
-#define Z_DEFLATED 8
-/* The deflate compression method (the only one supported in this version) */
-
-#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */
-
-#define zlib_version zlibVersion()
-/* for compatibility with versions < 1.0.2 */
-
- /* basic functions */
-
-ZEXTERN const char * ZEXPORT zlibVersion OF((void));
-/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
- If the first character differs, the library code actually used is
- not compatible with the zlib.h header file used by the application.
- This check is automatically made by deflateInit and inflateInit.
- */
-
-/*
-ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
-
- Initializes the internal stream state for compression. The fields
- zalloc, zfree and opaque must be initialized before by the caller.
- If zalloc and zfree are set to Z_NULL, deflateInit updates them to
- use default allocation functions.
-
- The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
- 1 gives best speed, 9 gives best compression, 0 gives no compression at
- all (the input data is simply copied a block at a time).
- Z_DEFAULT_COMPRESSION requests a default compromise between speed and
- compression (currently equivalent to level 6).
-
- deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if level is not a valid compression level,
- Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
- with the version assumed by the caller (ZLIB_VERSION).
- msg is set to null if there is no error message. deflateInit does not
- perform any compression: this will be done by deflate().
-*/
-
-
-ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
-/*
- deflate compresses as much data as possible, and stops when the input
- buffer becomes empty or the output buffer becomes full. It may introduce some
- output latency (reading input without producing any output) except when
- forced to flush.
-
- The detailed semantics are as follows. deflate performs one or both of the
- following actions:
-
- - Compress more input starting at next_in and update next_in and avail_in
- accordingly. If not all input can be processed (because there is not
- enough room in the output buffer), next_in and avail_in are updated and
- processing will resume at this point for the next call of deflate().
-
- - Provide more output starting at next_out and update next_out and avail_out
- accordingly. This action is forced if the parameter flush is non zero.
- Forcing flush frequently degrades the compression ratio, so this parameter
- should be set only when necessary (in interactive applications).
- Some output may be provided even if flush is not set.
-
- Before the call of deflate(), the application should ensure that at least
- one of the actions is possible, by providing more input and/or consuming
- more output, and updating avail_in or avail_out accordingly; avail_out
- should never be zero before the call. The application can consume the
- compressed output when it wants, for example when the output buffer is full
- (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
- and with zero avail_out, it must be called again after making room in the
- output buffer because there might be more output pending.
-
- Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
- decide how much data to accumulate before producing output, in order to
- maximize compression.
-
- If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
- flushed to the output buffer and the output is aligned on a byte boundary, so
- that the decompressor can get all input data available so far. (In particular
- avail_in is zero after the call if enough output space has been provided
- before the call.) Flushing may degrade compression for some compression
- algorithms and so it should be used only when necessary.
-
- If flush is set to Z_FULL_FLUSH, all output is flushed as with
- Z_SYNC_FLUSH, and the compression state is reset so that decompression can
- restart from this point if previous compressed data has been damaged or if
- random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
- compression.
-
- If deflate returns with avail_out == 0, this function must be called again
- with the same value of the flush parameter and more output space (updated
- avail_out), until the flush is complete (deflate returns with non-zero
- avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
- avail_out is greater than six to avoid repeated flush markers due to
- avail_out == 0 on return.
-
- If the parameter flush is set to Z_FINISH, pending input is processed,
- pending output is flushed and deflate returns with Z_STREAM_END if there
- was enough output space; if deflate returns with Z_OK, this function must be
- called again with Z_FINISH and more output space (updated avail_out) but no
- more input data, until it returns with Z_STREAM_END or an error. After
- deflate has returned Z_STREAM_END, the only possible operations on the
- stream are deflateReset or deflateEnd.
-
- Z_FINISH can be used immediately after deflateInit if all the compression
- is to be done in a single step. In this case, avail_out must be at least
- the value returned by deflateBound (see below). If deflate does not return
- Z_STREAM_END, then it must be called again as described above.
-
- deflate() sets strm->adler to the adler32 checksum of all input read
- so far (that is, total_in bytes).
-
- deflate() may update strm->data_type if it can make a good guess about
- the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
- binary. This field is only for information purposes and does not affect
- the compression algorithm in any manner.
-
- deflate() returns Z_OK if some progress has been made (more input
- processed or more output produced), Z_STREAM_END if all input has been
- consumed and all output has been produced (only when flush is set to
- Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
- if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
- (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
- fatal, and deflate() can be called again with more input and more output
- space to continue compressing.
-*/
-
-
-ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
-/*
- All dynamically allocated data structures for this stream are freed.
- This function discards any unprocessed input and does not flush any
- pending output.
-
- deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
- stream state was inconsistent, Z_DATA_ERROR if the stream was freed
- prematurely (some input or output was discarded). In the error case,
- msg may be set but then points to a static string (which must not be
- deallocated).
-*/
-
-
-/*
-ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
-
- Initializes the internal stream state for decompression. The fields
- next_in, avail_in, zalloc, zfree and opaque must be initialized before by
- the caller. If next_in is not Z_NULL and avail_in is large enough (the exact
- value depends on the compression method), inflateInit determines the
- compression method from the zlib header and allocates all data structures
- accordingly; otherwise the allocation will be deferred to the first call of
- inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
- use default allocation functions.
-
- inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
- version assumed by the caller. msg is set to null if there is no error
- message. inflateInit does not perform any decompression apart from reading
- the zlib header if present: this will be done by inflate(). (So next_in and
- avail_in may be modified, but next_out and avail_out are unchanged.)
-*/
-
-
-ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
-/*
- inflate decompresses as much data as possible, and stops when the input
- buffer becomes empty or the output buffer becomes full. It may introduce
- some output latency (reading input without producing any output) except when
- forced to flush.
-
- The detailed semantics are as follows. inflate performs one or both of the
- following actions:
-
- - Decompress more input starting at next_in and update next_in and avail_in
- accordingly. If not all input can be processed (because there is not
- enough room in the output buffer), next_in is updated and processing
- will resume at this point for the next call of inflate().
-
- - Provide more output starting at next_out and update next_out and avail_out
- accordingly. inflate() provides as much output as possible, until there
- is no more input data or no more space in the output buffer (see below
- about the flush parameter).
-
- Before the call of inflate(), the application should ensure that at least
- one of the actions is possible, by providing more input and/or consuming
- more output, and updating the next_* and avail_* values accordingly.
- The application can consume the uncompressed output when it wants, for
- example when the output buffer is full (avail_out == 0), or after each
- call of inflate(). If inflate returns Z_OK and with zero avail_out, it
- must be called again after making room in the output buffer because there
- might be more output pending.
-
- The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
- Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
- output as possible to the output buffer. Z_BLOCK requests that inflate() stop
- if and when it gets to the next deflate block boundary. When decoding the
- zlib or gzip format, this will cause inflate() to return immediately after
- the header and before the first block. When doing a raw inflate, inflate()
- will go ahead and process the first block, and will return when it gets to
- the end of that block, or when it runs out of data.
-
- The Z_BLOCK option assists in appending to or combining deflate streams.
- Also to assist in this, on return inflate() will set strm->data_type to the
- number of unused bits in the last byte taken from strm->next_in, plus 64
- if inflate() is currently decoding the last block in the deflate stream,
- plus 128 if inflate() returned immediately after decoding an end-of-block
- code or decoding the complete header up to just before the first byte of the
- deflate stream. The end-of-block will not be indicated until all of the
- uncompressed data from that block has been written to strm->next_out. The
- number of unused bits may in general be greater than seven, except when
- bit 7 of data_type is set, in which case the number of unused bits will be
- less than eight.
-
- inflate() should normally be called until it returns Z_STREAM_END or an
- error. However if all decompression is to be performed in a single step
- (a single call of inflate), the parameter flush should be set to
- Z_FINISH. In this case all pending input is processed and all pending
- output is flushed; avail_out must be large enough to hold all the
- uncompressed data. (The size of the uncompressed data may have been saved
- by the compressor for this purpose.) The next operation on this stream must
- be inflateEnd to deallocate the decompression state. The use of Z_FINISH
- is never required, but can be used to inform inflate that a faster approach
- may be used for the single inflate() call.
-
- In this implementation, inflate() always flushes as much output as
- possible to the output buffer, and always uses the faster approach on the
- first call. So the only effect of the flush parameter in this implementation
- is on the return value of inflate(), as noted below, or when it returns early
- because Z_BLOCK is used.
-
- If a preset dictionary is needed after this call (see inflateSetDictionary
- below), inflate sets strm->adler to the adler32 checksum of the dictionary
- chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
- strm->adler to the adler32 checksum of all output produced so far (that is,
- total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
- below. At the end of the stream, inflate() checks that its computed adler32
- checksum is equal to that saved by the compressor and returns Z_STREAM_END
- only if the checksum is correct.
-
- inflate() will decompress and check either zlib-wrapped or gzip-wrapped
- deflate data. The header type is detected automatically. Any information
- contained in the gzip header is not retained, so applications that need that
- information should instead use raw inflate, see inflateInit2() below, or
- inflateBack() and perform their own processing of the gzip header and
- trailer.
-
- inflate() returns Z_OK if some progress has been made (more input processed
- or more output produced), Z_STREAM_END if the end of the compressed data has
- been reached and all uncompressed output has been produced, Z_NEED_DICT if a
- preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
- corrupted (input stream not conforming to the zlib format or incorrect check
- value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
- if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
- Z_BUF_ERROR if no progress is possible or if there was not enough room in the
- output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
- inflate() can be called again with more input and more output space to
- continue decompressing. If Z_DATA_ERROR is returned, the application may then
- call inflateSync() to look for a good compression block if a partial recovery
- of the data is desired.
-*/
-
-
-ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
-/*
- All dynamically allocated data structures for this stream are freed.
- This function discards any unprocessed input and does not flush any
- pending output.
-
- inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
- was inconsistent. In the error case, msg may be set but then points to a
- static string (which must not be deallocated).
-*/
-
- /* Advanced functions */
-
-/*
- The following functions are needed only in some special applications.
-*/
-
-/*
-ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
- int level,
- int method,
- int windowBits,
- int memLevel,
- int strategy));
-
- This is another version of deflateInit with more compression options. The
- fields next_in, zalloc, zfree and opaque must be initialized before by
- the caller.
-
- The method parameter is the compression method. It must be Z_DEFLATED in
- this version of the library.
-
- The windowBits parameter is the base two logarithm of the window size
- (the size of the history buffer). It should be in the range 8..15 for this
- version of the library. Larger values of this parameter result in better
- compression at the expense of memory usage. The default value is 15 if
- deflateInit is used instead.
-
- windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
- determines the window size. deflate() will then generate raw deflate data
- with no zlib header or trailer, and will not compute an adler32 check value.
-
- windowBits can also be greater than 15 for optional gzip encoding. Add
- 16 to windowBits to write a simple gzip header and trailer around the
- compressed data instead of a zlib wrapper. The gzip header will have no
- file name, no extra data, no comment, no modification time (set to zero),
- no header crc, and the operating system will be set to 255 (unknown). If a
- gzip stream is being written, strm->adler is a crc32 instead of an adler32.
-
- The memLevel parameter specifies how much memory should be allocated
- for the internal compression state. memLevel=1 uses minimum memory but
- is slow and reduces compression ratio; memLevel=9 uses maximum memory
- for optimal speed. The default value is 8. See zconf.h for total memory
- usage as a function of windowBits and memLevel.
-
- The strategy parameter is used to tune the compression algorithm. Use the
- value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
- filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
- string match), or Z_RLE to limit match distances to one (run-length
- encoding). Filtered data consists mostly of small values with a somewhat
- random distribution. In this case, the compression algorithm is tuned to
- compress them better. The effect of Z_FILTERED is to force more Huffman
- coding and less string matching; it is somewhat intermediate between
- Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as
- Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy
- parameter only affects the compression ratio but not the correctness of the
- compressed output even if it is not set appropriately. Z_FIXED prevents the
- use of dynamic Huffman codes, allowing for a simpler decoder for special
- applications.
-
- deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
- method). msg is set to null if there is no error message. deflateInit2 does
- not perform any compression: this will be done by deflate().
-*/
-
-ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
- const Bytef *dictionary,
- uInt dictLength));
-/*
- Initializes the compression dictionary from the given byte sequence
- without producing any compressed output. This function must be called
- immediately after deflateInit, deflateInit2 or deflateReset, before any
- call of deflate. The compressor and decompressor must use exactly the same
- dictionary (see inflateSetDictionary).
-
- The dictionary should consist of strings (byte sequences) that are likely
- to be encountered later in the data to be compressed, with the most commonly
- used strings preferably put towards the end of the dictionary. Using a
- dictionary is most useful when the data to be compressed is short and can be
- predicted with good accuracy; the data can then be compressed better than
- with the default empty dictionary.
-
- Depending on the size of the compression data structures selected by
- deflateInit or deflateInit2, a part of the dictionary may in effect be
- discarded, for example if the dictionary is larger than the window size in
- deflateInit or deflateInit2. Thus the strings most likely to be useful should be
- put at the end of the dictionary, not at the front. In addition, the
- current implementation of deflate will use at most the window size minus
- 262 bytes of the provided dictionary.
-
- Upon return of this function, strm->adler is set to the adler32 value
- of the dictionary; the decompressor may later use this value to determine
- which dictionary has been used by the compressor. (The adler32 value
- applies to the whole dictionary even if only a subset of the dictionary is
- actually used by the compressor.) If a raw deflate was requested, then the
- adler32 value is not computed and strm->adler is not set.
-
- deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
- parameter is invalid (such as NULL dictionary) or the stream state is
- inconsistent (for example if deflate has already been called for this stream
- or if the compression method is bsort). deflateSetDictionary does not
- perform any compression: this will be done by deflate().
-*/
-
-ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
- z_streamp source));
-/*
- Sets the destination stream as a complete copy of the source stream.
-
- This function can be useful when several compression strategies will be
- tried, for example when there are several ways of pre-processing the input
- data with a filter. The streams that will be discarded should then be freed
- by calling deflateEnd. Note that deflateCopy duplicates the internal
- compression state which can be quite large, so this strategy is slow and
- can consume lots of memory.
-
- deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
- (such as zalloc being NULL). msg is left unchanged in both source and
- destination.
-*/
-
-ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
-/*
- This function is equivalent to deflateEnd followed by deflateInit,
- but does not free and reallocate all the internal compression state.
- The stream will keep the same compression level and any other attributes
- that may have been set by deflateInit2.
-
- deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent (such as zalloc or state being NULL).
-*/
-
-ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
- int level,
- int strategy));
-/*
- Dynamically update the compression level and compression strategy. The
- interpretation of level and strategy is as in deflateInit2. This can be
- used to switch between compression and straight copy of the input data, or
- to switch to a different kind of input data requiring a different
- strategy. If the compression level is changed, the input available so far
- is compressed with the old level (and may be flushed); the new level will
- take effect only at the next call of deflate().
-
- Before the call of deflateParams, the stream state must be set as for
- a call of deflate(), since the currently available input may have to
- be compressed and flushed. In particular, strm->avail_out must be non-zero.
-
- deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
- stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
- if strm->avail_out was zero.
-*/
-
-ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
- int good_length,
- int max_lazy,
- int nice_length,
- int max_chain));
-/*
- Fine tune deflate's internal compression parameters. This should only be
- used by someone who understands the algorithm used by zlib's deflate for
- searching for the best matching string, and even then only by the most
- fanatic optimizer trying to squeeze out the last compressed bit for their
- specific input data. Read the deflate.c source code for the meaning of the
- max_lazy, good_length, nice_length, and max_chain parameters.
-
- deflateTune() can be called after deflateInit() or deflateInit2(), and
- returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
- */
-
-ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
- uLong sourceLen));
-/*
- deflateBound() returns an upper bound on the compressed size after
- deflation of sourceLen bytes. It must be called after deflateInit()
- or deflateInit2(). This would be used to allocate an output buffer
- for deflation in a single pass, and so would be called before deflate().
-*/
-
-ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
- int bits,
- int value));
-/*
- deflatePrime() inserts bits in the deflate output stream. The intent
- is that this function is used to start off the deflate output with the
- bits leftover from a previous deflate stream when appending to it. As such,
- this function can only be used for raw deflate, and must be used before the
- first deflate() call after a deflateInit2() or deflateReset(). bits must be
- less than or equal to 16, and that many of the least significant bits of
- value will be inserted in the output.
-
- deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
- gz_headerp head));
-/*
- deflateSetHeader() provides gzip header information for when a gzip
- stream is requested by deflateInit2(). deflateSetHeader() may be called
- after deflateInit2() or deflateReset() and before the first call of
- deflate(). The text, time, os, extra field, name, and comment information
- in the provided gz_header structure are written to the gzip header (xflag is
- ignored -- the extra flags are set according to the compression level). The
- caller must assure that, if not Z_NULL, name and comment are terminated with
- a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
- available there. If hcrc is true, a gzip header crc is included. Note that
- the current versions of the command-line version of gzip (up through version
- 1.3.x) do not support header crc's, and will report that it is a "multi-part
- gzip file" and give up.
-
- If deflateSetHeader is not used, the default gzip header has text false,
- the time set to zero, and os set to 255, with no extra, name, or comment
- fields. The gzip header is returned to the default state by deflateReset().
-
- deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-/*
-ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
- int windowBits));
-
- This is another version of inflateInit with an extra parameter. The
- fields next_in, avail_in, zalloc, zfree and opaque must be initialized
- before by the caller.
-
- The windowBits parameter is the base two logarithm of the maximum window
- size (the size of the history buffer). It should be in the range 8..15 for
- this version of the library. The default value is 15 if inflateInit is used
- instead. windowBits must be greater than or equal to the windowBits value
- provided to deflateInit2() while compressing, or it must be equal to 15 if
- deflateInit2() was not used. If a compressed stream with a larger window
- size is given as input, inflate() will return with the error code
- Z_DATA_ERROR instead of trying to allocate a larger window.
-
- windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
- determines the window size. inflate() will then process raw deflate data,
- not looking for a zlib or gzip header, not generating a check value, and not
- looking for any check values for comparison at the end of the stream. This
- is for use with other formats that use the deflate compressed data format
- such as zip. Those formats provide their own check values. If a custom
- format is developed using the raw deflate format for compressed data, it is
- recommended that a check value such as an adler32 or a crc32 be applied to
- the uncompressed data as is done in the zlib, gzip, and zip formats. For
- most applications, the zlib format should be used as is. Note that comments
- above on the use in deflateInit2() apply to the magnitude of windowBits.
-
- windowBits can also be greater than 15 for optional gzip decoding. Add
- 32 to windowBits to enable zlib and gzip decoding with automatic header
- detection, or add 16 to decode only the gzip format (the zlib format will
- return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
- a crc32 instead of an adler32.
-
- inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
- is set to null if there is no error message. inflateInit2 does not perform
- any decompression apart from reading the zlib header if present: this will
- be done by inflate(). (So next_in and avail_in may be modified, but next_out
- and avail_out are unchanged.)
-*/
-
-ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
- const Bytef *dictionary,
- uInt dictLength));
-/*
- Initializes the decompression dictionary from the given uncompressed byte
- sequence. This function must be called immediately after a call of inflate,
- if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
- can be determined from the adler32 value returned by that call of inflate.
- The compressor and decompressor must use exactly the same dictionary (see
- deflateSetDictionary). For raw inflate, this function can be called
- immediately after inflateInit2() or inflateReset() and before any call of
- inflate() to set the dictionary. The application must ensure that the
- dictionary that was used for compression is provided.
-
- inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
- parameter is invalid (such as NULL dictionary) or the stream state is
- inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
- expected one (incorrect adler32 value). inflateSetDictionary does not
- perform any decompression: this will be done by subsequent calls of
- inflate().
-*/
-
-ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
-/*
- Skips invalid compressed data until a full flush point (see above the
- description of deflate with Z_FULL_FLUSH) can be found, or until all
- available input is skipped. No output is provided.
-
- inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
- if no more input was provided, Z_DATA_ERROR if no flush point has been found,
- or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
- case, the application may save the current value of total_in which
- indicates where valid compressed data was found. In the error case, the
- application may repeatedly call inflateSync, providing more input each time,
- until success or end of the input data.
-*/
-
-ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
- z_streamp source));
-/*
- Sets the destination stream as a complete copy of the source stream.
-
- This function can be useful when randomly accessing a large stream. The
- first pass through the stream can periodically record the inflate state,
- allowing restarting inflate at those points when randomly accessing the
- stream.
-
- inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
- (such as zalloc being NULL). msg is left unchanged in both source and
- destination.
-*/
-
-ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
-/*
- This function is equivalent to inflateEnd followed by inflateInit,
- but does not free and reallocate all the internal decompression state.
- The stream will keep attributes that may have been set by inflateInit2.
-
- inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent (such as zalloc or state being NULL).
-*/
-
-ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
- int bits,
- int value));
-/*
- This function inserts bits in the inflate input stream. The intent is
- that this function is used to start inflating at a bit position in the
- middle of a byte. The provided bits will be used before any bytes are used
- from next_in. This function should only be used with raw inflate, and
- should be used before the first inflate() call after inflateInit2() or
- inflateReset(). bits must be less than or equal to 16, and that many of the
- least significant bits of value will be inserted in the input.
-
- inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
- gz_headerp head));
-/*
- inflateGetHeader() requests that gzip header information be stored in the
- provided gz_header structure. inflateGetHeader() may be called after
- inflateInit2() or inflateReset(), and before the first call of inflate().
- As inflate() processes the gzip stream, head->done is zero until the header
- is completed, at which time head->done is set to one. If a zlib stream is
- being decoded, then head->done is set to -1 to indicate that there will be
- no gzip header information forthcoming. Note that Z_BLOCK can be used to
- force inflate() to return immediately after header processing is complete
- and before any actual data is decompressed.
-
- The text, time, xflags, and os fields are filled in with the gzip header
- contents. hcrc is set to true if there is a header CRC. (The header CRC
- was valid if done is set to one.) If extra is not Z_NULL, then extra_max
- contains the maximum number of bytes to write to extra. Once done is true,
- extra_len contains the actual extra field length, and extra contains the
- extra field, or that field truncated if extra_max is less than extra_len.
- If name is not Z_NULL, then up to name_max characters are written there,
- terminated with a zero unless the length is greater than name_max. If
- comment is not Z_NULL, then up to comm_max characters are written there,
- terminated with a zero unless the length is greater than comm_max. When
- any of extra, name, or comment are not Z_NULL and the respective field is
- not present in the header, then that field is set to Z_NULL to signal its
- absence. This allows the use of deflateSetHeader() with the returned
- structure to duplicate the header. However if those fields are set to
- allocated memory, then the application will need to save those pointers
- elsewhere so that they can be eventually freed.
-
- If inflateGetHeader is not used, then the header information is simply
- discarded. The header is always checked for validity, including the header
- CRC if present. inflateReset() will reset the process to discard the header
- information. The application would need to call inflateGetHeader() again to
- retrieve the header from the next gzip stream.
-
- inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
-*/
-
-/*
-ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
- unsigned char FAR *window));
-
- Initialize the internal stream state for decompression using inflateBack()
- calls. The fields zalloc, zfree and opaque in strm must be initialized
- before the call. If zalloc and zfree are Z_NULL, then the default library-
- derived memory allocation routines are used. windowBits is the base two
- logarithm of the window size, in the range 8..15. window is a caller
- supplied buffer of that size. Except for special applications where it is
- assured that deflate was used with small window sizes, windowBits must be 15
- and a 32K byte window must be supplied to be able to decompress general
- deflate streams.
-
- See inflateBack() for the usage of these routines.
-
- inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
- the parameters are invalid, Z_MEM_ERROR if the internal state could not
- be allocated, or Z_VERSION_ERROR if the version of the library does not
- match the version of the header file.
-*/
-
-typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
-typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
-
-ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
- in_func in, void FAR *in_desc,
- out_func out, void FAR *out_desc));
-/*
- inflateBack() does a raw inflate with a single call using a call-back
- interface for input and output. This is more efficient than inflate() for
- file i/o applications in that it avoids copying between the output and the
- sliding window by simply making the window itself the output buffer. This
- function trusts the application to not change the output buffer passed by
- the output function, at least until inflateBack() returns.
-
- inflateBackInit() must be called first to allocate the internal state
- and to initialize the state with the user-provided window buffer.
- inflateBack() may then be used multiple times to inflate a complete, raw
- deflate stream with each call. inflateBackEnd() is then called to free
- the allocated state.
-
- A raw deflate stream is one with no zlib or gzip header or trailer.
- This routine would normally be used in a utility that reads zip or gzip
- files and writes out uncompressed files. The utility would decode the
- header and process the trailer on its own, hence this routine expects
- only the raw deflate stream to decompress. This is different from the
- normal behavior of inflate(), which expects either a zlib or gzip header and
- trailer around the deflate stream.
-
- inflateBack() uses two subroutines supplied by the caller that are then
- called by inflateBack() for input and output. inflateBack() calls those
- routines until it reads a complete deflate stream and writes out all of the
- uncompressed data, or until it encounters an error. The function's
- parameters and return types are defined above in the in_func and out_func
- typedefs. inflateBack() will call in(in_desc, &buf) which should return the
- number of bytes of provided input, and a pointer to that input in buf. If
- there is no input available, in() must return zero--buf is ignored in that
- case--and inflateBack() will return a buffer error. inflateBack() will call
- out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out()
- should return zero on success, or non-zero on failure. If out() returns
- non-zero, inflateBack() will return with an error. Neither in() nor out()
- are permitted to change the contents of the window provided to
- inflateBackInit(), which is also the buffer that out() uses to write from.
- The length written by out() will be at most the window size. Any non-zero
- amount of input may be provided by in().
-
- For convenience, inflateBack() can be provided input on the first call by
- setting strm->next_in and strm->avail_in. If that input is exhausted, then
- in() will be called. Therefore strm->next_in must be initialized before
- calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
- immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
- must also be initialized, and then if strm->avail_in is not zero, input will
- initially be taken from strm->next_in[0 .. strm->avail_in - 1].
-
- The in_desc and out_desc parameters of inflateBack() are passed as the
- first parameter of in() and out() respectively when they are called. These
- descriptors can be optionally used to pass any information that the caller-
- supplied in() and out() functions need to do their job.
-
- On return, inflateBack() will set strm->next_in and strm->avail_in to
- pass back any unused input that was provided by the last in() call. The
- return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
- if in() or out() returned an error, Z_DATA_ERROR if there was a format
- error in the deflate stream (in which case strm->msg is set to indicate the
- nature of the error), or Z_STREAM_ERROR if the stream was not properly
- initialized. In the case of Z_BUF_ERROR, an input or output error can be
- distinguished using strm->next_in which will be Z_NULL only if in() returned
- an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to
- out() returning non-zero. (in() will always be called before out(), so
- strm->next_in is assured to be defined if out() returns non-zero.) Note
- that inflateBack() cannot return Z_OK.
-*/
-
-ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
-/*
- All memory allocated by inflateBackInit() is freed.
-
- inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
- state was inconsistent.
-*/
-
-ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
-/* Return flags indicating compile-time options.
-
- Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
- 1.0: size of uInt
- 3.2: size of uLong
- 5.4: size of voidpf (pointer)
- 7.6: size of z_off_t
-
- Compiler, assembler, and debug options:
- 8: DEBUG
- 9: ASMV or ASMINF -- use ASM code
- 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
- 11: 0 (reserved)
-
- One-time table building (smaller code, but not thread-safe if true):
- 12: BUILDFIXED -- build static block decoding tables when needed
- 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
- 14,15: 0 (reserved)
-
- Library content (indicates missing functionality):
- 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
- deflate code when not needed)
- 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
- and decode gzip streams (to avoid linking crc code)
- 18-19: 0 (reserved)
-
- Operation variations (changes in library functionality):
- 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
- 21: FASTEST -- deflate algorithm with only one, lowest compression level
- 22,23: 0 (reserved)
-
- The sprintf variant used by gzprintf (zero is best):
- 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
- 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
- 26: 0 = returns value, 1 = void -- 1 means inferred string length returned
-
- Remainder:
- 27-31: 0 (reserved)
- */
-
-
- /* utility functions */
-
-/*
- The following utility functions are implemented on top of the
- basic stream-oriented functions. To simplify the interface, some
- default options are assumed (compression level and memory usage,
- standard memory allocation functions). The source code of these
- utility functions can easily be modified if you need special options.
-*/
-
-ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
- const Bytef *source, uLong sourceLen));
-/*
- Compresses the source buffer into the destination buffer. sourceLen is
- the byte length of the source buffer. Upon entry, destLen is the total
- size of the destination buffer, which must be at least the value returned
- by compressBound(sourceLen). Upon exit, destLen is the actual size of the
- compressed buffer.
- This function can be used to compress a whole file at once if the
- input file is mmap'ed.
- compress returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_BUF_ERROR if there was not enough room in the output
- buffer.
-*/
-
-ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
- const Bytef *source, uLong sourceLen,
- int level));
-/*
- Compresses the source buffer into the destination buffer. The level
- parameter has the same meaning as in deflateInit. sourceLen is the byte
- length of the source buffer. Upon entry, destLen is the total size of the
- destination buffer, which must be at least the value returned by
- compressBound(sourceLen). Upon exit, destLen is the actual size of the
- compressed buffer.
-
- compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_BUF_ERROR if there was not enough room in the output buffer,
- Z_STREAM_ERROR if the level parameter is invalid.
-*/
-
-ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
-/*
- compressBound() returns an upper bound on the compressed size after
- compress() or compress2() on sourceLen bytes. It would be used before
- a compress() or compress2() call to allocate the destination buffer.
-*/
-
-ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
- const Bytef *source, uLong sourceLen));
-/*
- Decompresses the source buffer into the destination buffer. sourceLen is
- the byte length of the source buffer. Upon entry, destLen is the total
- size of the destination buffer, which must be large enough to hold the
- entire uncompressed data. (The size of the uncompressed data must have
- been saved previously by the compressor and transmitted to the decompressor
- by some mechanism outside the scope of this compression library.)
- Upon exit, destLen is the actual size of the uncompressed data.
- This function can be used to decompress a whole file at once if the
- input file is mmap'ed.
-
- uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_BUF_ERROR if there was not enough room in the output
- buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
-*/
-
-
-typedef voidp gzFile;
-
-ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
-/*
- Opens a gzip (.gz) file for reading or writing. The mode parameter
- is as in fopen ("rb" or "wb") but can also include a compression level
- ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
- Huffman only compression as in "wb1h", or 'R' for run-length encoding
- as in "wb1R". (See the description of deflateInit2 for more information
- about the strategy parameter.)
-
- gzopen can be used to read a file which is not in gzip format; in this
- case gzread will directly read from the file without decompression.
-
- gzopen returns NULL if the file could not be opened or if there was
- insufficient memory to allocate the (de)compression state; errno
- can be checked to distinguish the two cases (if errno is zero, the
- zlib error is Z_MEM_ERROR). */
-
-ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
-/*
- gzdopen() associates a gzFile with the file descriptor fd. File
- descriptors are obtained from calls like open, dup, creat, pipe or
- fileno (if the file has been previously opened with fopen).
- The mode parameter is as in gzopen.
- The next call of gzclose on the returned gzFile will also close the
- file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file
- descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
- gzdopen returns NULL if there was insufficient memory to allocate
- the (de)compression state.
-*/
-
-ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
-/*
- Dynamically update the compression level or strategy. See the description
- of deflateInit2 for the meaning of these parameters.
- gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
- opened for writing.
-*/
-
-ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
-/*
- Reads the given number of uncompressed bytes from the compressed file.
- If the input file was not in gzip format, gzread copies the given number
- of bytes into the buffer.
- gzread returns the number of uncompressed bytes actually read (0 for
- end of file, -1 for error). */
-
-ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
- voidpc buf, unsigned len));
-/*
- Writes the given number of uncompressed bytes into the compressed file.
- gzwrite returns the number of uncompressed bytes actually written
- (0 in case of error).
-*/
-
-ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...));
-/*
- Converts, formats, and writes the args to the compressed file under
- control of the format string, as in fprintf. gzprintf returns the number of
- uncompressed bytes actually written (0 in case of error). The number of
- uncompressed bytes written is limited to 4095. The caller should assure that
- this limit is not exceeded. If it is exceeded, then gzprintf() will return
- an error (0) with nothing written. In this case, there may also be a
- buffer overflow with unpredictable consequences, which is possible only if
- zlib was compiled with the insecure functions sprintf() or vsprintf()
- because the secure snprintf() or vsnprintf() functions were not available.
-*/
-
-ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
-/*
- Writes the given null-terminated string to the compressed file, excluding
- the terminating null character.
- gzputs returns the number of characters written, or -1 in case of error.
-*/
-
-ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
-/*
- Reads bytes from the compressed file until len-1 characters are read, or
- a newline character is read and transferred to buf, or an end-of-file
- condition is encountered. The string is then terminated with a null
- character.
- gzgets returns buf, or Z_NULL in case of error.
-*/
-
-ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
-/*
- Writes c, converted to an unsigned char, into the compressed file.
- gzputc returns the value that was written, or -1 in case of error.
-*/
-
-ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
-/*
- Reads one byte from the compressed file. gzgetc returns this byte
- or -1 in case of end of file or error.
-*/
-
-ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
-/*
- Push one character back onto the stream to be read again later.
- Only one character of push-back is allowed. gzungetc() returns the
- character pushed, or -1 on failure. gzungetc() will fail if a
- character has been pushed but not read yet, or if c is -1. The pushed
- character will be discarded if the stream is repositioned with gzseek()
- or gzrewind().
-*/
-
-ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
-/*
- Flushes all pending output into the compressed file. The parameter
- flush is as in the deflate() function. The return value is the zlib
- error number (see function gzerror below). gzflush returns Z_OK if
- the flush parameter is Z_FINISH and all output could be flushed.
- gzflush should be called only when strictly necessary because it can
- degrade compression.
-*/
-
-ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
- z_off_t offset, int whence));
-/*
- Sets the starting position for the next gzread or gzwrite on the
- given compressed file. The offset represents a number of bytes in the
- uncompressed data stream. The whence parameter is defined as in lseek(2);
- the value SEEK_END is not supported.
- If the file is opened for reading, this function is emulated but can be
- extremely slow. If the file is opened for writing, only forward seeks are
- supported; gzseek then compresses a sequence of zeroes up to the new
- starting position.
-
- gzseek returns the resulting offset location as measured in bytes from
- the beginning of the uncompressed stream, or -1 in case of error, in
- particular if the file is opened for writing and the new starting position
- would be before the current position.
-*/
-
-ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
-/*
- Rewinds the given file. This function is supported only for reading.
-
- gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
-*/
-
-ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
-/*
- Returns the starting position for the next gzread or gzwrite on the
- given compressed file. This position represents a number of bytes in the
- uncompressed data stream.
-
- gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
-*/
-
-ZEXTERN int ZEXPORT gzeof OF((gzFile file));
-/*
- Returns 1 when EOF has previously been detected reading the given
- input stream, otherwise zero.
-*/
-
-ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
-/*
- Returns 1 if file is being read directly without decompression, otherwise
- zero.
-*/
-
-ZEXTERN int ZEXPORT gzclose OF((gzFile file));
-/*
- Flushes all pending output if necessary, closes the compressed file
- and deallocates all the (de)compression state. The return value is the zlib
- error number (see function gzerror below).
-*/
-
-ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
-/*
- Returns the error message for the last error which occurred on the
- given compressed file. errnum is set to zlib error number. If an
- error occurred in the file system and not in the compression library,
- errnum is set to Z_ERRNO and the application may consult errno
- to get the exact error code.
-*/
-
-ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
-/*
- Clears the error and end-of-file flags for file. This is analogous to the
- clearerr() function in stdio. This is useful for continuing to read a gzip
- file that is being written concurrently.
-*/
-
- /* checksum functions */
-
-/*
- These functions are not related to compression but are exported
- anyway because they might be useful in applications using the
- compression library.
-*/
-
-ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
-/*
- Update a running Adler-32 checksum with the bytes buf[0..len-1] and
- return the updated checksum. If buf is NULL, this function returns
- the required initial value for the checksum.
- An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
- much faster. Usage example:
-
- uLong adler = adler32(0L, Z_NULL, 0);
-
- while (read_buffer(buffer, length) != EOF) {
- adler = adler32(adler, buffer, length);
- }
- if (adler != original_adler) error();
-*/
-
-ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
- z_off_t len2));
-/*
- Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
- and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
- each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
- seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
-*/
-
-ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
-/*
- Update a running CRC-32 with the bytes buf[0..len-1] and return the
- updated CRC-32. If buf is NULL, this function returns the required initial
- value for the crc. Pre- and post-conditioning (one's complement) is
- performed within this function so it shouldn't be done by the application.
- Usage example:
-
- uLong crc = crc32(0L, Z_NULL, 0);
-
- while (read_buffer(buffer, length) != EOF) {
- crc = crc32(crc, buffer, length);
- }
- if (crc != original_crc) error();
-*/
-
-ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
-
-/*
- Combine two CRC-32 check values into one. For two sequences of bytes,
- seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
- calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
- check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
- len2.
-*/
-
-
- /* various hacks, don't look :) */
-
-/* deflateInit and inflateInit are macros to allow checking the zlib version
- * and the compiler's view of z_stream:
- */
-ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level,
- const char *version, int stream_size));
-ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm,
- const char *version, int stream_size));
-ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method,
- int windowBits, int memLevel,
- int strategy, const char *version,
- int stream_size));
-ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits,
- const char *version, int stream_size));
-ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
- unsigned char FAR *window,
- const char *version,
- int stream_size));
-#define deflateInit(strm, level) \
- deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
-#define inflateInit(strm) \
- inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
-#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
- deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
- (strategy), ZLIB_VERSION, sizeof(z_stream))
-#define inflateInit2(strm, windowBits) \
- inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
-#define inflateBackInit(strm, windowBits, window) \
- inflateBackInit_((strm), (windowBits), (window), \
- ZLIB_VERSION, sizeof(z_stream))
-
-
-#if !defined(_ZUTIL_H) && !defined(NO_DUMMY_DECL)
- struct internal_state {int dummy;}; /* hack for buggy compilers */
-#endif
-
-ZEXTERN const char * ZEXPORT zError OF((int));
-ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z));
-ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void));
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ZLIB_H */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zmod.c b/sys/contrib/opensolaris/uts/common/zmod/zmod.c
deleted file mode 100644
index 2627239..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/zmod.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/zmod.h>
-
-#include "zlib.h"
-
-/*
- * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store
- * the expected decompressed data size externally so it can be passed in.
- * The resulting decompressed size is then returned through dstlen. This
- * function return Z_OK on success, or another error code on failure.
- */
-int
-z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
-{
- z_stream zs;
- int err;
-
- bzero(&zs, sizeof (zs));
- zs.next_in = (uchar_t *)src;
- zs.avail_in = srclen;
- zs.next_out = dst;
- zs.avail_out = *dstlen;
-
- if ((err = inflateInit(&zs)) != Z_OK)
- return (err);
-
- if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) {
- (void) inflateEnd(&zs);
- return (err == Z_OK ? Z_BUF_ERROR : err);
- }
-
- *dstlen = zs.total_out;
- return (inflateEnd(&zs));
-}
-
-int
-z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
- int level)
-{
-
- z_stream zs;
- int err;
-
- bzero(&zs, sizeof (zs));
- zs.next_in = (uchar_t *)src;
- zs.avail_in = srclen;
- zs.next_out = dst;
- zs.avail_out = *dstlen;
-
- if ((err = deflateInit(&zs, level)) != Z_OK)
- return (err);
-
- if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) {
- (void) deflateEnd(&zs);
- return (err == Z_OK ? Z_BUF_ERROR : err);
- }
-
- *dstlen = zs.total_out;
- return (deflateEnd(&zs));
-}
-
-int
-z_compress(void *dst, size_t *dstlen, const void *src, size_t srclen)
-{
- return (z_compress_level(dst, dstlen, src, srclen,
- Z_DEFAULT_COMPRESSION));
-}
-
-/*
- * Convert a zlib error code into a string error message.
- */
-const char *
-z_strerror(int err)
-{
- int i = Z_NEED_DICT - err;
-
- if (i < 0 || i > Z_NEED_DICT - Z_VERSION_ERROR)
- return ("unknown error");
-
- return (zError(err));
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c b/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c
deleted file mode 100644
index 0542712..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/zmod_subr.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/systm.h>
-#include <sys/cmn_err.h>
-#include <sys/kobj.h>
-
-struct zchdr {
- uint_t zch_magic;
- uint_t zch_size;
-};
-
-#define ZCH_MAGIC 0x3cc13cc1
-
-/*ARGSUSED*/
-void *
-zcalloc(void *opaque, uint_t items, uint_t size)
-{
- size_t nbytes = sizeof (struct zchdr) + items * size;
- struct zchdr *z = kobj_zalloc(nbytes, KM_NOWAIT|KM_TMP);
-
- if (z == NULL)
- return (NULL);
-
- z->zch_magic = ZCH_MAGIC;
- z->zch_size = nbytes;
-
- return (z + 1);
-}
-
-/*ARGSUSED*/
-void
-zcfree(void *opaque, void *ptr)
-{
- struct zchdr *z = ((struct zchdr *)ptr) - 1;
-
- if (z->zch_magic != ZCH_MAGIC)
- panic("zcfree region corrupt: hdr=%p ptr=%p", (void *)z, ptr);
-
- kobj_free(z, z->zch_size);
-}
-
-void
-zmemcpy(void *dest, const void *source, uint_t len)
-{
- bcopy(source, dest, len);
-}
-
-int
-zmemcmp(const void *s1, const void *s2, uint_t len)
-{
- return (bcmp(s1, s2, len));
-}
-
-void
-zmemzero(void *dest, uint_t len)
-{
- bzero(dest, len);
-}
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zutil.c b/sys/contrib/opensolaris/uts/common/zmod/zutil.c
deleted file mode 100644
index 7d46e30..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/zutil.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* zutil.c -- target dependent utility functions for the compression library
- * Copyright (C) 1995-2005 Jean-loup Gailly.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include "zutil.h"
-
-#ifndef NO_DUMMY_DECL
-struct internal_state {int dummy;}; /* for buggy compilers */
-#endif
-
-const char * const z_errmsg[10] = {
-"need dictionary", /* Z_NEED_DICT 2 */
-"stream end", /* Z_STREAM_END 1 */
-"", /* Z_OK 0 */
-"file error", /* Z_ERRNO (-1) */
-"stream error", /* Z_STREAM_ERROR (-2) */
-"data error", /* Z_DATA_ERROR (-3) */
-"insufficient memory", /* Z_MEM_ERROR (-4) */
-"buffer error", /* Z_BUF_ERROR (-5) */
-"incompatible version",/* Z_VERSION_ERROR (-6) */
-""};
-
-
-const char * ZEXPORT zlibVersion()
-{
- return ZLIB_VERSION;
-}
-
-uLong ZEXPORT zlibCompileFlags()
-{
- uLong flags;
-
- flags = 0;
- switch (sizeof(uInt)) {
- case 2: break;
- case 4: flags += 1; break;
- case 8: flags += 2; break;
- default: flags += 3;
- }
- switch (sizeof(uLong)) {
- case 2: break;
- case 4: flags += 1 << 2; break;
- case 8: flags += 2 << 2; break;
- default: flags += 3 << 2;
- }
- switch (sizeof(voidpf)) {
- case 2: break;
- case 4: flags += 1 << 4; break;
- case 8: flags += 2 << 4; break;
- default: flags += 3 << 4;
- }
- switch (sizeof(z_off_t)) {
- case 2: break;
- case 4: flags += 1 << 6; break;
- case 8: flags += 2 << 6; break;
- default: flags += 3 << 6;
- }
-#ifdef DEBUG
- flags += 1 << 8;
-#endif
-#if defined(ASMV) || defined(ASMINF)
- flags += 1 << 9;
-#endif
-#ifdef ZLIB_WINAPI
- flags += 1 << 10;
-#endif
-#ifdef BUILDFIXED
- flags += 1 << 12;
-#endif
-#ifdef DYNAMIC_CRC_TABLE
- flags += 1 << 13;
-#endif
-#ifdef NO_GZCOMPRESS
- flags += 1L << 16;
-#endif
-#ifdef NO_GZIP
- flags += 1L << 17;
-#endif
-#ifdef PKZIP_BUG_WORKAROUND
- flags += 1L << 20;
-#endif
-#ifdef FASTEST
- flags += 1L << 21;
-#endif
-#ifdef STDC
-# ifdef NO_vsnprintf
- flags += 1L << 25;
-# ifdef HAS_vsprintf_void
- flags += 1L << 26;
-# endif
-# else
-# ifdef HAS_vsnprintf_void
- flags += 1L << 26;
-# endif
-# endif
-#else
- flags += 1L << 24;
-# ifdef NO_snprintf
- flags += 1L << 25;
-# ifdef HAS_sprintf_void
- flags += 1L << 26;
-# endif
-# else
-# ifdef HAS_snprintf_void
- flags += 1L << 26;
-# endif
-# endif
-#endif
- return flags;
-}
-
-#ifdef DEBUG
-
-# ifndef verbose
-# define verbose 0
-# endif
-int z_verbose = verbose;
-
-void z_error (m)
- char *m;
-{
- fprintf(stderr, "%s\n", m);
- exit(1);
-}
-#endif
-
-/* exported to allow conversion of error code to string for compress() and
- * uncompress()
- */
-const char * ZEXPORT zError(err)
- int err;
-{
- return ERR_MSG(err);
-}
-
-#if defined(_WIN32_WCE)
- /* The Microsoft C Run-Time Library for Windows CE doesn't have
- * errno. We define it as a global variable to simplify porting.
- * Its value is always 0 and should not be used.
- */
- int errno = 0;
-#endif
-
-#define HAVE_MEMCPY
-#ifndef HAVE_MEMCPY
-
-void zmemcpy(dest, source, len)
- Bytef* dest;
- const Bytef* source;
- uInt len;
-{
- if (len == 0) return;
- do {
- *dest++ = *source++; /* ??? to be unrolled */
- } while (--len != 0);
-}
-
-int zmemcmp(s1, s2, len)
- const Bytef* s1;
- const Bytef* s2;
- uInt len;
-{
- uInt j;
-
- for (j = 0; j < len; j++) {
- if (s1[j] != s2[j]) return 2*(s1[j] > s2[j])-1;
- }
- return 0;
-}
-
-void zmemzero(dest, len)
- Bytef* dest;
- uInt len;
-{
- if (len == 0) return;
- do {
- *dest++ = 0; /* ??? to be unrolled */
- } while (--len != 0);
-}
-#endif
-
-
-#ifdef SYS16BIT
-
-#ifdef __TURBOC__
-/* Turbo C in 16-bit mode */
-
-# define MY_ZCALLOC
-
-/* Turbo C malloc() does not allow dynamic allocation of 64K bytes
- * and farmalloc(64K) returns a pointer with an offset of 8, so we
- * must fix the pointer. Warning: the pointer must be put back to its
- * original form in order to free it, use zcfree().
- */
-
-#define MAX_PTR 10
-/* 10*64K = 640K */
-
-local int next_ptr = 0;
-
-typedef struct ptr_table_s {
- voidpf org_ptr;
- voidpf new_ptr;
-} ptr_table;
-
-local ptr_table table[MAX_PTR];
-/* This table is used to remember the original form of pointers
- * to large buffers (64K). Such pointers are normalized with a zero offset.
- * Since MSDOS is not a preemptive multitasking OS, this table is not
- * protected from concurrent access. This hack doesn't work anyway on
- * a protected system like OS/2. Use Microsoft C instead.
- */
-
-voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
-{
- voidpf buf = opaque; /* just to make some compilers happy */
- ulg bsize = (ulg)items*size;
-
- /* If we allocate less than 65520 bytes, we assume that farmalloc
- * will return a usable pointer which doesn't have to be normalized.
- */
- if (bsize < 65520L) {
- buf = farmalloc(bsize);
- if (*(ush*)&buf != 0) return buf;
- } else {
- buf = farmalloc(bsize + 16L);
- }
- if (buf == NULL || next_ptr >= MAX_PTR) return NULL;
- table[next_ptr].org_ptr = buf;
-
- /* Normalize the pointer to seg:0 */
- *((ush*)&buf+1) += ((ush)((uch*)buf-0) + 15) >> 4;
- *(ush*)&buf = 0;
- table[next_ptr++].new_ptr = buf;
- return buf;
-}
-
-void zcfree (voidpf opaque, voidpf ptr)
-{
- int n;
- if (*(ush*)&ptr != 0) { /* object < 64K */
- farfree(ptr);
- return;
- }
- /* Find the original pointer */
- for (n = 0; n < next_ptr; n++) {
- if (ptr != table[n].new_ptr) continue;
-
- farfree(table[n].org_ptr);
- while (++n < next_ptr) {
- table[n-1] = table[n];
- }
- next_ptr--;
- return;
- }
- ptr = opaque; /* just to make some compilers happy */
- Assert(0, "zcfree: ptr not found");
-}
-
-#endif /* __TURBOC__ */
-
-
-#ifdef M_I86
-/* Microsoft C in 16-bit mode */
-
-# define MY_ZCALLOC
-
-#if (!defined(_MSC_VER) || (_MSC_VER <= 600))
-# define _halloc halloc
-# define _hfree hfree
-#endif
-
-voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
-{
- if (opaque) opaque = 0; /* to make compiler happy */
- return _halloc((long)items, size);
-}
-
-void zcfree (voidpf opaque, voidpf ptr)
-{
- if (opaque) opaque = 0; /* to make compiler happy */
- _hfree(ptr);
-}
-
-#endif /* M_I86 */
-
-#endif /* SYS16BIT */
-
-
-#ifndef MY_ZCALLOC /* Any system without a special alloc function */
-
-#ifndef STDC
-extern voidp malloc OF((uInt size));
-extern voidp calloc OF((uInt items, uInt size));
-extern void free OF((voidpf ptr));
-#endif
-
-voidpf zcalloc (opaque, items, size)
- voidpf opaque;
- unsigned items;
- unsigned size;
-{
- if (opaque) items += size - size; /* make compiler happy */
- return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) :
- (voidpf)calloc(items, size);
-}
-
-void zcfree (opaque, ptr)
- voidpf opaque;
- voidpf ptr;
-{
- free(ptr);
- if (opaque) return; /* make compiler happy */
-}
-
-#endif /* MY_ZCALLOC */
diff --git a/sys/contrib/opensolaris/uts/common/zmod/zutil.h b/sys/contrib/opensolaris/uts/common/zmod/zutil.h
deleted file mode 100644
index 1d02c1d..0000000
--- a/sys/contrib/opensolaris/uts/common/zmod/zutil.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/* zutil.h -- internal interface and configuration of the compression library
- * Copyright (C) 1995-2005 Jean-loup Gailly.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* WARNING: this file should *not* be used by applications. It is
- part of the implementation of the compression library and is
- subject to change. Applications should only use zlib.h.
- */
-
-#ifndef _ZUTIL_H
-#define _ZUTIL_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#define ZLIB_INTERNAL
-#include "zlib.h"
-
-#ifdef STDC
-# ifndef _WIN32_WCE
-# include <stddef.h>
-# endif
-# include <string.h>
-# include <stdlib.h>
-#endif
-#ifdef NO_ERRNO_H
-# ifdef _WIN32_WCE
- /* The Microsoft C Run-Time Library for Windows CE doesn't have
- * errno. We define it as a global variable to simplify porting.
- * Its value is always 0 and should not be used. We rename it to
- * avoid conflict with other libraries that use the same workaround.
- */
-# define errno z_errno
-# endif
- extern int errno;
-#else
-# ifndef _WIN32_WCE
-# include <sys/errno.h>
-# endif
-#endif
-
-#ifndef local
-# define local static
-#endif
-/* compile with -Dlocal if your debugger can't find static symbols */
-
-typedef unsigned char uch;
-typedef uch FAR uchf;
-typedef unsigned short ush;
-typedef ush FAR ushf;
-typedef unsigned long ulg;
-
-extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
-/* (size given to avoid silly warnings with Visual C++) */
-
-#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
-
-#define ERR_RETURN(strm,err) \
- return (strm->msg = (char*)ERR_MSG(err), (err))
-/* To be used only when the state is known to be valid */
-
- /* common constants */
-
-#ifndef DEF_WBITS
-# define DEF_WBITS MAX_WBITS
-#endif
-/* default windowBits for decompression. MAX_WBITS is for compression only */
-
-#if MAX_MEM_LEVEL >= 8
-# define DEF_MEM_LEVEL 8
-#else
-# define DEF_MEM_LEVEL MAX_MEM_LEVEL
-#endif
-/* default memLevel */
-
-#define STORED_BLOCK 0
-#define STATIC_TREES 1
-#define DYN_TREES 2
-/* The three kinds of block type */
-
-#define MIN_MATCH 3
-#define MAX_MATCH 258
-/* The minimum and maximum match lengths */
-
-#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
-
- /* target dependencies */
-
-#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32))
-# define OS_CODE 0x00
-# if defined(__TURBOC__) || defined(__BORLANDC__)
-# if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
- /* Allow compilation with ANSI keywords only enabled */
- void _Cdecl farfree( void *block );
- void *_Cdecl farmalloc( unsigned long nbytes );
-# else
-# include <alloc.h>
-# endif
-# else /* MSC or DJGPP */
-# include <malloc.h>
-# endif
-#endif
-
-#ifdef AMIGA
-# define OS_CODE 0x01
-#endif
-
-#if defined(VAXC) || defined(VMS)
-# define OS_CODE 0x02
-# define F_OPEN(name, mode) \
- fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512")
-#endif
-
-#if defined(ATARI) || defined(atarist)
-# define OS_CODE 0x05
-#endif
-
-#ifdef OS2
-# define OS_CODE 0x06
-# ifdef M_I86
- #include <malloc.h>
-# endif
-#endif
-
-#if defined(MACOS) || defined(TARGET_OS_MAC)
-# define OS_CODE 0x07
-# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
-# include <unix.h> /* for fdopen */
-# else
-# ifndef fdopen
-# define fdopen(fd,mode) NULL /* No fdopen() */
-# endif
-# endif
-#endif
-
-#ifdef TOPS20
-# define OS_CODE 0x0a
-#endif
-
-#ifdef WIN32
-# ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */
-# define OS_CODE 0x0b
-# endif
-#endif
-
-#ifdef __50SERIES /* Prime/PRIMOS */
-# define OS_CODE 0x0f
-#endif
-
-#if defined(_BEOS_) || defined(RISCOS)
-# define fdopen(fd,mode) NULL /* No fdopen() */
-#endif
-
-#if (defined(_MSC_VER) && (_MSC_VER > 600))
-# if defined(_WIN32_WCE)
-# define fdopen(fd,mode) NULL /* No fdopen() */
-# ifndef _PTRDIFF_T_DEFINED
- typedef int ptrdiff_t;
-# define _PTRDIFF_T_DEFINED
-# endif
-# else
-# define fdopen(fd,type) _fdopen(fd,type)
-# endif
-#endif
-
- /* common defaults */
-
-#ifndef OS_CODE
-# define OS_CODE 0x03 /* assume Unix */
-#endif
-
-#ifndef F_OPEN
-# define F_OPEN(name, mode) fopen((name), (mode))
-#endif
-
- /* functions */
-
-#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
-# ifndef HAVE_VSNPRINTF
-# define HAVE_VSNPRINTF
-# endif
-#endif
-#if defined(__CYGWIN__)
-# ifndef HAVE_VSNPRINTF
-# define HAVE_VSNPRINTF
-# endif
-#endif
-#ifndef HAVE_VSNPRINTF
-# ifdef MSDOS
- /* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
- but for now we just assume it doesn't. */
-# define NO_vsnprintf
-# endif
-# ifdef __TURBOC__
-# define NO_vsnprintf
-# endif
-# ifdef WIN32
- /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
-# if !defined(vsnprintf) && !defined(NO_vsnprintf)
-# define vsnprintf _vsnprintf
-# endif
-# endif
-# ifdef __SASC
-# define NO_vsnprintf
-# endif
-#endif
-#ifdef VMS
-# define NO_vsnprintf
-#endif
-
-#if defined(pyr)
-# define NO_MEMCPY
-#endif
-#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__)
- /* Use our own functions for small and medium model with MSC <= 5.0.
- * You may have to use the same strategy for Borland C (untested).
- * The __SC__ check is for Symantec.
- */
-# define NO_MEMCPY
-#endif
-#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY)
-# define HAVE_MEMCPY
-#endif
-#ifdef HAVE_MEMCPY
-# ifdef SMALL_MEDIUM /* MSDOS small or medium model */
-# define zmemcpy _fmemcpy
-# define zmemcmp _fmemcmp
-# define zmemzero(dest, len) _fmemset(dest, 0, len)
-# else
-# define zmemcpy memcpy
-# define zmemcmp memcmp
-# define zmemzero(dest, len) memset(dest, 0, len)
-# endif
-#else
- extern void zmemcpy OF((void* dest, const void* source, uInt len));
- extern int zmemcmp OF((const void* s1, const void* s2, uInt len));
- extern void zmemzero OF((void* dest, uInt len));
-#endif
-
-/* Diagnostic functions */
-#ifdef DEBUG
-# include <stdio.h>
- extern int z_verbose;
- extern void z_error OF((char *m));
-# define Assert(cond,msg) {if(!(cond)) z_error(msg);}
-# define Trace(x) {if (z_verbose>=0) fprintf x ;}
-# define Tracev(x) {if (z_verbose>0) fprintf x ;}
-# define Tracevv(x) {if (z_verbose>1) fprintf x ;}
-# define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;}
-# define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;}
-#else
-# define Assert(cond,msg)
-# define Trace(x)
-# define Tracev(x)
-# define Tracevv(x)
-# define Tracec(c,x)
-# define Tracecv(c,x)
-#endif
-
-
-voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size));
-void zcfree OF((voidpf opaque, voidpf ptr));
-
-#define ZALLOC(strm, items, size) \
- (*((strm)->zalloc))((strm)->opaque, (items), (size))
-#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr))
-#define TRY_FREE(s, p) {if (p) ZFREE(s, p);}
-
-#endif /* _ZUTIL_H */
OpenPOWER on IntegriCloud