summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Makefile5
-rw-r--r--fs/9p/acl.c83
-rw-r--r--fs/9p/cache.c8
-rw-r--r--fs/9p/cache.h1
-rw-r--r--fs/9p/v9fs.c4
-rw-r--r--fs/9p/v9fs.h2
-rw-r--r--fs/9p/vfs_file.c7
-rw-r--r--fs/9p/vfs_inode.c33
-rw-r--r--fs/9p/vfs_inode_dotl.c24
-rw-r--r--fs/9p/xattr.c38
-rw-r--r--fs/9p/xattr.h3
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/9p/xattr_user.c80
-rw-r--r--fs/Kconfig22
-rw-r--r--fs/Makefile10
-rw-r--r--fs/adfs/adfs.h6
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/dir_f.c2
-rw-r--r--fs/adfs/dir_fplus.c2
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/amigaffs.c13
-rw-r--r--fs/affs/inode.c1
-rw-r--r--fs/affs/namei.c1
-rw-r--r--fs/affs/super.c12
-rw-r--r--fs/affs/symlink.c9
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/proc.c25
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/aio.c27
-rw-r--r--fs/autofs4/symlink.c14
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/befs/linuxvfs.c42
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c22
-rw-r--r--fs/binfmt_elf_fdpic.c68
-rw-r--r--fs/block_dev.c207
-rw-r--r--fs/btrfs/Makefile5
-rw-r--r--fs/btrfs/acl.c12
-rw-r--r--fs/btrfs/async-thread.c61
-rw-r--r--fs/btrfs/async-thread.h2
-rw-r--r--fs/btrfs/backref.c74
-rw-r--r--fs/btrfs/btrfs_inode.h6
-rw-r--r--fs/btrfs/check-integrity.c123
-rw-r--r--fs/btrfs/compression.c133
-rw-r--r--fs/btrfs/ctree.c25
-rw-r--r--fs/btrfs/ctree.h390
-rw-r--r--fs/btrfs/delayed-inode.c11
-rw-r--r--fs/btrfs/delayed-ref.c194
-rw-r--r--fs/btrfs/delayed-ref.h31
-rw-r--r--fs/btrfs/dev-replace.c58
-rw-r--r--fs/btrfs/disk-io.c325
-rw-r--r--fs/btrfs/disk-io.h8
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c1210
-rw-r--r--fs/btrfs/extent-tree.h0
-rw-r--r--fs/btrfs/extent_io.c599
-rw-r--r--fs/btrfs/extent_io.h155
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c285
-rw-r--r--fs/btrfs/free-space-cache.c178
-rw-r--r--fs/btrfs/free-space-cache.h3
-rw-r--r--fs/btrfs/free-space-tree.c1591
-rw-r--r--fs/btrfs/free-space-tree.h72
-rw-r--r--fs/btrfs/inode-item.c2
-rw-r--r--fs/btrfs/inode-map.c14
-rw-r--r--fs/btrfs/inode.c727
-rw-r--r--fs/btrfs/ioctl.c600
-rw-r--r--fs/btrfs/locking.c15
-rw-r--r--fs/btrfs/ordered-data.c70
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/props.c13
-rw-r--r--fs/btrfs/qgroup.c249
-rw-r--r--fs/btrfs/qgroup.h31
-rw-r--r--fs/btrfs/raid56.c157
-rw-r--r--fs/btrfs/raid56.h10
-rw-r--r--fs/btrfs/reada.c12
-rw-r--r--fs/btrfs/relocation.c71
-rw-r--r--fs/btrfs/root-tree.c11
-rw-r--r--fs/btrfs/scrub.c649
-rw-r--r--fs/btrfs/send.c236
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/super.c160
-rw-r--r--fs/btrfs/sysfs.c52
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c58
-rw-r--r--fs/btrfs/tests/btrfs-tests.h10
-rw-r--r--fs/btrfs/tests/extent-io-tests.c149
-rw-r--r--fs/btrfs/tests/free-space-tests.c251
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c571
-rw-r--r--fs/btrfs/tests/inode-tests.c2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c20
-rw-r--r--fs/btrfs/transaction.c258
-rw-r--r--fs/btrfs/transaction.h34
-rw-r--r--fs/btrfs/tree-defrag.c30
-rw-r--r--fs/btrfs/tree-log.c368
-rw-r--r--fs/btrfs/volumes.c716
-rw-r--r--fs/btrfs/volumes.h28
-rw-r--r--fs/btrfs/xattr.c174
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/buffer.c60
-rw-r--r--fs/cachefiles/daemon.c12
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/cachefiles/rdwr.c75
-rw-r--r--fs/ceph/acl.c16
-rw-r--r--fs/ceph/addr.c15
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c84
-rw-r--r--fs/ceph/file.c101
-rw-r--r--fs/ceph/inode.c3
-rw-r--r--fs/ceph/locks.c4
-rw-r--r--fs/ceph/mds_client.c116
-rw-r--r--fs/ceph/mds_client.h4
-rw-r--r--fs/ceph/snap.c7
-rw-r--r--fs/ceph/super.c7
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/cifs/cifs_ioctl.h42
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsacl.c25
-rw-r--r--fs/cifs/cifsencrypt.c53
-rw-r--r--fs/cifs/cifsfs.c93
-rw-r--r--fs/cifs/cifsfs.h8
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/cifspdu.h14
-rw-r--r--fs/cifs/cifssmb.c7
-rw-r--r--fs/cifs/connect.c87
-rw-r--r--fs/cifs/file.c18
-rw-r--r--fs/cifs/inode.c40
-rw-r--r--fs/cifs/ioctl.c165
-rw-r--r--fs/cifs/link.c10
-rw-r--r--fs/cifs/readdir.c2
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smb2file.c19
-rw-r--r--fs/cifs/smb2ops.c18
-rw-r--r--fs/cifs/smb2pdu.c214
-rw-r--r--fs/cifs/smb2pdu.h45
-rw-r--r--fs/cifs/smbfsctl.h2
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/cifs/xattr.c16
-rw-r--r--fs/coda/cnode.c5
-rw-r--r--fs/coda/inode.c6
-rw-r--r--fs/coda/symlink.c4
-rw-r--r--fs/coda/upcall.c6
-rw-r--r--fs/compat.c21
-rw-r--r--fs/compat_ioctl.c271
-rw-r--r--fs/configfs/configfs_internal.h14
-rw-r--r--fs/configfs/dir.c278
-rw-r--r--fs/configfs/file.c261
-rw-r--r--fs/configfs/inode.c2
-rw-r--r--fs/configfs/symlink.c22
-rw-r--r--fs/coredump.c89
-rw-r--r--fs/cramfs/inode.c1
-rw-r--r--fs/dax.c474
-rw-r--r--fs/dcache.c35
-rw-r--r--fs/debugfs/file.c191
-rw-r--r--fs/debugfs/inode.c12
-rw-r--r--fs/direct-io.c54
-rw-r--r--fs/dlm/config.c288
-rw-r--r--fs/dlm/lowcomms.c747
-rw-r--r--fs/dlm/plock.c9
-rw-r--r--fs/dlm/user.c27
-rw-r--r--fs/drop_caches.c10
-rw-r--r--fs/ecryptfs/crypto.c3
-rw-r--r--fs/ecryptfs/dentry.c16
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/inode.c23
-rw-r--r--fs/ecryptfs/main.c6
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/efs/super.c6
-rw-r--r--fs/efs/symlink.c4
-rw-r--r--fs/eventfd.c4
-rw-r--r--fs/exec.c14
-rw-r--r--fs/exofs/inode.c6
-rw-r--r--fs/exofs/namei.c4
-rw-r--r--fs/exofs/super.c4
-rw-r--r--fs/ext2/ext2.h11
-rw-r--r--fs/ext2/file.c92
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c19
-rw-r--r--fs/ext2/namei.c50
-rw-r--r--fs/ext2/super.c7
-rw-r--r--fs/ext2/symlink.c5
-rw-r--r--fs/ext2/xattr.c22
-rw-r--r--fs/ext2/xattr_security.c30
-rw-r--r--fs/ext2/xattr_trusted.c32
-rw-r--r--fs/ext2/xattr_user.c32
-rw-r--r--fs/ext3/Kconfig89
-rw-r--r--fs/ext3/Makefile12
-rw-r--r--fs/ext3/acl.c281
-rw-r--r--fs/ext3/acl.h72
-rw-r--r--fs/ext3/balloc.c2158
-rw-r--r--fs/ext3/bitmap.c20
-rw-r--r--fs/ext3/dir.c537
-rw-r--r--fs/ext3/ext3.h1332
-rw-r--r--fs/ext3/ext3_jbd.c59
-rw-r--r--fs/ext3/file.c79
-rw-r--r--fs/ext3/fsync.c109
-rw-r--r--fs/ext3/hash.c206
-rw-r--r--fs/ext3/ialloc.c706
-rw-r--r--fs/ext3/inode.c3574
-rw-r--r--fs/ext3/ioctl.c327
-rw-r--r--fs/ext3/namei.c2586
-rw-r--r--fs/ext3/namei.h27
-rw-r--r--fs/ext3/resize.c1117
-rw-r--r--fs/ext3/super.c3165
-rw-r--r--fs/ext3/symlink.c46
-rw-r--r--fs/ext3/xattr.c1330
-rw-r--r--fs/ext3/xattr.h136
-rw-r--r--fs/ext3/xattr_security.c78
-rw-r--r--fs/ext3/xattr_trusted.c54
-rw-r--r--fs/ext3/xattr_user.c58
-rw-r--r--fs/ext4/Kconfig56
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/balloc.c92
-rw-r--r--fs/ext4/block_validity.c2
-rw-r--r--fs/ext4/crypto.c54
-rw-r--r--fs/ext4/crypto_fname.c7
-rw-r--r--fs/ext4/crypto_key.c24
-rw-r--r--fs/ext4/crypto_policy.c20
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h232
-rw-r--r--fs/ext4/ext4_jbd2.c6
-rw-r--r--fs/ext4/ext4_jbd2.h10
-rw-r--r--fs/ext4/extents.c88
-rw-r--r--fs/ext4/extents_status.c60
-rw-r--r--fs/ext4/extents_status.h2
-rw-r--r--fs/ext4/file.c68
-rw-r--r--fs/ext4/ialloc.c119
-rw-r--r--fs/ext4/indirect.c6
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c117
-rw-r--r--fs/ext4/ioctl.c15
-rw-r--r--fs/ext4/mballoc.c87
-rw-r--r--fs/ext4/migrate.c9
-rw-r--r--fs/ext4/mmp.c50
-rw-r--r--fs/ext4/namei.c105
-rw-r--r--fs/ext4/page-io.c36
-rw-r--r--fs/ext4/readpage.c14
-rw-r--r--fs/ext4/resize.c34
-rw-r--r--fs/ext4/super.c769
-rw-r--r--fs/ext4/symlink.c33
-rw-r--r--fs/ext4/sysfs.c448
-rw-r--r--fs/ext4/xattr.c56
-rw-r--r--fs/ext4/xattr_security.c31
-rw-r--r--fs/ext4/xattr_trusted.c32
-rw-r--r--fs/ext4/xattr_user.c32
-rw-r--r--fs/f2fs/Kconfig2
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/checkpoint.c315
-rw-r--r--fs/f2fs/crypto_key.c7
-rw-r--r--fs/f2fs/data.c1404
-rw-r--r--fs/f2fs/debug.c75
-rw-r--r--fs/f2fs/dir.c61
-rw-r--r--fs/f2fs/extent_cache.c762
-rw-r--r--fs/f2fs/f2fs.h336
-rw-r--r--fs/f2fs/file.c760
-rw-r--r--fs/f2fs/gc.c159
-rw-r--r--fs/f2fs/gc.h8
-rw-r--r--fs/f2fs/inline.c74
-rw-r--r--fs/f2fs/inode.c130
-rw-r--r--fs/f2fs/namei.c133
-rw-r--r--fs/f2fs/node.c254
-rw-r--r--fs/f2fs/node.h10
-rw-r--r--fs/f2fs/recovery.c98
-rw-r--r--fs/f2fs/segment.c370
-rw-r--r--fs/f2fs/segment.h59
-rw-r--r--fs/f2fs/shrinker.c140
-rw-r--r--fs/f2fs/super.c352
-rw-r--r--fs/f2fs/xattr.c130
-rw-r--r--fs/f2fs/xattr.h2
-rw-r--r--fs/fat/dir.c16
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/file.c107
-rw-r--r--fs/freevxfs/vxfs_inode.c1
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/fs-writeback.c276
-rw-r--r--fs/fscache/cookie.c2
-rw-r--r--fs/fscache/netfs.c38
-rw-r--r--fs/fscache/object-list.c4
-rw-r--r--fs/fscache/page.c8
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/dir.c17
-rw-r--r--fs/fuse/file.c77
-rw-r--r--fs/fuse/fuse_i.h3
-rw-r--r--fs/fuse/inode.c4
-rw-r--r--fs/gfs2/acl.c4
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c13
-rw-r--r--fs/gfs2/dir.c182
-rw-r--r--fs/gfs2/file.c44
-rw-r--r--fs/gfs2/glock.c446
-rw-r--r--fs/gfs2/glock.h30
-rw-r--r--fs/gfs2/glops.c50
-rw-r--r--fs/gfs2/incore.h41
-rw-r--r--fs/gfs2/inode.c67
-rw-r--r--fs/gfs2/lock_dlm.c14
-rw-r--r--fs/gfs2/log.c3
-rw-r--r--fs/gfs2/lops.c25
-rw-r--r--fs/gfs2/main.c21
-rw-r--r--fs/gfs2/meta_io.c88
-rw-r--r--fs/gfs2/meta_io.h4
-rw-r--r--fs/gfs2/ops_fstype.c20
-rw-r--r--fs/gfs2/quota.c139
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/rgrp.c76
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c49
-rw-r--r--fs/gfs2/trace_gfs2.h34
-rw-r--r--fs/gfs2/trans.c8
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/gfs2/xattr.c73
-rw-r--r--fs/gfs2/xattr.h1
-rw-r--r--fs/hfs/bnode.c9
-rw-r--r--fs/hfs/brec.c20
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/hfs/super.c8
-rw-r--r--fs/hfsplus/bnode.c3
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hfsplus/posix_acl.c8
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hfsplus/xattr.c33
-rw-r--r--fs/hfsplus/xattr_security.c21
-rw-r--r--fs/hfsplus/xattr_trusted.c21
-rw-r--r--fs/hfsplus/xattr_user.c21
-rw-r--r--fs/hostfs/hostfs_kern.c30
-rw-r--r--fs/hpfs/buffer.c39
-rw-r--r--fs/hpfs/file.c9
-rw-r--r--fs/hpfs/hpfs_fn.h7
-rw-r--r--fs/hpfs/inode.c1
-rw-r--r--fs/hpfs/map.c26
-rw-r--r--fs/hpfs/namei.c32
-rw-r--r--fs/hpfs/super.c17
-rw-r--r--fs/hugetlbfs/inode.c396
-rw-r--r--fs/inode.c61
-rw-r--r--fs/internal.h12
-rw-r--r--fs/ioctl.c71
-rw-r--r--fs/isofs/inode.c3
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd/Kconfig30
-rw-r--r--fs/jbd/Makefile7
-rw-r--r--fs/jbd/checkpoint.c782
-rw-r--r--fs/jbd/commit.c1021
-rw-r--r--fs/jbd/journal.c2145
-rw-r--r--fs/jbd/recovery.c594
-rw-r--r--fs/jbd/revoke.c733
-rw-r--r--fs/jbd/transaction.c2237
-rw-r--r--fs/jbd2/checkpoint.c47
-rw-r--r--fs/jbd2/commit.c24
-rw-r--r--fs/jbd2/journal.c40
-rw-r--r--fs/jbd2/recovery.c26
-rw-r--r--fs/jbd2/revoke.c4
-rw-r--r--fs/jbd2/transaction.c96
-rw-r--r--fs/jffs2/background.c7
-rw-r--r--fs/jffs2/dir.c3
-rw-r--r--fs/jffs2/malloc.c27
-rw-r--r--fs/jffs2/readinode.c6
-rw-r--r--fs/jffs2/security.c30
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jffs2/wbuf.c5
-rw-r--r--fs/jffs2/xattr.c25
-rw-r--r--fs/jffs2/xattr_trusted.c26
-rw-r--r--fs/jffs2/xattr_user.c28
-rw-r--r--fs/jfs/acl.c8
-rw-r--r--fs/jfs/file.c7
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/jfs_inode.c4
-rw-r--r--fs/jfs/jfs_logmgr.c31
-rw-r--r--fs/jfs/jfs_metapage.c8
-rw-r--r--fs/jfs/namei.c58
-rw-r--r--fs/jfs/super.c5
-rw-r--r--fs/jfs/symlink.c5
-rw-r--r--fs/kernfs/dir.c78
-rw-r--r--fs/kernfs/inode.c4
-rw-r--r--fs/kernfs/symlink.c24
-rw-r--r--fs/libfs.c24
-rw-r--r--fs/lockd/clntproc.c13
-rw-r--r--fs/lockd/host.c8
-rw-r--r--fs/lockd/mon.c125
-rw-r--r--fs/lockd/netns.h4
-rw-r--r--fs/lockd/svc.c89
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/locks.c267
-rw-r--r--fs/logfs/Kconfig2
-rw-r--r--fs/logfs/dev_bdev.c16
-rw-r--r--fs/logfs/dir.c9
-rw-r--r--fs/logfs/inode.c6
-rw-r--r--fs/logfs/logfs.h7
-rw-r--r--fs/logfs/readwrite.c4
-rw-r--r--fs/logfs/segment.c4
-rw-r--r--fs/minix/inode.c6
-rw-r--r--fs/minix/itree_v1.c9
-rw-r--r--fs/minix/itree_v2.c9
-rw-r--r--fs/mpage.c44
-rw-r--r--fs/namei.c377
-rw-r--r--fs/namespace.c82
-rw-r--r--fs/ncpfs/dir.c4
-rw-r--r--fs/ncpfs/inode.c6
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c21
-rw-r--r--fs/nfs/blocklayout/blocklayout.h19
-rw-r--r--fs/nfs/blocklayout/dev.c9
-rw-r--r--fs/nfs/blocklayout/extent_tree.c19
-rw-r--r--fs/nfs/callback.c50
-rw-r--r--fs/nfs/callback.h12
-rw-r--r--fs/nfs/callback_proc.c61
-rw-r--r--fs/nfs/callback_xdr.c39
-rw-r--r--fs/nfs/client.c114
-rw-r--r--fs/nfs/delegation.c43
-rw-r--r--fs/nfs/delegation.h5
-rw-r--r--fs/nfs/dir.c49
-rw-r--r--fs/nfs/direct.c55
-rw-r--r--fs/nfs/file.c46
-rw-r--r--fs/nfs/filelayout/filelayout.c49
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c657
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h13
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c98
-rw-r--r--fs/nfs/inode.c180
-rw-r--r--fs/nfs/internal.h61
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs42.h3
-rw-r--r--fs/nfs/nfs42proc.c107
-rw-r--r--fs/nfs/nfs42xdr.c102
-rw-r--r--fs/nfs/nfs4_fs.h10
-rw-r--r--fs/nfs/nfs4client.c7
-rw-r--r--fs/nfs/nfs4file.c112
-rw-r--r--fs/nfs/nfs4idmap.c18
-rw-r--r--fs/nfs/nfs4proc.c639
-rw-r--r--fs/nfs/nfs4state.c17
-rw-r--r--fs/nfs/nfs4sysctl.c2
-rw-r--r--fs/nfs/nfs4trace.c1
-rw-r--r--fs/nfs/nfs4trace.h490
-rw-r--r--fs/nfs/nfs4xdr.c129
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/objlayout/objio_osd.c7
-rw-r--r--fs/nfs/pagelist.c132
-rw-r--r--fs/nfs/pnfs.c462
-rw-r--r--fs/nfs/pnfs.h105
-rw-r--r--fs/nfs/pnfs_nfs.c98
-rw-r--r--fs/nfs/read.c55
-rw-r--r--fs/nfs/super.c9
-rw-r--r--fs/nfs/symlink.c39
-rw-r--r--fs/nfs/write.c193
-rw-r--r--fs/nfs_common/grace.c23
-rw-r--r--fs/nfsd/blocklayout.c8
-rw-r--r--fs/nfsd/blocklayoutxdr.c2
-rw-r--r--fs/nfsd/blocklayoutxdr.h15
-rw-r--r--fs/nfsd/export.c73
-rw-r--r--fs/nfsd/export.h1
-rw-r--r--fs/nfsd/idmap.h4
-rw-r--r--fs/nfsd/lockd.c2
-rw-r--r--fs/nfsd/netns.h3
-rw-r--r--fs/nfsd/nfs2acl.c10
-rw-r--r--fs/nfsd/nfs3acl.c4
-rw-r--r--fs/nfsd/nfs3xdr.c6
-rw-r--r--fs/nfsd/nfs4acl.c8
-rw-r--r--fs/nfsd/nfs4callback.c128
-rw-r--r--fs/nfsd/nfs4idmap.c3
-rw-r--r--fs/nfsd/nfs4layouts.c73
-rw-r--r--fs/nfsd/nfs4proc.c95
-rw-r--r--fs/nfsd/nfs4recover.c24
-rw-r--r--fs/nfsd/nfs4state.c515
-rw-r--r--fs/nfsd/nfs4xdr.c187
-rw-r--r--fs/nfsd/nfscache.c32
-rw-r--r--fs/nfsd/nfsfh.c5
-rw-r--r--fs/nfsd/nfsfh.h43
-rw-r--r--fs/nfsd/nfssvc.c92
-rw-r--r--fs/nfsd/state.h53
-rw-r--r--fs/nfsd/trace.c2
-rw-r--r--fs/nfsd/trace.h43
-rw-r--r--fs/nfsd/vfs.c56
-rw-r--r--fs/nfsd/vfs.h12
-rw-r--r--fs/nfsd/xdr4.h12
-rw-r--r--fs/nilfs2/alloc.c308
-rw-r--r--fs/nilfs2/alloc.h1
-rw-r--r--fs/nilfs2/btree.c7
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/inode.c5
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c7
-rw-r--r--fs/nilfs2/recovery.c4
-rw-r--r--fs/nilfs2/segbuf.c7
-rw-r--r--fs/nilfs2/segment.c107
-rw-r--r--fs/nilfs2/segment.h3
-rw-r--r--fs/nilfs2/sufile.c11
-rw-r--r--fs/nilfs2/super.c26
-rw-r--r--fs/notify/dnotify/dnotify.c14
-rw-r--r--fs/notify/fanotify/fanotify_user.c8
-rw-r--r--fs/notify/fdinfo.c12
-rw-r--r--fs/notify/fsnotify.c11
-rw-r--r--fs/notify/fsnotify.h21
-rw-r--r--fs/notify/inode_mark.c43
-rw-r--r--fs/notify/inotify/inotify_user.c14
-rw-r--r--fs/notify/mark.c173
-rw-r--r--fs/notify/vfsmount_mark.c19
-rw-r--r--fs/nsfs.c11
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ntfs/super.c27
-rw-r--r--fs/ocfs2/acl.c26
-rw-r--r--fs/ocfs2/alloc.c163
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c56
-rw-r--r--fs/ocfs2/buffer_head_io.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c306
-rw-r--r--fs/ocfs2/cluster/nodemanager.c283
-rw-r--r--fs/ocfs2/dir.c70
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h11
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c82
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c71
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c27
-rw-r--r--fs/ocfs2/dlm/dlmthread.c13
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/dlmglue.c13
-rw-r--r--fs/ocfs2/extent_map.c22
-rw-r--r--fs/ocfs2/file.c83
-rw-r--r--fs/ocfs2/inode.c50
-rw-r--r--fs/ocfs2/inode.h4
-rw-r--r--fs/ocfs2/ioctl.c4
-rw-r--r--fs/ocfs2/journal.c135
-rw-r--r--fs/ocfs2/localalloc.c13
-rw-r--r--fs/ocfs2/locks.c13
-rw-r--r--fs/ocfs2/move_extents.c8
-rw-r--r--fs/ocfs2/namei.c192
-rw-r--r--fs/ocfs2/namei.h3
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/quota_local.c7
-rw-r--r--fs/ocfs2/refcounttree.c91
-rw-r--r--fs/ocfs2/resize.c15
-rw-r--r--fs/ocfs2/slot_map.c14
-rw-r--r--fs/ocfs2/stack_user.c9
-rw-r--r--fs/ocfs2/suballoc.c101
-rw-r--r--fs/ocfs2/super.c86
-rw-r--r--fs/ocfs2/super.h8
-rw-r--r--fs/ocfs2/symlink.c3
-rw-r--r--fs/ocfs2/xattr.c240
-rw-r--r--fs/open.c7
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/overlayfs/copy_up.c23
-rw-r--r--fs/overlayfs/inode.c72
-rw-r--r--fs/overlayfs/overlayfs.h3
-rw-r--r--fs/overlayfs/super.c6
-rw-r--r--fs/pipe.c18
-rw-r--r--fs/posix_acl.c43
-rw-r--r--fs/proc/array.c31
-rw-r--r--fs/proc/base.c166
-rw-r--r--fs/proc/fd.c15
-rw-r--r--fs/proc/generic.c44
-rw-r--r--fs/proc/inode.c24
-rw-r--r--fs/proc/meminfo.c12
-rw-r--r--fs/proc/namespaces.c10
-rw-r--r--fs/proc/page.c69
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/self.c18
-rw-r--r--fs/proc/task_mmu.c487
-rw-r--r--fs/proc/thread_self.c19
-rw-r--r--fs/proc_namespace.c27
-rw-r--r--fs/pstore/Kconfig2
-rw-r--r--fs/pstore/Makefile6
-rw-r--r--fs/pstore/ftrace.c25
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c47
-rw-r--r--fs/pstore/pmsg.c9
-rw-r--r--fs/pstore/ram.c19
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/qnx6/inode.c3
-rw-r--r--fs/quota/dquot.c106
-rw-r--r--fs/quota/netlink.c5
-rw-r--r--fs/quota/quota.c4
-rw-r--r--fs/quota/quota_v2.c4
-rw-r--r--fs/ramfs/file-nommu.c5
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c341
-rw-r--r--fs/reiserfs/inode.c8
-rw-r--r--fs/reiserfs/journal.c24
-rw-r--r--fs/reiserfs/namei.c70
-rw-r--r--fs/reiserfs/prints.c9
-rw-r--r--fs/reiserfs/procfs.c5
-rw-r--r--fs/reiserfs/super.c11
-rw-r--r--fs/reiserfs/xattr.c22
-rw-r--r--fs/reiserfs/xattr_acl.c8
-rw-r--r--fs/reiserfs/xattr_security.c23
-rw-r--r--fs/reiserfs/xattr_trusted.c22
-rw-r--r--fs/reiserfs/xattr_user.c21
-rw-r--r--fs/romfs/super.c5
-rw-r--r--fs/select.c6
-rw-r--r--fs/seq_file.c139
-rw-r--r--fs/splice.c18
-rw-r--r--fs/squashfs/inode.c2
-rw-r--r--fs/squashfs/super.c10
-rw-r--r--fs/squashfs/symlink.c3
-rw-r--r--fs/squashfs/xattr.c86
-rw-r--r--fs/stat.c4
-rw-r--r--fs/super.c179
-rw-r--r--fs/sync.c10
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysfs/group.c61
-rw-r--r--fs/sysfs/mount.c4
-rw-r--r--fs/sysv/inode.c17
-rw-r--r--fs/tracefs/inode.c6
-rw-r--r--fs/ubifs/Kconfig15
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/dir.c11
-rw-r--r--fs/ubifs/file.c52
-rw-r--r--fs/ubifs/key.h6
-rw-r--r--fs/ubifs/lpt.c6
-rw-r--r--fs/ubifs/misc.h9
-rw-r--r--fs/ubifs/recovery.c8
-rw-r--r--fs/ubifs/super.c17
-rw-r--r--fs/ubifs/tnc.c3
-rw-r--r--fs/ubifs/ubifs.h8
-rw-r--r--fs/ubifs/xattr.c59
-rw-r--r--fs/udf/balloc.c98
-rw-r--r--fs/udf/inode.c246
-rw-r--r--fs/udf/namei.c8
-rw-r--r--fs/udf/super.c24
-rw-r--r--fs/udf/symlink.c4
-rw-r--r--fs/udf/udfdecl.h5
-rw-r--r--fs/udf/unicode.c21
-rw-r--r--fs/ufs/Makefile2
-rw-r--r--fs/ufs/balloc.c8
-rw-r--r--fs/ufs/inode.c953
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/ufs/super.c38
-rw-r--r--fs/ufs/symlink.c42
-rw-r--r--fs/ufs/truncate.c523
-rw-r--r--fs/ufs/ufs.h17
-rw-r--r--fs/userfaultfd.c1332
-rw-r--r--fs/xattr.c232
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/kmem.h1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c38
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h9
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c35
-rw-r--r--fs/xfs/libxfs/xfs_attr.c149
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c8
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c43
-rw-r--r--fs/xfs/libxfs/xfs_bit.c (renamed from fs/xfs/xfs_bit.c)6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c109
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h15
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_btree.c89
-rw-r--r--fs/xfs/libxfs/xfs_btree.h42
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c37
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c42
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c8
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c9
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c21
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c41
-rw-r--r--fs/xfs/libxfs/xfs_format.h42
-rw-r--r--fs/xfs/libxfs/xfs_fs.h10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c16
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c27
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c20
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h1
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c39
-rw-r--r--fs/xfs/libxfs/xfs_shared.h1
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c12
-rw-r--r--fs/xfs/xfs_acl.c37
-rw-r--r--fs/xfs/xfs_acl.h8
-rw-r--r--fs/xfs/xfs_aops.c136
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c156
-rw-r--r--fs/xfs/xfs_buf.c54
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c26
-rw-r--r--fs/xfs/xfs_buf_item.h2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c13
-rw-r--r--fs/xfs/xfs_dquot.c37
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c105
-rw-r--r--fs/xfs/xfs_extfree_item.h26
-rw-r--r--fs/xfs/xfs_file.c210
-rw-r--r--fs/xfs/xfs_fsops.c6
-rw-r--r--fs/xfs/xfs_icache.c18
-rw-r--r--fs/xfs/xfs_inode.c174
-rw-r--r--fs/xfs/xfs_inode.h85
-rw-r--r--fs/xfs/xfs_inode_item.c12
-rw-r--r--fs/xfs/xfs_inode_item.h1
-rw-r--r--fs/xfs/xfs_ioctl.c23
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c85
-rw-r--r--fs/xfs/xfs_iops.c26
-rw-r--r--fs/xfs/xfs_itable.c3
-rw-r--r--fs/xfs/xfs_linux.h7
-rw-r--r--fs/xfs/xfs_log.c231
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c8
-rw-r--r--fs/xfs/xfs_log_priv.h56
-rw-r--r--fs/xfs/xfs_log_recover.c809
-rw-r--r--fs/xfs/xfs_message.c7
-rw-r--r--fs/xfs/xfs_mount.c49
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_pnfs.c5
-rw-r--r--fs/xfs/xfs_qm.c16
-rw-r--r--fs/xfs/xfs_rtalloc.c58
-rw-r--r--fs/xfs/xfs_stats.c93
-rw-r--r--fs/xfs/xfs_stats.h36
-rw-r--r--fs/xfs/xfs_super.c83
-rw-r--r--fs/xfs/xfs_symlink.c21
-rw-r--r--fs/xfs/xfs_sysctl.c15
-rw-r--r--fs/xfs/xfs_sysfs.c213
-rw-r--r--fs/xfs/xfs_sysfs.h1
-rw-r--r--fs/xfs/xfs_trace.h63
-rw-r--r--fs/xfs/xfs_trans.c21
-rw-r--r--fs/xfs/xfs_trans.h9
-rw-r--r--fs/xfs/xfs_trans_ail.c13
-rw-r--r--fs/xfs/xfs_trans_dquot.c14
-rw-r--r--fs/xfs/xfs_trans_extfree.c32
-rw-r--r--fs/xfs/xfs_trans_inode.c9
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_xattr.c184
736 files changed, 31787 insertions, 43288 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ff7be98..9619cca 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -10,10 +10,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dentry.o \
v9fs.o \
fid.o \
- xattr.o \
- xattr_user.o \
- xattr_trusted.o
+ xattr.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
-9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 31c0103..9da967f 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
return 0;
}
/* get the default/access acl values and cache them */
- dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
- pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+ dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT);
+ pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS);
if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
@@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
goto err_free_out;
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -212,42 +212,22 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
return 0;
}
-static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_get(dentry, full_name, buffer, size);
-}
-
-static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct v9fs_session_info *v9ses;
struct posix_acl *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* We allow set/get/list of acl when access=client is not specified
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+ return v9fs_xattr_get(dentry, handler->name, buffer, size);
- acl = v9fs_get_cached_acl(d_inode(dentry), type);
+ acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -258,46 +238,23 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
return error;
}
-static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_set(dentry, full_name, value, size, flags);
-}
-
-
-static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
+static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
int retval;
struct posix_acl *acl;
struct v9fs_session_info *v9ses;
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* set the attribute on the remote. Without even looking at the
* xattr value. We leave it to the server to validate
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_set_acl(dentry, name,
- value, size, flags, type);
+ return v9fs_xattr_set(dentry, handler->name, value, size,
+ flags);
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -316,9 +273,8 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
} else
acl = NULL;
- switch (type) {
+ switch (handler->flags) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
umode_t mode = inode->i_mode;
retval = posix_acl_equiv_mode(acl, &mode);
@@ -349,7 +305,6 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
if (!S_ISDIR(inode->i_mode)) {
retval = acl ? -EINVAL : 0;
goto err_out;
@@ -358,23 +313,23 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, name, value, size, flags);
+ retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
if (!retval)
- set_cached_acl(inode, type, acl);
+ set_cached_acl(inode, handler->flags, acl);
err_out:
posix_acl_release(acl);
return retval;
}
const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
};
const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a69260f..103ca5e 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -243,14 +243,14 @@ void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
if (!v9inode->fscache)
return;
- spin_lock(&v9inode->fscache_lock);
+ mutex_lock(&v9inode->fscache_lock);
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
v9fs_cache_inode_flush_cookie(inode);
else
v9fs_cache_inode_get_cookie(inode);
- spin_unlock(&v9inode->fscache_lock);
+ mutex_unlock(&v9inode->fscache_lock);
}
void v9fs_cache_inode_reset_cookie(struct inode *inode)
@@ -264,7 +264,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
old = v9inode->fscache;
- spin_lock(&v9inode->fscache_lock);
+ mutex_lock(&v9inode->fscache_lock);
fscache_relinquish_cookie(v9inode->fscache, 1);
v9ses = v9fs_inode2v9ses(inode);
@@ -274,7 +274,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
inode, old, v9inode->fscache);
- spin_unlock(&v9inode->fscache_lock);
+ mutex_unlock(&v9inode->fscache_lock);
}
int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 2f96754..247e47e 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -21,6 +21,7 @@
*/
#ifndef _9P_CACHE_H
+#define _9P_CACHE_H
#ifdef CONFIG_9P_FSCACHE
#include <linux/fscache.h>
#include <linux/spinlock.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8aa56bb..072e759 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -52,7 +52,7 @@ enum {
/* Options that take integer arguments */
Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
/* String options */
- Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
+ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag,
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
sizeof(struct v9fs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
v9fs_inode_init_once);
if (!v9fs_inode_cache)
return -ENOMEM;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 0923f2c..6877050 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -123,7 +123,7 @@ struct v9fs_session_info {
struct v9fs_inode {
#ifdef CONFIG_9P_FSCACHE
- spinlock_t fscache_lock;
+ struct mutex fscache_lock;
struct fscache_cookie *fscache;
#endif
struct p9_qid qid;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1ef16bd..7bf835f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -161,7 +161,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
BUG();
- res = posix_lock_file_wait(filp, fl);
+ res = locks_lock_file_wait(filp, fl);
if (res < 0)
goto out;
@@ -231,7 +231,8 @@ out_unlock:
if (res < 0 && fl->fl_type != F_UNLCK) {
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
- res = posix_lock_file_wait(filp, fl);
+ /* Even if this fails we want to return the remote error */
+ locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
out:
@@ -381,7 +382,7 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err;
+ int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
iov_iter_count(to), iocb->ki_pos);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b1dc518..3a08b3e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -244,7 +244,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
return NULL;
#ifdef CONFIG_9P_FSCACHE
v9inode->fscache = NULL;
- spin_lock_init(&v9inode->fscache_lock);
+ mutex_init(&v9inode->fscache_lock);
#endif
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
@@ -451,9 +451,9 @@ void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- truncate_inode_pages_final(inode->i_mapping);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- filemap_fdatawrite(inode->i_mapping);
+ filemap_fdatawrite(&inode->i_data);
v9fs_cache_inode_put_cookie(inode);
/* clunk the fid stashed in writeback_fid */
@@ -1223,18 +1223,26 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
}
/**
- * v9fs_vfs_follow_link - follow a symlink path
+ * v9fs_vfs_get_link - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: delayed call for when we are done with the return value
*/
-static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *v9fs_vfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid;
struct p9_wstat *st;
char *res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ fid = v9fs_fid_lookup(dentry);
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (IS_ERR(fid))
@@ -1259,7 +1267,8 @@ static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
p9stat_free(st);
kfree(st);
- return *cookie = res;
+ set_delayed_call(done, kfree_link, res);
+ return res;
}
/**
@@ -1368,9 +1377,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/* build extension */
if (S_ISBLK(mode))
sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
@@ -1455,8 +1461,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
static const struct inode_operations v9fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e8aa57d..a34702c 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -829,9 +829,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
@@ -902,26 +899,34 @@ error:
}
/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * v9fs_vfs_get_link_dotl - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: destructor for return value
*/
static const char *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
+v9fs_vfs_get_link_dotl(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct p9_fid *fid;
char *target;
int retval;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
if (retval)
return ERR_PTR(retval);
- return *cookie = target;
+ set_delayed_call(done, kfree_link, target);
+ return target;
}
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -987,8 +992,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link_dotl,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link_dotl,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.setxattr = generic_setxattr,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 0cf44b6..9dd9b47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -137,6 +137,44 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
}
+static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+static struct xattr_handler v9fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+static struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+#ifdef CONFIG_9P_FS_SECURITY
+static struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+#endif
+
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index d3e2ea3..c63c3be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -19,9 +19,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern struct xattr_handler v9fs_xattr_user_handler;
-extern struct xattr_handler v9fs_xattr_trusted_handler;
-extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
deleted file mode 100644
index cb247a1..0000000
--- a/fs/9p/xattr_security.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = v9fs_xattr_security_get,
- .set = v9fs_xattr_security_set,
-};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
deleted file mode 100644
index e30d33b..0000000
--- a/fs/9p/xattr_trusted.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .get = v9fs_xattr_trusted_get,
- .set = v9fs_xattr_trusted_set,
-};
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
deleted file mode 100644
index d0b701b..0000000
--- a/fs/9p/xattr_user.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = v9fs_xattr_user_get,
- .set = v9fs_xattr_user_set,
-};
diff --git a/fs/Kconfig b/fs/Kconfig
index 011f433..9adee0d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -11,18 +11,15 @@ config DCACHE_WORD_ACCESS
if BLOCK
source "fs/ext2/Kconfig"
-source "fs/ext3/Kconfig"
source "fs/ext4/Kconfig"
-source "fs/jbd/Kconfig"
source "fs/jbd2/Kconfig"
config FS_MBCACHE
# Meta block cache for Extended Attributes (ext2/ext3/ext4)
tristate
default y if EXT2_FS=y && EXT2_FS_XATTR
- default y if EXT3_FS=y && EXT3_FS_XATTR
default y if EXT4_FS=y
- default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
+ default m if EXT2_FS_XATTR || EXT4_FS
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
@@ -49,6 +46,13 @@ config FS_DAX
or if unsure, say N. Saying Y will increase the size of the kernel
by about 5kB.
+config FS_DAX_PMD
+ bool
+ default FS_DAX
+ depends on FS_DAX
+ depends on ZONE_DEVICE
+ depends on TRANSPARENT_HUGEPAGE
+
endif # BLOCK
# Posix ACL utility routines
@@ -70,6 +74,16 @@ config FILE_LOCKING
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
+config MANDATORY_FILE_LOCKING
+ bool "Enable Mandatory file locking"
+ depends on FILE_LOCKING
+ default y
+ help
+ This option enables files appropriately marked files on appropriely
+ mounted filesystems to support mandatory locking.
+
+ To the best of my knowledge this is dead code that no one cares about.
+
source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index cb20e4b..79f5225 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
@@ -62,12 +63,11 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
-obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT2_FS) += ext2/
-# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
-# unless explicitly requested by rootfstype
obj-$(CONFIG_EXT4_FS) += ext4/
-obj-$(CONFIG_JBD) += jbd/
+# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
+# ext2 driver, which doesn't know about journalling! Explicitly request ext2
+# by giving the rootfstype= parameter.
+obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_SQUASHFS) += squashfs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 24575d9..ea4aba5 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -45,7 +45,7 @@ struct adfs_dir_ops;
struct adfs_sb_info {
union { struct {
struct adfs_discmap *s_map; /* bh list containing map */
- struct adfs_dir_ops *s_dir; /* directory operations */
+ const struct adfs_dir_ops *s_dir; /* directory operations */
};
struct rcu_head rcu; /* used only at shutdown time */
};
@@ -168,8 +168,8 @@ void __adfs_error(struct super_block *sb, const char *function,
extern const struct inode_operations adfs_dir_inode_operations;
extern const struct file_operations adfs_dir_operations;
extern const struct dentry_operations adfs_dentry_operations;
-extern struct adfs_dir_ops adfs_f_dir_ops;
-extern struct adfs_dir_ops adfs_fplus_dir_ops;
+extern const struct adfs_dir_ops adfs_f_dir_ops;
+extern const struct adfs_dir_ops adfs_fplus_dir_ops;
extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
int wait);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 51c279a..fd4cf2c 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -21,7 +21,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
struct adfs_dir dir;
int ret = 0;
@@ -69,7 +69,7 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
{
int ret = -EINVAL;
#ifdef CONFIG_ADFS_FS_RW
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
@@ -129,7 +129,7 @@ static int
adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj)
{
struct super_block *sb = inode->i_sb;
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
int ret;
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 4bbe853..0fbfd0b 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -476,7 +476,7 @@ adfs_f_free(struct adfs_dir *dir)
dir->sb = NULL;
}
-struct adfs_dir_ops adfs_f_dir_ops = {
+const struct adfs_dir_ops adfs_f_dir_ops = {
.read = adfs_f_read,
.setpos = adfs_f_setpos,
.getnext = adfs_f_getnext,
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 82d14cd..c92cfb6 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -256,7 +256,7 @@ adfs_fplus_free(struct adfs_dir *dir)
dir->sb = NULL;
}
-struct adfs_dir_ops adfs_fplus_dir_ops = {
+const struct adfs_dir_ops adfs_fplus_dir_ops = {
.read = adfs_fplus_read,
.setpos = adfs_fplus_setpos,
.getnext = adfs_fplus_getnext,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df..c9fdfb1 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (adfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c69a87e..cc2b2ef 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -138,7 +138,7 @@ extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh);
extern int affs_remove_header(struct dentry *dentry);
extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
-extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
+extern void secs_to_datestamp(time64_t secs, struct affs_date *ds);
extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
__printf(3, 4)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 5fa92bc..d6c7a51 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -8,6 +8,7 @@
* Please send bug reports to: hjw@zvw.de
*/
+#include <linux/math64.h>
#include "affs.h"
/*
@@ -366,22 +367,22 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh)
}
void
-secs_to_datestamp(time_t secs, struct affs_date *ds)
+secs_to_datestamp(time64_t secs, struct affs_date *ds)
{
u32 days;
u32 minute;
+ s32 rem;
secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60);
if (secs < 0)
secs = 0;
- days = secs / 86400;
- secs -= days * 86400;
- minute = secs / 60;
- secs -= minute * 60;
+ days = div_s64_rem(secs, 86400, &rem);
+ minute = rem / 60;
+ rem -= minute * 60;
ds->days = cpu_to_be32(days);
ds->mins = cpu_to_be32(minute);
- ds->ticks = cpu_to_be32(secs * 50);
+ ds->ticks = cpu_to_be32(rem * 50);
}
umode_t
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 1734950..0fdb0f5 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
case ST_SOFTLINK:
inode->i_mode |= S_IFLNK;
+ inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
inode->i_data.a_ops = &affs_symlink_aops;
break;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 181e05b..00d3002 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
return -ENOSPC;
inode->i_op = &affs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &affs_symlink_aops;
inode->i_mode = S_IFLNK | 0777;
mode_to_prot(inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e..2a6713b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -31,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait)
struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
lock_buffer(bh);
- secs_to_datestamp(get_seconds(), &tail->disk_change);
+ secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change);
affs_fix_checksum(sb, bh);
unlock_buffer(bh);
@@ -131,7 +132,7 @@ static int __init init_inodecache(void)
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (affs_inode_cachep == NULL)
return -ENOMEM;
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = sb->s_bdev->bd_inode->i_size >> 9;
+ size = i_size_read(sb->s_bdev->bd_inode) >> 9;
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
- i = 512;
- j = 4096;
+ i = bdev_logical_block_size(sb->s_bdev);
+ j = PAGE_SIZE;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
+
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ea5b69a..69b03db 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -14,13 +14,13 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
{
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
- char *link = kmap(page);
+ char *link = page_address(page);
struct slink_front *lf;
int i, j;
char c;
char lc;
- pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
+ pr_debug("get_link(ino=%lu)\n", inode->i_ino);
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
@@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
link[i] = '\0';
affs_brelse(bh);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
@@ -73,7 +71,6 @@ const struct address_space_operations affs_symlink_aops = {
const struct inode_operations affs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = affs_notify_change,
};
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e06f5a2..86cc726 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
case AFS_FTYPE_SYMLINK:
inode->i_mode = S_IFLNK | vnode->status.mode;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
default:
printk("kAFS: AFS vnode with undefined type\n");
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 24a905b..2853b40 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -230,14 +230,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
if (size <= 1 || size >= PAGE_SIZE)
return -EINVAL;
- kbuf = kmalloc(size + 1, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, size) != 0)
- goto done;
- kbuf[size] = 0;
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/* trim to first NL */
name = memchr(kbuf, '\n', size);
@@ -315,15 +310,9 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (size <= 1 || size >= PAGE_SIZE)
return -EINVAL;
- ret = -ENOMEM;
- kbuf = kmalloc(size + 1, GFP_KERNEL);
- if (!kbuf)
- goto nomem;
-
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, size) != 0)
- goto infault;
- kbuf[size] = 0;
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/* trim to first NL */
s = memchr(kbuf, '\n', size);
@@ -337,9 +326,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (ret >= 0)
ret = size; /* consume everything, always */
-infault:
kfree(kbuf);
-nomem:
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a51..81afefe 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
afs_inode_cachep = kmem_cache_create("afs_inode_cache",
sizeof(struct afs_vnode),
0,
- SLAB_HWCACHE_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
afs_i_init_once);
if (!afs_inode_cachep) {
printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
diff --git a/fs/aio.c b/fs/aio.c
index 480440f..155f842 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_flags |= VM_DONTEXPAND;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
+ struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
return res;
}
+static const struct vm_operations_struct aio_ring_vm_ops = {
+ .mremap = aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_ops = &aio_ring_vm_ops;
+ return 0;
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
- .mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index da0c334..84e037d 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,10 +12,16 @@
#include "autofs_i.h"
-static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
+static const char *autofs4_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi;
+ struct autofs_info *ino;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ sbi = autofs4_sbi(dentry->d_sb);
+ ino = autofs4_dentry_ino(dentry);
if (ino && !autofs4_oz_mode(sbi))
ino->last_used = jiffies;
return d_inode(dentry)->i_private;
@@ -23,5 +29,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
const struct inode_operations autofs4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = autofs4_follow_link
+ .get_link = autofs4_get_link
};
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 861b1e1..103f5d7 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -192,7 +192,7 @@ EXPORT_SYMBOL(make_bad_inode);
* Returns true if the inode in question has been marked as bad.
*/
-int is_bad_inode(struct inode *inode)
+bool is_bad_inode(struct inode *inode)
{
return (inode->i_op == &bad_inode_ops);
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 46aedac..cc0e082 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static const char *befs_follow_link(struct dentry *, void **);
+static int befs_symlink_readpage(struct file *, struct page *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = {
.bmap = befs_bmap,
};
-static const struct inode_operations befs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = befs_follow_link,
- .put_link = kfree_put_link,
+static const struct address_space_operations befs_symlink_aops = {
+ .readpage = befs_symlink_readpage,
};
/*
@@ -398,7 +396,9 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &befs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
- inode->i_op = &befs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &befs_symlink_aops;
} else {
inode->i_link = befs_ino->i_data.symlink;
inode->i_op = &simple_symlink_inode_operations;
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
sizeof (struct befs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (befs_inode_cachep == NULL) {
pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
@@ -463,31 +463,33 @@ befs_destroy_inodecache(void)
* The data stream become link name. Unless the LONG_SYMLINK
* flag is set.
*/
-static const char *
-befs_follow_link(struct dentry *dentry, void **cookie)
+static int befs_symlink_readpage(struct file *unused, struct page *page)
{
- struct super_block *sb = dentry->d_sb;
- struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link;
+ char *link = page_address(page);
- if (len == 0) {
+ if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
- return ERR_PTR(-EIO);
+ goto fail;
}
befs_debug(sb, "Follow long symlink");
- link = kmalloc(len, GFP_NOFS);
- if (!link)
- return ERR_PTR(-ENOMEM);
if (befs_read_lsymlink(sb, data, link, len) != len) {
- kfree(link);
befs_error(sb, "Failed to read entire long symlink");
- return ERR_PTR(-EIO);
+ goto fail;
}
link[len - 1] = '\0';
- return *cookie = link;
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+fail:
+ SetPageError(page);
+ unlock_page(page);
+ return -EIO;
}
/*
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d6..1e5c896 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (bfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6b65996..3a93755 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <linux/sched.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -487,7 +488,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
}
/**
- * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * arch_check_elf() - check an ELF executable
* @ehdr: The main ELF header
* @has_interp: True if the ELF has an interpreter, else false.
* @state: Architecture-specific state preserved throughout the process
@@ -759,16 +760,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
would_dump(bprm, interpreter);
- retval = kernel_read(interpreter, 0, bprm->buf,
- BINPRM_BUF_SIZE);
- if (retval != BINPRM_BUF_SIZE) {
+ /* Get the exec headers */
+ retval = kernel_read(interpreter, 0,
+ (void *)&loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex));
+ if (retval != sizeof(loc->interp_elf_ex)) {
if (retval >= 0)
retval = -EIO;
goto out_free_dentry;
}
- /* Get the exec headers */
- loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
elf_ppnt++;
@@ -1236,6 +1237,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
if (vma->vm_flags & VM_DONTDUMP)
return 0;
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
/* Hugetlb memory check */
if (vma->vm_flags & VM_HUGETLB) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bf..b1adb92 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -35,6 +35,7 @@
#include <linux/elf-fdpic.h>
#include <linux/elfcore.h>
#include <linux/coredump.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
@@ -103,19 +104,36 @@ static void __exit exit_elf_fdpic_binfmt(void)
core_initcall(init_elf_fdpic_binfmt);
module_exit(exit_elf_fdpic_binfmt);
-static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
+static int is_elf(struct elfhdr *hdr, struct file *file)
{
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
return 0;
if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)
return 0;
- if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
+ if (!elf_check_arch(hdr))
return 0;
if (!file->f_op->mmap)
return 0;
return 1;
}
+#ifndef elf_check_fdpic
+#define elf_check_fdpic(x) 0
+#endif
+
+#ifndef elf_check_const_displacement
+#define elf_check_const_displacement(x) 0
+#endif
+
+static int is_constdisp(struct elfhdr *hdr)
+{
+ if (!elf_check_fdpic(hdr))
+ return 1;
+ if (elf_check_const_displacement(hdr))
+ return 1;
+ return 0;
+}
+
/*****************************************************************************/
/*
* read the program headers table into memory
@@ -191,8 +209,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* check that this is a binary we know how to deal with */
retval = -ENOEXEC;
- if (!is_elf_fdpic(&exec_params.hdr, bprm->file))
+ if (!is_elf(&exec_params.hdr, bprm->file))
+ goto error;
+ if (!elf_check_fdpic(&exec_params.hdr)) {
+#ifdef CONFIG_MMU
+ /* binfmt_elf handles non-fdpic elf except on nommu */
goto error;
+#else
+ /* nommu can only load ET_DYN (PIE) ELF */
+ if (exec_params.hdr.e_type != ET_DYN)
+ goto error;
+#endif
+ }
/* read the program header table */
retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file);
@@ -269,13 +297,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
}
- if (elf_check_const_displacement(&exec_params.hdr))
+ if (is_constdisp(&exec_params.hdr))
exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* perform insanity checks on the interpreter */
if (interpreter_name) {
retval = -ELIBBAD;
- if (!is_elf_fdpic(&interp_params.hdr, interpreter))
+ if (!is_elf(&interp_params.hdr, interpreter))
goto error;
interp_params.flags = ELF_FDPIC_FLAG_PRESENT;
@@ -306,9 +334,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = -ENOEXEC;
if (stack_size == 0)
- goto error;
+ stack_size = 131072UL; /* same as exec.c's default commit */
- if (elf_check_const_displacement(&interp_params.hdr))
+ if (is_constdisp(&interp_params.hdr))
interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* flush all traces of the currently running executable */
@@ -319,7 +347,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
- set_personality(PER_LINUX_FDPIC);
+ if (elf_check_fdpic(&exec_params.hdr))
+ set_personality(PER_LINUX_FDPIC);
+ else
+ set_personality(PER_LINUX);
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -374,10 +405,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
PAGE_ALIGN(current->mm->start_brk);
#else
- /* create a stack and brk area big enough for everyone
- * - the brk heap starts at the bottom and works up
- * - the stack starts at the top and works down
- */
+ /* create a stack area and zero-size brk area */
stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
if (stack_size < PAGE_SIZE * 2)
stack_size = PAGE_SIZE * 2;
@@ -400,8 +428,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->brk = current->mm->start_brk;
current->mm->context.end_brk = current->mm->start_brk;
- current->mm->context.end_brk +=
- (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
@@ -1206,6 +1232,20 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
return 0;
}
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if (vma->vm_flags & VM_SHARED) {
+ dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ } else {
+ dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ }
+ return dump_ok;
+ }
+
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
if (file_inode(vma->vm_file)->i_nlink == 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1982437..ba762ea 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -49,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-static void bdev_write_inode(struct inode *inode)
+static void bdev_write_inode(struct block_device *bdev)
{
+ struct inode *inode = bdev->bd_inode;
+ int ret;
+
spin_lock(&inode->i_lock);
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
- WARN_ON_ONCE(write_inode_now(inode, true));
+ ret = write_inode_now(inode, true);
+ if (ret) {
+ char name[BDEVNAME_SIZE];
+ pr_warn_ratelimited("VFS: Dirty inode writeback failed "
+ "for block device %s (err=%d).\n",
+ bdevname(bdev, name), ret);
+ }
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
@@ -146,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
+static struct inode *bdev_file_inode(struct file *file)
+{
+ return file->f_mapping->host;
+}
+
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
@@ -328,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
*/
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t retval;
mutex_lock(&bd_inode->i_mutex);
@@ -339,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence)
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
- struct inode *bd_inode = filp->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
int error;
@@ -380,9 +395,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
+ int result = -EOPNOTSUPP;
+
if (!ops->rw_page || bdev_get_integrity(bdev))
- return -EOPNOTSUPP;
- return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ return result;
+
+ result = blk_queue_enter(bdev->bd_queue, false);
+ if (result)
+ return result;
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ blk_queue_exit(bdev->bd_queue);
+ return result;
}
EXPORT_SYMBOL_GPL(bdev_read_page);
@@ -411,14 +434,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
+
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
+ result = blk_queue_enter(bdev->bd_queue, false);
+ if (result)
+ return result;
+
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
if (result)
end_page_writeback(page);
else
unlock_page(page);
+ blk_queue_exit(bdev->bd_queue);
return result;
}
EXPORT_SYMBOL_GPL(bdev_write_page);
@@ -426,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
/**
* bdev_direct_access() - Get the address for directly-accessibly memory
* @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
*
* If a block device is made up of directly addressable memory, this function
* will tell the caller the PFN and the address of the memory. The address
@@ -440,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* Return: negative errno if an error occurs, otherwise the number of bytes
* accessible at this address.
*/
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
{
- long avail;
+ sector_t sector = dax->sector;
+ long avail, size = dax->size;
const struct block_device_operations *ops = bdev->bd_disk->fops;
/*
@@ -462,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn, size);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
if (!avail)
return -ERANGE;
+ if (avail > 0 && avail & ~PAGE_MASK)
+ return -ENXIO;
return min(avail, size);
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
@@ -566,7 +594,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
@@ -672,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode)
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
spin_unlock(&bdev_lock);
return bdev;
}
@@ -688,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode)
* So, we can access it via ->i_mapping always
* without igrab().
*/
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -711,7 +739,7 @@ void bd_forget(struct inode *inode)
spin_unlock(&bdev_lock);
if (bdev)
- iput(bdev->bd_inode);
+ bdput(bdev);
}
/**
@@ -1018,12 +1046,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
static void flush_disk(struct block_device *bdev, bool kill_dirty)
{
if (__invalidate_device(bdev, kill_dirty)) {
- char name[BDEVNAME_SIZE] = "";
-
- if (bdev->bd_disk)
- disk_name(bdev->bd_disk, 0, name);
printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n", name);
+ "resized disk %s\n",
+ bdev->bd_disk ? bdev->bd_disk->disk_name : "");
}
if (!bdev->bd_disk)
@@ -1047,12 +1072,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
- char name[BDEVNAME_SIZE];
-
- disk_name(disk, 0, name);
printk(KERN_INFO
"%s: detected capacity change from %lld to %lld\n",
- name, bdev_size, disk_size);
+ disk->disk_name, bdev_size, disk_size);
i_size_write(bdev->bd_inode, disk_size);
flush_disk(bdev, false);
}
@@ -1074,7 +1096,7 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
-
+ blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1206,8 +1228,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret)
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ if (!blkdev_dax_capable(bdev))
+ bdev->bd_inode->i_flags &= ~S_DAX;
+ }
/*
* If the device is invalidated, rescan partition
@@ -1221,6 +1246,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev);
}
+
if (ret)
goto out_clear;
} else {
@@ -1241,6 +1267,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ if (!blkdev_dax_capable(bdev))
+ bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
if (bdev->bd_contains == bdev) {
@@ -1492,11 +1520,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
+
+ bdev_write_inode(bdev);
/*
- * ->release can cause the queue to disappear, so flush all
- * dirty data before.
+ * Detaching bdev inode from its wb in __destroy_inode()
+ * is too late: the queue which embeds its bdi (along with
+ * root wb) can be gone as soon as we put_disk() below.
*/
- bdev_write_inode(bdev->bd_inode);
+ inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -1571,14 +1602,14 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
- struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
blkdev_put(bdev, filp->f_mode);
return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
@@ -1603,7 +1634,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
ssize_t ret;
@@ -1635,7 +1666,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
@@ -1674,13 +1705,101 @@ static const struct address_space_operations def_blk_aops = {
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#ifdef CONFIG_FS_DAX
+/*
+ * In the raw block case we do not need to contend with truncation nor
+ * unwritten file extents. Without those concerns there is no need for
+ * additional locking beyond the mmap_sem context that these routines
+ * are already executing under.
+ *
+ * Note, there is no protection if the block device is dynamically
+ * resized (partition grow/shrink) during a fault. A stable block device
+ * size is already not enforced in the blkdev_direct_IO path.
+ *
+ * For DAX, it is the responsibility of the block device driver to
+ * ensure the whole-disk device size is stable while requests are in
+ * flight.
+ *
+ * Finally, unlike the filemap_page_mkwrite() case there is no
+ * filesystem superblock to sync against freezing. We still include a
+ * pfn_mkwrite callback for dax drivers to receive write fault
+ * notifications.
+ */
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+}
+
+static void blkdev_vm_open(struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count++;
+ mutex_unlock(&bd_inode->i_mutex);
+}
+
+static void blkdev_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(vma->vm_file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count--;
+ mutex_unlock(&bd_inode->i_mutex);
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+ .open = blkdev_vm_open,
+ .close = blkdev_vm_close,
+ .fault = blkdev_dax_fault,
+ .pmd_fault = blkdev_dax_pmd_fault,
+ .pfn_mkwrite = blkdev_dax_fault,
+};
+
+static const struct vm_operations_struct blkdev_default_vm_ops = {
+ .open = blkdev_vm_open,
+ .close = blkdev_vm_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(bd_inode);
+
+ file_accessed(file);
+ mutex_lock(&bd_inode->i_mutex);
+ bdev->bd_map_count++;
+ if (IS_DAX(bd_inode)) {
+ vma->vm_ops = &blkdev_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ } else {
+ vma->vm_ops = &blkdev_default_vm_ops;
+ }
+ mutex_unlock(&bd_inode->i_mutex);
+
+ return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = generic_file_mmap,
+ .mmap = blkdev_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
@@ -1769,7 +1888,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
struct inode *inode, *old_inode = NULL;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
@@ -1781,13 +1900,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock. We cannot iput the inode now as we can
+ * s_inode_list_lock We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
@@ -1795,8 +1914,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
func(I_BDEV(inode), arg);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b9..128ce17 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o
+ uuid-tree.o props.o hash.o free-space-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
- tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
+ tests/free-space-tree-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a..6d263bb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
+ value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
@@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
@@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EINVAL : 0;
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
@@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = kmalloc(size, GFP_KERNEL);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 1ce06c84..88d9af3 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -42,8 +42,14 @@ struct __btrfs_workqueue {
/* Thresholding related variants */
atomic_t pending;
- int max_active;
- int current_max;
+
+ /* Up limit of concurrency workers */
+ int limit_active;
+
+ /* Current number of concurrency workers */
+ int current_active;
+
+ /* Threshold to change current_active */
int thresh;
unsigned int count;
spinlock_t thres_lock;
@@ -88,34 +94,39 @@ BTRFS_WORK_HELPER(scrubnc_helper);
BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
- ret->max_active = max_active;
+ ret->limit_active = limit_active;
atomic_set(&ret->pending, 0);
if (thresh == 0)
thresh = DFT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
if (thresh < DFT_THRESHOLD) {
- ret->current_max = max_active;
+ ret->current_active = limit_active;
ret->thresh = NO_THRESHOLD;
} else {
- ret->current_max = 1;
+ /*
+ * For threshold-able wq, let its concurrency grow on demand.
+ * Use minimal max_active at alloc time to reduce resource
+ * usage.
+ */
+ ret->current_active = 1;
ret->thresh = thresh;
}
if (flags & WQ_HIGHPRI)
ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->max_active,
- "btrfs", name);
+ ret->current_active, "btrfs",
+ name);
else
ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->max_active, "btrfs",
+ ret->current_active, "btrfs",
name);
if (!ret->normal_wq) {
kfree(ret);
@@ -134,23 +145,23 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
- int max_active,
+ int limit_active,
int thresh)
{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
- max_active, thresh);
+ limit_active, thresh);
if (!ret->normal) {
kfree(ret);
return NULL;
}
if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
@@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
*/
static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- int new_max_active;
+ int new_current_active;
long pending;
int need_change = 0;
@@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
wq->count %= (wq->thresh / 4);
if (!wq->count)
goto out;
- new_max_active = wq->current_max;
+ new_current_active = wq->current_active;
/*
* pending may be changed later, but it's OK since we really
@@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
*/
pending = atomic_read(&wq->pending);
if (pending > wq->thresh)
- new_max_active++;
+ new_current_active++;
if (pending < wq->thresh / 2)
- new_max_active--;
- new_max_active = clamp_val(new_max_active, 1, wq->max_active);
- if (new_max_active != wq->current_max) {
+ new_current_active--;
+ new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
+ if (new_current_active != wq->current_active) {
need_change = 1;
- wq->current_max = new_max_active;
+ wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
if (need_change) {
- workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ workqueue_set_max_active(wq->normal_wq, wq->current_active);
}
}
@@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
kfree(wq);
}
-void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
if (!wq)
return;
- wq->normal->max_active = max;
+ wq->normal->limit_active = limit_active;
if (wq->high)
- wq->high->max_active = max;
+ wq->high->limit_active = limit_active;
}
void btrfs_set_work_high_priority(struct btrfs_work *work)
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index b0b093b..ad4d064 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
- int max_active,
+ int limit_active,
int thresh);
void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
btrfs_func_t func,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 802fabb..08405a3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
return -ENOMEM;
ref->root_id = root_id;
- if (key)
+ if (key) {
ref->key_for_search = *key;
- else
+ /*
+ * We can often find data backrefs with an offset that is too
+ * large (>= LLONG_MAX, maximum allowed file offset) due to
+ * underflows when subtracting a file's offset with the data
+ * offset of its corresponding extent data item. This can
+ * happen for example in the clone ioctl.
+ * So if we detect such case we set the search key's offset to
+ * zero to make sure we will find the matching file extent item
+ * at add_all_parents(), otherwise we will miss it because the
+ * offset taken form the backref is much larger then the offset
+ * of the file extent item. This can make us scan a very large
+ * number of file extent items, but at least it will not make
+ * us miss any.
+ * This is an ugly workaround for a behaviour that should have
+ * never existed, but it does and a fix for the clone ioctl
+ * would touch a lot of places, cause backwards incompatibility
+ * and would not fix the problem for extents cloned with older
+ * kernels.
+ */
+ if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
+ ref->key_for_search.offset >= LLONG_MAX)
+ ref->key_for_search.offset = 0;
+ } else {
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+ }
ref->inode_list = NULL;
ref->level = level;
@@ -332,13 +355,19 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
index = srcu_read_lock(&fs_info->subvol_srcu);
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ root = btrfs_get_fs_root(fs_info, &root_key, false);
if (IS_ERR(root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
goto out;
}
+ if (btrfs_test_is_dummy_root(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -ENOENT;
+ goto out;
+ }
+
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
else if (time_seq == (u64)-1)
@@ -491,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1,
static int __add_missing_keys(struct btrfs_fs_info *fs_info,
struct list_head *head)
{
- struct list_head *pos;
+ struct __prelim_ref *ref;
struct extent_buffer *eb;
- list_for_each(pos, head) {
- struct __prelim_ref *ref;
- ref = list_entry(pos, struct __prelim_ref, list);
-
+ list_for_each_entry(ref, head, list) {
if (ref->parent)
continue;
if (ref->key_for_search.type)
@@ -534,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static void __merge_refs(struct list_head *head, int mode)
{
- struct list_head *pos1;
-
- list_for_each(pos1, head) {
- struct list_head *n2;
- struct list_head *pos2;
- struct __prelim_ref *ref1;
+ struct __prelim_ref *ref1;
- ref1 = list_entry(pos1, struct __prelim_ref, list);
+ list_for_each_entry(ref1, head, list) {
+ struct __prelim_ref *ref2 = ref1, *tmp;
- for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
- pos2 = n2, n2 = pos2->next) {
- struct __prelim_ref *ref2;
+ list_for_each_entry_safe_continue(ref2, tmp, head, list) {
struct __prelim_ref *xchg;
struct extent_inode_elem *eie;
- ref2 = list_entry(pos2, struct __prelim_ref, list);
-
if (!ref_for_same_block(ref1, ref2))
continue;
if (mode == 1) {
@@ -632,7 +650,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, NULL,
+ ret = __add_prelim_ref(prefs, 0, NULL,
ref->level + 1, ref->parent,
node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
@@ -664,11 +682,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_data_ref *ref;
ref = btrfs_delayed_node_to_data_ref(node);
-
- key.objectid = ref->objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = ref->offset;
- ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ret = __add_prelim_ref(prefs, 0, NULL, 0,
ref->parent, node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
break;
@@ -1809,7 +1823,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
int found = 0;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
- struct extent_buffer *leaf;
u32 item_size;
u32 cur_offset;
unsigned long ptr;
@@ -1837,9 +1850,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, slot);
- ptr = btrfs_item_ptr_offset(leaf, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
cur_offset = 0;
while (cur_offset < item_size) {
@@ -1853,7 +1865,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
if (ret)
break;
- cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += btrfs_inode_extref_name_len(eb, extref);
cur_offset += sizeof(*extref);
}
btrfs_tree_read_unlock_blocking(eb);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 81220b2..61205e3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,8 +44,6 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
-/* DIO is ready to submit */
-#define BTRFS_INODE_DIO_READY 12
/*
* The following 3 bits are meant only for the btree inode.
* When any of them is set, it means an error happened while writing an
@@ -194,6 +192,10 @@ struct btrfs_inode {
/* File creation time. */
struct timespec i_otime;
+ /* Hook into fs_info->delayed_iputs */
+ struct list_head delayed_iput;
+ long delayed_iput_count;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce7dec8..861d472 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -343,7 +343,7 @@ static int btrfsic_process_written_superblock(
struct btrfsic_state *state,
struct btrfsic_block *const block,
struct btrfs_super_block *const super_hdr);
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bio_end_io(struct bio *bp);
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
const struct btrfsic_block *block,
@@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup(
(((unsigned int)(dev_bytenr >> 16)) ^
((unsigned int)((uintptr_t)bdev))) &
(BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block *const b =
- list_entry(elem, struct btrfsic_block,
- collision_resolving_node);
+ struct btrfsic_block *b;
+ list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
return b;
}
@@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
((unsigned int)((uintptr_t)bdev_ref_to)) ^
((unsigned int)((uintptr_t)bdev_ref_from))) &
(BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block_link *const l =
- list_entry(elem, struct btrfsic_block_link,
- collision_resolving_node);
+ struct btrfsic_block_link *l;
+ list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
BUG_ON(NULL == l->block_ref_to);
BUG_ON(NULL == l->block_ref_from);
if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
@@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
const unsigned int hashval =
(((unsigned int)((uintptr_t)bdev)) &
(BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_dev_state *const ds =
- list_entry(elem, struct btrfsic_dev_state,
- collision_resolving_node);
+ struct btrfsic_dev_state *ds;
+ list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
if (ds->bdev == bdev)
return ds;
}
@@ -667,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
- return -1;
+ return -ENOMEM;
}
list_for_each_entry(device, dev_head, dev_list) {
@@ -845,8 +833,8 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
- " @%llu (%s/%llu/%d)\n",
+ btrfs_info_in_rcu(device->dev_root->fs_info,
+ "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
dev_state->name, dev_bytenr,
@@ -1660,7 +1648,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
sizeof(*block_ctx->pagev)) *
num_pages, GFP_NOFS);
if (!block_ctx->mem_to_free)
- return -1;
+ return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
for (i = 0; i < num_pages; i++) {
@@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state,
static void btrfsic_dump_database(struct btrfsic_state *state)
{
- struct list_head *elem_all;
+ const struct btrfsic_block *b_all;
BUG_ON(NULL == state);
printk(KERN_INFO "all_blocks_list:\n");
- list_for_each(elem_all, &state->all_blocks_list) {
- const struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *elem_ref_from;
+ list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
+ const struct btrfsic_block_link *l;
printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
b_all->logical_bytenr, b_all->dev_state->name,
b_all->dev_bytenr, b_all->mirror_num);
- list_for_each(elem_ref_to, &b_all->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" refers %u* to"
" %c @%llu (%s/%llu/%d)\n",
@@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
l->block_ref_to->mirror_num);
}
- list_for_each(elem_ref_from, &b_all->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from,
- struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" is ref %u* from"
" %c @%llu (%s/%llu/%d)\n",
@@ -1845,8 +1819,7 @@ again:
&state->block_hashtable);
if (NULL != block) {
u64 bytenr = 0;
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
+ struct btrfsic_block_link *l, *tmp;
if (block->is_superblock) {
bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
@@ -1967,13 +1940,8 @@ again:
* because it still carries valueable information
* like whether it was ever written and IO completed.
*/
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &block->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry_safe(l, tmp, &block->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
l->ref_cnt--;
@@ -2207,7 +2175,7 @@ continue_loop:
goto again;
}
-static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+static void btrfsic_bio_end_io(struct bio *bp)
{
struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
int iodone_w_error;
@@ -2215,7 +2183,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
/* mutex is not held! This is not save if IO is not yet completed
* on umount */
iodone_w_error = 0;
- if (bio_error_status)
+ if (bp->bi_error)
iodone_w_error = 1;
BUG_ON(NULL == block);
@@ -2230,7 +2198,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
printk(KERN_INFO
"bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
- bio_error_status,
+ bp->bi_error,
btrfsic_get_block_type(dev_state->state, block),
block->logical_bytenr, dev_state->name,
block->dev_bytenr, block->mirror_num);
@@ -2252,7 +2220,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
block = next_block;
} while (NULL != block);
- bp->bi_end_io(bp, bio_error_status);
+ bp->bi_end_io(bp);
}
static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
@@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
struct btrfsic_block *const block,
int recursion_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int ret = 0;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
@@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack
* space is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock(
const struct btrfsic_block *block,
int recursion_level)
{
- struct list_head *elem_ref_from;
+ const struct btrfsic_block_link *l;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
/* refer to comment at "abort cyclic linkage (case 1)" */
@@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock(
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_from, &block->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from, struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int indent_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int indent_add;
static char buf[80];
int cursor_position;
@@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
}
cursor_position = indent_level;
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
while (cursor_position < indent_level) {
printk(" ");
cursor_position++;
@@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root,
void btrfsic_unmount(struct btrfs_root *root,
struct btrfs_fs_devices *fs_devices)
{
- struct list_head *elem_all;
- struct list_head *tmp_all;
+ struct btrfsic_block *b_all, *tmp_all;
struct btrfsic_state *state;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root,
* just free all memory that was allocated dynamically.
* Free the blocks and the block_links.
*/
- list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
- struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
-
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &b_all->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
+ list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
+ all_blocks_node) {
+ struct btrfsic_block_link *l, *tmp;
+ list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ce62324..c473c42 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -97,10 +97,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
static struct bio *compressed_bio_alloc(struct block_device *bdev,
u64 first_byte, gfp_t gfp_flags)
{
- int nr_vecs;
-
- nr_vecs = bio_get_nr_vecs(bdev);
- return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
+ return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags);
}
static int check_compressed_csum(struct inode *inode,
@@ -152,7 +149,7 @@ fail:
* The compressed pages are freed here, and it must be run
* in process context
*/
-static void end_compressed_bio_read(struct bio *bio, int err)
+static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
@@ -160,7 +157,7 @@ static void end_compressed_bio_read(struct bio *bio, int err)
unsigned long index;
int ret;
- if (err)
+ if (bio->bi_error)
cb->errors = 1;
/* if there are more bios still pending for this compressed
@@ -210,7 +207,7 @@ csum_failed:
bio_for_each_segment_all(bvec, cb->orig_bio, i)
SetPageChecked(bvec->bv_page);
- bio_endio(cb->orig_bio, 0);
+ bio_endio(cb->orig_bio);
}
/* finally free the cb struct */
@@ -266,7 +263,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
* This also calls the writeback end hooks for the file pages so that
* metadata and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct bio *bio, int err)
+static void end_compressed_bio_write(struct bio *bio)
{
struct extent_io_tree *tree;
struct compressed_bio *cb = bio->bi_private;
@@ -274,7 +271,7 @@ static void end_compressed_bio_write(struct bio *bio, int err)
struct page *page;
unsigned long index;
- if (err)
+ if (bio->bi_error)
cb->errors = 1;
/* if there are more bios still pending for this compressed
@@ -293,7 +290,7 @@ static void end_compressed_bio_write(struct bio *bio, int err)
cb->start,
cb->start + cb->len - 1,
NULL,
- err ? 0 : 1);
+ bio->bi_error ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
@@ -485,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- page = __page_cache_alloc(mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index,
- GFP_NOFS)) {
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
page_cache_release(page);
goto next;
}
@@ -697,8 +693,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
- if (ret)
- bio_endio(comp_bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(comp_bio);
+ }
bio_put(comp_bio);
@@ -724,8 +722,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
}
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
- if (ret)
- bio_endio(comp_bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(comp_bio);
+ }
bio_put(comp_bio);
return 0;
@@ -744,11 +744,13 @@ out:
return ret;
}
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ int num_ws;
+ atomic_t alloc_ws;
+ wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
@@ -760,10 +762,10 @@ void __init btrfs_init_compress(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&comp_idle_workspace[i]);
- spin_lock_init(&comp_workspace_lock[i]);
- atomic_set(&comp_alloc_workspace[i], 0);
- init_waitqueue_head(&comp_workspace_wait[i]);
+ INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+ spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+ atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+ init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
}
}
@@ -777,38 +779,38 @@ static struct list_head *find_workspace(int type)
int cpus = num_online_cpus();
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
again:
- spin_lock(workspace_lock);
- if (!list_empty(idle_workspace)) {
- workspace = idle_workspace->next;
+ spin_lock(ws_lock);
+ if (!list_empty(idle_ws)) {
+ workspace = idle_ws->next;
list_del(workspace);
- (*num_workspace)--;
- spin_unlock(workspace_lock);
+ (*num_ws)--;
+ spin_unlock(ws_lock);
return workspace;
}
- if (atomic_read(alloc_workspace) > cpus) {
+ if (atomic_read(alloc_ws) > cpus) {
DEFINE_WAIT(wait);
- spin_unlock(workspace_lock);
- prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+ spin_unlock(ws_lock);
+ prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(alloc_ws) > cpus && !*num_ws)
schedule();
- finish_wait(workspace_wait, &wait);
+ finish_wait(ws_wait, &wait);
goto again;
}
- atomic_inc(alloc_workspace);
- spin_unlock(workspace_lock);
+ atomic_inc(alloc_ws);
+ spin_unlock(ws_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
- atomic_dec(alloc_workspace);
- wake_up(workspace_wait);
+ atomic_dec(alloc_ws);
+ wake_up(ws_wait);
}
return workspace;
}
@@ -820,27 +822,30 @@ again:
static void free_workspace(int type, struct list_head *workspace)
{
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
-
- spin_lock(workspace_lock);
- if (*num_workspace < num_online_cpus()) {
- list_add(workspace, idle_workspace);
- (*num_workspace)++;
- spin_unlock(workspace_lock);
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
+
+ spin_lock(ws_lock);
+ if (*num_ws < num_online_cpus()) {
+ list_add(workspace, idle_ws);
+ (*num_ws)++;
+ spin_unlock(ws_lock);
goto wake;
}
- spin_unlock(workspace_lock);
+ spin_unlock(ws_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
- atomic_dec(alloc_workspace);
+ atomic_dec(alloc_ws);
wake:
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
- if (waitqueue_active(workspace_wait))
- wake_up(workspace_wait);
+ if (waitqueue_active(ws_wait))
+ wake_up(ws_wait);
}
/*
@@ -852,11 +857,11 @@ static void free_workspaces(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&comp_idle_workspace[i])) {
- workspace = comp_idle_workspace[i].next;
+ while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+ workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&comp_alloc_workspace[i]);
+ atomic_dec(&btrfs_comp_ws[i].alloc_ws);
}
}
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 54114b4..769e0ff 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
} else {
@@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
}
if (buf == root->node) {
@@ -1553,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+ search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
btrfs_set_lock_blocking(parent);
@@ -1925,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
child = read_node_slot(root, mid, 0);
if (!child) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -2028,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, root, mid, left);
@@ -2246,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root,
u64 target;
u64 nread = 0;
u64 gen;
- int direction = path->reada;
struct extent_buffer *eb;
u32 nr;
u32 blocksize;
@@ -2274,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root,
nr = slot;
while (1) {
- if (direction < 0) {
+ if (path->reada == READA_BACK) {
if (nr == 0)
break;
nr--;
- } else if (direction > 0) {
+ } else if (path->reada == READA_FORWARD) {
nr++;
if (nr >= nritems)
break;
}
- if (path->reada < 0 && objectid) {
+ if (path->reada == READA_BACK && objectid) {
btrfs_node_key(node, &disk_key, nr);
if (btrfs_disk_key_objectid(&disk_key) != objectid)
break;
@@ -2491,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
free_extent_buffer(tmp);
- if (p->reada)
+ if (p->reada != READA_NONE)
reada_for_search(root, p, level, slot, key->objectid);
btrfs_release_path(p);
@@ -4938,8 +4939,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct extent_buffer *leaf;
struct btrfs_item *item;
- int last_off;
- int dsize = 0;
+ u32 last_off;
+ u32 dsize = 0;
int ret = 0;
int wret;
int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac314e..97ad9bb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+#include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -96,6 +97,9 @@ struct btrfs_ordered_sum;
/* for storing items that use the BTRFS_UUID_KEY* types */
#define BTRFS_UUID_TREE_OBJECTID 9ULL
+/* tracks free space in block groups. */
+#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
@@ -174,7 +178,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4 };
+static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 };
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
-#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
-#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
* The key defines the order in the tree, and so it also defines (optimal)
@@ -500,6 +504,8 @@ struct btrfs_super_block {
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
+#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
+
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
@@ -526,7 +532,10 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -590,14 +599,15 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
+enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
/* if there is real range locking, this locks field will change */
- int locks[BTRFS_MAX_LEVEL];
- int reada;
+ u8 locks[BTRFS_MAX_LEVEL];
+ u8 reada;
/* keep some upper locks as we walk down */
- int lowest_level;
+ u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
@@ -823,8 +833,18 @@ struct btrfs_disk_balance_args {
*/
__le64 profiles;
- /* usage filter */
- __le64 usage;
+ /*
+ * usage filter
+ * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+ * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+ */
+ union {
+ __le64 usage;
+ struct {
+ __le32 usage_min;
+ __le32 usage_max;
+ };
+ };
/* devid filter */
__le64 devid;
@@ -846,10 +866,27 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- /* BTRFS_BALANCE_ARGS_LIMIT value */
- __le64 limit;
+ /*
+ * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+ * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
+ * and maximum
+ */
+ union {
+ __le64 limit;
+ struct {
+ __le32 limit_min;
+ __le32 limit_max;
+ };
+ };
- __le64 unused[7];
+ /*
+ * Process chunks that cross stripes_min..stripes_max devices,
+ * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+ */
+ __le32 stripes_min;
+ __le32 stripes_max;
+
+ __le64 unused[6];
} __attribute__ ((__packed__));
/*
@@ -1061,6 +1098,13 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+struct btrfs_free_space_info {
+ __le32 extent_count;
+ __le32 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline u64 btrfs_qgroup_level(u64 qgroupid)
{
@@ -1154,6 +1198,10 @@ struct btrfs_space_info {
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
+
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
@@ -1228,6 +1276,9 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -1262,6 +1313,9 @@ struct btrfs_caching_control {
atomic_t count;
};
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
+
struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
@@ -1287,8 +1341,20 @@ struct btrfs_block_group_cache {
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
- u64 sectorsize;
u64 cache_generation;
+ u32 sectorsize;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
/*
* It is just used for the delayed data space allocation because
@@ -1300,7 +1366,7 @@ struct btrfs_block_group_cache {
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
- unsigned int ro:1;
+ unsigned int ro;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@@ -1344,6 +1410,15 @@ struct btrfs_block_group_cache {
struct list_head io_list;
struct btrfs_io_ctl io_ctl;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Does the block group need to be added to the free space tree?
+ * Protected by free_space_lock.
+ */
+ int needs_free_space;
};
/* delayed seq elem */
@@ -1395,6 +1470,7 @@ struct btrfs_fs_info {
struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
+ struct btrfs_root *free_space_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1518,12 +1594,6 @@ struct btrfs_fs_info {
*/
struct mutex ordered_operations_mutex;
- /*
- * Same as ordered_operations_mutex except this is for ordered extents
- * and not the operations.
- */
- struct mutex ordered_extent_flush_mutex;
-
struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -1788,6 +1858,8 @@ struct btrfs_fs_info {
* and will be latter freed. Protected by fs_info->chunk_mutex.
*/
struct list_head pinned_chunks;
+
+ int creating_free_space_tree;
};
struct btrfs_subvolume_writers {
@@ -1949,6 +2021,9 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+ /* For qgroup metadata space reserve */
+ atomic_t qgroup_meta_rsv;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2061,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+/*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+#define BTRFS_FREE_SPACE_INFO_KEY 198
+
+/*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+#define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+/*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
@@ -2151,6 +2247,9 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
+#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2175,6 +2274,18 @@ struct btrfs_ioctl_defrag_range_args {
btrfs_clear_opt(root->fs_info->mount_opt, opt); \
}
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(root, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
/*
* Requests for changes that need to be done during transaction commit.
*
@@ -2461,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags,
BTRFS_SETGET_STACK_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
/* struct btrfs_inode_ref */
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@ -3322,7 +3438,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
- return mapping_gfp_mask(mapping) & ~__GFP_FS;
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
/* extent-tree.c */
@@ -3371,6 +3487,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -3385,7 +3502,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins);
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
@@ -3404,7 +3522,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota);
+ u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
int delalloc);
@@ -3417,7 +3535,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int no_quota);
+ u64 root_objectid, u64 owner, u64 offset);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3433,10 +3551,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start,
struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3453,8 +3576,11 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3470,8 +3596,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
unsigned short type);
@@ -3495,9 +3621,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
@@ -3518,6 +3644,9 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const u64 type);
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3682,6 +3811,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->csum_root);
kfree(fs_info->quota_root);
kfree(fs_info->uuid_root);
+ kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
security_free_mnt_opts(&fs_info->security_opts);
@@ -3851,7 +3981,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
- int wait;
int delay_iput;
struct completion completion;
struct list_head list;
@@ -3859,7 +3988,7 @@ struct btrfs_delalloc_work {
};
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput);
+ int delay_iput);
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
@@ -3969,7 +4098,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
-
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff);
/* file.c */
int btrfs_auto_defrag_init(void);
@@ -4000,6 +4130,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags);
+int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4008,8 +4143,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -4043,14 +4178,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
#ifdef DEBUG
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
#else
#define btrfs_debug(fs_info, fmt, args...) \
no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
#endif
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
#ifdef CONFIG_BTRFS_ASSERT
__cold
@@ -4073,6 +4296,7 @@ __cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+const char *btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
@@ -4103,16 +4327,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
}
}
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
#define btrfs_fs_incompat(fs_info, opt) \
__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
-static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
{
struct btrfs_super_block *disk_super;
disk_super = fs_info->super_copy;
return !!(btrfs_super_incompat_flags(disk_super) & flag);
}
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+}
+
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
@@ -4130,14 +4436,7 @@ do { \
__LINE__, (errno)); \
} while (0)
-#define btrfs_std_error(fs_info, errno) \
-do { \
- if ((errno)) \
- __btrfs_std_error((fs_info), __func__, \
- __LINE__, (errno), NULL); \
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...) \
+#define btrfs_std_error(fs_info, errno, fmt, args...) \
do { \
__btrfs_std_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args); \
@@ -4185,8 +4484,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae427..0be47e4 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
atomic_set(&delayed_node->refs, 0);
- delayed_node->count = 0;
- delayed_node->flags = 0;
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
- delayed_node->index_cnt = 0;
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
- delayed_node->bytes_reserved = 0;
- memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}
static inline int btrfs_is_continuous_delayed_item(
@@ -132,7 +127,7 @@ again:
if (node)
return node;
- node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
@@ -463,6 +458,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
+
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if ((atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
waitqueue_active(&delayed_root->wait))
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81d..914ac13 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
+static bool merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref,
+ u64 seq)
+{
+ struct btrfs_delayed_ref_node *next;
+ bool done = false;
+
+ next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ while (!done && &next->list != &head->ref_list) {
+ int mod;
+ struct btrfs_delayed_ref_node *next2;
+
+ next2 = list_next_entry(next, list);
+
+ if (next == ref)
+ goto next;
+
+ if (seq && next->seq >= seq)
+ goto next;
+
+ if (next->type != ref->type)
+ goto next;
+
+ if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
+ btrfs_delayed_node_to_tree_ref(next),
+ ref->type))
+ goto next;
+ if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
+ btrfs_delayed_node_to_data_ref(next)))
+ goto next;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ swap(ref, next);
+ done = true;
+ }
+ mod = -next->ref_mod;
+ }
+
+ drop_delayed_ref(trans, delayed_refs, head, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = true;
+ } else {
+ /*
+ * Can't have multiples of the same ref on a tree block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+next:
+ next = next2;
+ }
+
+ return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_node *ref;
+ u64 seq = 0;
+
+ assert_spin_locked(&head->lock);
+
+ if (list_empty(&head->ref_list))
+ return;
+
+ /* We don't have too many refs to merge for data. */
+ if (head->is_data)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ while (&ref->list != &head->ref_list) {
+ if (seq && ref->seq >= seq)
+ goto next;
+
+ if (merge_ref(trans, delayed_refs, head, ref, seq)) {
+ if (list_empty(&head->ref_list))
+ break;
+ ref = list_first_entry(&head->ref_list,
+ struct btrfs_delayed_ref_node,
+ list);
+ continue;
+ }
+next:
+ ref = list_next_entry(ref, list);
+ }
+}
+
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
list);
/* No need to compare bytenr nor is_head */
- if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
- exist->seq != ref->seq)
+ if (exist->type != ref->type || exist->seq != ref->seq)
goto add_tail;
if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -381,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
memcpy(&existing_ref->extent_op->key,
&ref->extent_op->key,
sizeof(ref->extent_op->key));
- existing_ref->extent_op->update_key = 1;
+ existing_ref->extent_op->update_key = true;
}
if (ref->extent_op->update_flags) {
existing_ref->extent_op->flags_to_set |=
ref->extent_op->flags_to_set;
- existing_ref->extent_op->update_flags = 1;
+ existing_ref->extent_op->update_flags = true;
}
btrfs_free_delayed_extent_op(ref->extent_op);
}
@@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, int action, int is_data)
+ u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+ int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
int count_mod = 1;
int must_insert_reserved = 0;
+ /* If reserved is provided, it must be a data extent. */
+ BUG_ON(!is_data && reserved);
+
/*
* the head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
@@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ head_ref->qgroup_reserved = 0;
+ head_ref->qgroup_ref_root = 0;
/* Record qgroup extent info if provided */
if (qrecord) {
+ if (ref_root && reserved) {
+ head_ref->qgroup_ref_root = ref_root;
+ head_ref->qgroup_reserved = reserved;
+ }
+
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
+ WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+ && existing->qgroup_reserved);
update_existing_head_ref(delayed_refs, &existing->node, ref);
/*
* we've updated the existing ref, free the newly
@@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action, int no_quota)
+ int action)
{
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
- ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action, int no_quota)
+ u64 offset, int action)
{
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
- ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- no_quota = 0;
-
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 0);
+ bytenr, num_bytes, 0, 0, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, level, action,
- no_quota);
+ num_bytes, parent, ref_root, level, action);
spin_unlock(&delayed_refs->lock);
return 0;
@@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ u64 owner, u64 offset, u64 reserved, int action,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- no_quota = 0;
-
BUG_ON(extent_op && !extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
@@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 1);
+ bytenr, num_bytes, ref_root, reserved,
+ action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
- action, no_quota);
+ action);
spin_unlock(&delayed_refs->lock);
return 0;
}
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *ref_head;
+ int ret = 0;
+
+ if (!fs_info->quota_enabled || !is_fstree(ref_root))
+ return 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ if (!ref_head) {
+ ret = -ENOENT;
+ goto out;
+ }
+ WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+ ref_head->qgroup_ref_root = ref_root;
+ ref_head->qgroup_reserved = num_bytes;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6..c24b653 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
- unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -76,11 +75,11 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
+ u8 level;
+ bool update_key;
+ bool update_flags;
+ bool is_data;
u64 flags_to_set;
- int level;
- unsigned int update_key:1;
- unsigned int update_flags:1;
- unsigned int is_data:1;
};
/*
@@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
/*
+ * For qgroup reserved space freeing.
+ *
+ * ref_root and reserved will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * run_delayed_refs() time.
+ */
+ u64 qgroup_ref_root;
+ u64 qgroup_reserved;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota);
+ u64 owner, u64 offset, u64 reserved, int action,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564a7de..1e668fb 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found:
}
out:
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
return ret;
}
@@ -328,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- /*
- * Here we commit the transaction to make sure commit_total_bytes
- * of all the devices are updated.
- */
- trans = btrfs_attach_transaction(root);
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- } else if (PTR_ERR(trans) != -ENOENT) {
- return PTR_ERR(trans);
- }
-
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -357,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
if (ret)
return ret;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
+ }
+
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -376,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
- ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
- if (ret)
- btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s started",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -402,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
+ ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
btrfs_wait_ordered_roots(root->fs_info, -1);
/* force writing the updated state information to disk */
@@ -455,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- if (waitqueue_active(&fs_info->replace_wait))
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -524,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- printk_in_rcu(KERN_ERR
- "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ btrfs_err_in_rcu(root->fs_info,
+ "btrfs_scrub_dev(%s, %llu, %s) failed %d",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -541,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
return scrub_ret;
}
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s finished",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -587,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
@@ -810,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data)
progress = status_args->status.progress_1000;
kfree(status_args);
progress = div_u64(progress, 10);
- printk_in_rcu(KERN_INFO
- "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ btrfs_info_in_rcu(fs_info,
+ "continuing dev_replace from %s (devid %llu) to %s @%u%%",
dev_replace->srcdev->missing ? "<missing disk>" :
rcu_str_deref(dev_replace->srcdev->name),
dev_replace->srcdev->devid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f556c37..e99ccd6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -319,9 +320,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_WARNING
- "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
- "level %d\n",
+ btrfs_warn_rl(fs_info,
+ "%s checksum verify failed on %llu wanted %X found %X "
+ "level %d",
fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
@@ -362,15 +363,15 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state);
+ &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
- printk_ratelimited(KERN_ERR
- "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
- eb->fs_info->sb->s_id, eb->start,
+ btrfs_err_rl(eb->fs_info,
+ "parent transid verify failed on %llu wanted %llu found %llu",
+ eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -629,15 +630,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
- "%llu %llu\n",
- eb->fs_info->sb->s_id, found_start, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+ found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root->fs_info, eb)) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
- eb->fs_info->sb->s_id, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+ eb->start);
ret = -EIO;
goto err;
}
@@ -703,7 +703,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
return -EIO; /* we fixed nothing */
}
-static void end_workqueue_bio(struct bio *bio, int err)
+static void end_workqueue_bio(struct bio *bio)
{
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
@@ -711,7 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
btrfs_work_func_t func;
fs_info = end_io_wq->info;
- end_io_wq->error = err;
+ end_io_wq->error = bio->bi_error;
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
@@ -802,13 +802,17 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
/* If an error occured we just want to clean up the bio and move on */
if (async->error) {
- bio_endio(async->bio, async->error);
+ async->bio->bi_error = async->error;
+ bio_endio(async->bio);
return;
}
@@ -908,8 +912,10 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* submission context. Just jump into btrfs_map_bio
*/
ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
- if (ret)
- bio_endio(bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
return ret;
}
@@ -918,7 +924,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (cpu_has_xmm4_2)
+ if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
@@ -960,10 +966,13 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
__btree_submit_bio_done);
}
- if (ret) {
+ if (ret)
+ goto out_w_error;
+ return 0;
+
out_w_error:
- bio_endio(bio, ret);
- }
+ bio->bi_error = ret;
+ bio_endio(bio);
return ret;
}
@@ -1259,6 +1268,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
+ atomic_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1641,6 +1651,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
return fs_info->uuid_root ? fs_info->uuid_root :
ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return fs_info->free_space_root ? fs_info->free_space_root :
+ ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
@@ -1724,6 +1737,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
+ bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
return 0;
}
@@ -1735,16 +1749,15 @@ static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
struct btrfs_end_io_wq *end_io_wq;
- int error;
end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
- error = end_io_wq->error;
+ bio->bi_error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
- bio_endio(bio, error);
+ bio_endio(bio);
}
static int cleaner_kthread(void *arg)
@@ -1753,6 +1766,7 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
+ set_freezable();
do {
again = 0;
@@ -2138,6 +2152,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->uuid_root);
if (chunk_root)
free_root_extent_buffers(info->chunk_root);
+ free_root_extent_buffers(info->free_space_root);
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2342,8 +2357,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "BTRFS: log replay required "
- "on RO media\n");
+ btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2358,12 +2372,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
if (IS_ERR(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
kfree(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_err(fs_info, "failed to read log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
return -EIO;
@@ -2371,7 +2385,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_error(tree_root->fs_info, ret,
+ btrfs_std_error(tree_root->fs_info, ret,
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2439,6 +2453,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
fs_info->uuid_root = root;
}
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
+ }
+
return 0;
}
@@ -2566,7 +2589,7 @@ int open_ctree(struct super_block *sb,
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2608,7 +2631,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->ordered_operations_mutex);
- mutex_init(&fs_info->ordered_extent_flush_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2648,8 +2670,8 @@ int open_ctree(struct super_block *sb,
* Read super block and check the signature bytes only
*/
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh) {
- err = -EINVAL;
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
goto fail_alloc;
}
@@ -2660,6 +2682,7 @@ int open_ctree(struct super_block *sb,
if (btrfs_check_super_csum(bh->b_data)) {
printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
+ brelse(bh);
goto fail_alloc;
}
@@ -2801,7 +2824,7 @@ int open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+ SZ_4M / PAGE_CACHE_SIZE);
tree_root->nodesize = nodesize;
tree_root->sectorsize = sectorsize;
@@ -2842,6 +2865,8 @@ int open_ctree(struct super_block *sb,
!extent_buffer_uptodate(chunk_root->node)) {
printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
+ if (!IS_ERR(chunk_root->node))
+ free_extent_buffer(chunk_root->node);
chunk_root->node = NULL;
goto fail_tree_roots;
}
@@ -2880,6 +2905,8 @@ retry_root_backup:
!extent_buffer_uptodate(tree_root->node)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
tree_root->node = NULL;
goto recovery_tree_root;
}
@@ -2928,7 +2955,7 @@ retry_root_backup:
goto fail_fsdev_sysfs;
}
- ret = btrfs_sysfs_add_one(fs_info);
+ ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
goto fail_fsdev_sysfs;
@@ -2950,8 +2977,9 @@ retry_root_backup:
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(sb->s_flags & MS_RDONLY)) {
- printk(KERN_WARNING "BTRFS: "
- "too many missing devices, writeable mount is not allowed\n");
+ pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+ fs_info->fs_devices->missing_devices,
+ fs_info->num_tolerated_disk_barrier_failures);
goto fail_sysfs;
}
@@ -3038,6 +3066,18 @@ retry_root_backup:
if (sb->s_flags & MS_RDONLY)
return 0;
+ if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: creating free space tree\n");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
@@ -3063,6 +3103,18 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
+ if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: clearing free space tree\n");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to clear free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
@@ -3107,7 +3159,7 @@ fail_cleaner:
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
fail_sysfs:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
fail_fsdev_sysfs:
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3169,8 +3221,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
- "I/O error on %s\n",
+ btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+ "lost page write due to IO error on %s",
rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
@@ -3182,6 +3234,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *super;
+ u64 bytenr;
+
+ bytenr = btrfs_sb_offset(copy_num);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ return -EINVAL;
+
+ bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+ /*
+ * If we fail to read from the underlying devices, as of now
+ * the best option we have is to mark it EIO.
+ */
+ if (!bh)
+ return -EIO;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ btrfs_super_magic(super) != BTRFS_MAGIC) {
+ brelse(bh);
+ return -EINVAL;
+ }
+
+ *bh_ret = bh;
+ return 0;
+}
+
+
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
struct buffer_head *bh;
@@ -3189,7 +3272,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
struct btrfs_super_block *super;
int i;
u64 transid = 0;
- u64 bytenr;
+ int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3197,21 +3280,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
+ ret = btrfs_read_dev_one_super(bdev, i, &bh);
+ if (ret)
continue;
super = (struct btrfs_super_block *)bh->b_data;
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- continue;
- }
if (!latest || btrfs_super_generation(super) > transid) {
brelse(latest);
@@ -3221,6 +3294,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
brelse(bh);
}
}
+
+ if (!latest)
+ return ERR_PTR(ret);
+
return latest;
}
@@ -3289,8 +3366,9 @@ static int write_dev_supers(struct btrfs_device *device,
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
- printk(KERN_ERR "BTRFS: couldn't get super "
- "buffer head for bytenr %Lu\n", bytenr);
+ btrfs_err(device->dev_root->fs_info,
+ "couldn't get super buffer head for bytenr %llu",
+ bytenr);
errors++;
continue;
}
@@ -3324,10 +3402,8 @@ static int write_dev_supers(struct btrfs_device *device,
* endio for the write_dev_flush, this will wake anyone waiting
* for the barrier when it is done
*/
-static void btrfs_end_empty_barrier(struct bio *bio, int err)
+static void btrfs_end_empty_barrier(struct bio *bio)
{
- if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
@@ -3355,8 +3431,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
- if (!bio_flagged(bio, BIO_UPTODATE)) {
- ret = -EIO;
+ if (bio->bi_error) {
+ ret = bio->bi_error;
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
@@ -3439,6 +3515,35 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
return 0;
}
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
+{
+ int raid_type;
+ int min_tolerated = INT_MAX;
+
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+ (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[BTRFS_RAID_SINGLE].
+ tolerated_failures);
+
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (raid_type == BTRFS_RAID_SINGLE)
+ continue;
+ if (!(flags & btrfs_raid_group[raid_type]))
+ continue;
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[raid_type].
+ tolerated_failures);
+ }
+
+ if (min_tolerated == INT_MAX) {
+ pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
+ min_tolerated = 0;
+ }
+
+ return min_tolerated;
+}
+
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info)
{
@@ -3448,13 +3553,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
BTRFS_BLOCK_GROUP_SYSTEM,
BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
- int num_types = 4;
int i;
int c;
int num_tolerated_disk_barrier_failures =
(int)fs_info->fs_devices->num_devices;
- for (i = 0; i < num_types; i++) {
+ for (i = 0; i < ARRAY_SIZE(types); i++) {
struct btrfs_space_info *tmp;
sinfo = NULL;
@@ -3472,44 +3576,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
down_read(&sinfo->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
- if (!list_empty(&sinfo->block_groups[c])) {
- u64 flags;
-
- btrfs_get_block_group_info(
- &sinfo->block_groups[c], &space);
- if (space.total_bytes == 0 ||
- space.used_bytes == 0)
- continue;
- flags = space.flags;
- /*
- * return
- * 0: if dup, single or RAID0 is configured for
- * any of metadata, system or data, else
- * 1: if RAID5 is configured, or if RAID1 or
- * RAID10 is configured and only two mirrors
- * are used, else
- * 2: if RAID6 is configured, else
- * num_mirrors - 1: if RAID1 or RAID10 is
- * configured and more than
- * 2 mirrors are used.
- */
- if (num_tolerated_disk_barrier_failures > 0 &&
- ((flags & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID0)) ||
- ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
- == 0)))
- num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1) {
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- num_tolerated_disk_barrier_failures = 1;
- } else if (flags &
- BTRFS_BLOCK_GROUP_RAID6) {
- num_tolerated_disk_barrier_failures = 2;
- }
- }
- }
+ u64 flags;
+
+ if (list_empty(&sinfo->block_groups[c]))
+ continue;
+
+ btrfs_get_block_group_info(&sinfo->block_groups[c],
+ &space);
+ if (space.total_bytes == 0 || space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+
+ num_tolerated_disk_barrier_failures = min(
+ num_tolerated_disk_barrier_failures,
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ flags));
}
up_read(&sinfo->groups_sem);
}
@@ -3544,7 +3625,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
@@ -3584,7 +3665,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3602,7 +3683,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3738,6 +3819,9 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* wait for the qgroup rescan worker to stop */
+ btrfs_qgroup_wait_for_completion(fs_info);
+
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al., set sem back to initial state */
@@ -3760,6 +3844,13 @@ void close_ctree(struct btrfs_root *root)
cancel_work_sync(&fs_info->async_reclaim_work);
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ /*
+ * If the cleaner thread is stopped and there are
+ * block groups queued for removal, the deletion will be
+ * skipped when we quit the cleaner thread.
+ */
+ btrfs_delete_unused_bgs(root->fs_info);
+
ret = btrfs_commit_super(root);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
@@ -3781,7 +3872,7 @@ void close_ctree(struct btrfs_root *root)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -3850,11 +3941,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
return !ret;
}
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
-{
- return set_extent_buffer_uptodate(buf);
-}
-
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_root *root;
@@ -3910,7 +3996,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
- return;
}
void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -4279,25 +4364,6 @@ again:
return 0;
}
-static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
-}
-
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4309,7 +4375,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index d4cbfee..8e79d00 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,7 +19,7 @@
#ifndef __DISKIO__
#define __DISKIO__
-#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
#define BTRFS_SUPER_MIRROR_MAX 3
@@ -35,7 +35,7 @@ enum btrfs_wq_endio_type {
static inline u64 btrfs_sb_offset(int mirror)
{
- u64 start = 16 * 1024;
+ u64 start = SZ_16K;
if (mirror)
return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
return BTRFS_SUPER_INFO_OFFSET;
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr);
@@ -114,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
@@ -139,6 +140,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
int __init btrfs_end_io_wq_init(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 8d05220..2513a7f 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -112,11 +112,11 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
u32 generation;
if (fh_type == FILEID_BTRFS_WITH_PARENT) {
- if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
+ if (fh_len < BTRFS_FID_SIZE_CONNECTABLE)
return NULL;
root_objectid = fid->root_objectid;
} else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
- if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
return NULL;
root_objectid = fid->parent_root_objectid;
} else
@@ -136,11 +136,11 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
u32 generation;
if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
- fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+ fh_len < BTRFS_FID_SIZE_CONNECTABLE) &&
(fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
- fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+ fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
(fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
- fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+ fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE))
return NULL;
objectid = fid->objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf..60cc139 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
@@ -95,8 +96,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota);
+ int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -125,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -332,13 +332,34 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
kfree(ctl);
}
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ u64 start = block_group->key.objectid;
+ u64 len = block_group->key.offset;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ root->nodesize : root->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
* since their free space will be released as soon as the transaction commits.
*/
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
{
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -375,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
return total_added;
}
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
- struct btrfs_caching_control *caching_ctl;
struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -387,19 +407,28 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 total_found = 0;
u64 last = 0;
u32 nritems;
- int ret = -ENOMEM;
+ int ret;
+ bool wakeup = true;
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
extent_root = fs_info->extent_root;
path = btrfs_alloc_path();
if (!path)
- goto out;
+ return -ENOMEM;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(extent_root, block_group))
+ wakeup = false;
+#endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -408,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
- mutex_lock(&caching_ctl->mutex);
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto err;
+ goto out;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -441,17 +466,20 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
}
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
- goto err;
+ goto out;
if (ret)
break;
leaf = path->nodes[0];
@@ -464,7 +492,8 @@ next:
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -489,9 +518,10 @@ next:
else
last = key.objectid + key.offset;
- if (total_found > (1024 * 1024 * 2)) {
+ if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
- wake_up(&caching_ctl->wait);
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
@@ -503,25 +533,58 @@ next:
block_group->key.offset);
caching_ctl->progress = (u64)-1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_FINISHED;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
-err:
- btrfs_free_path(path);
- up_read(&fs_info->commit_root_sem);
-
- free_excluded_extents(extent_root, block_group);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ u64 bytes_used;
- mutex_unlock(&caching_ctl->mutex);
-out:
- if (ret) {
+ spin_lock(&block_group->space_info->lock);
spin_lock(&block_group->lock);
- block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_ERROR;
+ bytes_used = block_group->key.offset -
+ btrfs_block_group_used(&block_group->item);
+ block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(extent_root, block_group);
}
+#endif
+
+ caching_ctl->progress = (u64)-1;
+
+ up_read(&fs_info->commit_root_sem);
+ free_excluded_extents(fs_info->extent_root, block_group);
+ mutex_unlock(&caching_ctl->mutex);
+
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
@@ -607,6 +670,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
}
spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(fs_info->extent_root,
+ cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(fs_info->extent_root, cache);
+ }
+#endif
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
@@ -617,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
} else {
/*
- * We are not going to do the fast caching, set cached to the
- * appropriate value and wakeup any waiters.
+ * We're either using the free space tree or no caching at all.
+ * Set cached to the appropriate value and wakeup any waiters.
*/
spin_lock(&cache->lock);
if (load_cache_only) {
@@ -1316,8 +1395,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
@@ -1883,10 +1961,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_issue_discard(struct block_device *bdev,
- u64 start, u64 len)
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+ u64 *discarded_bytes)
{
- return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+ int j, ret = 0;
+ u64 bytes_left, end;
+ u64 aligned_start = ALIGN(start, 1 << 9);
+
+ if (WARN_ON(start != aligned_start)) {
+ len -= aligned_start - start;
+ len = round_down(len, 1 << 9);
+ start = aligned_start;
+ }
+
+ *discarded_bytes = 0;
+
+ if (!len)
+ return 0;
+
+ end = start + len;
+ bytes_left = len;
+
+ /* Skip any superblocks on this device. */
+ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+ u64 sb_start = btrfs_sb_offset(j);
+ u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+ u64 size = sb_start - start;
+
+ if (!in_range(sb_start, start, bytes_left) &&
+ !in_range(sb_end, start, bytes_left) &&
+ !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+ continue;
+
+ /*
+ * Superblock spans beginning of range. Adjust start and
+ * try again.
+ */
+ if (sb_start <= start) {
+ start += sb_end - start;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ continue;
+ }
+
+ if (size) {
+ ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += size;
+ else if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ start = sb_end;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ }
+
+ if (bytes_left) {
+ ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += bytes_left;
+ }
+ return ret;
}
int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1907,14 +2052,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ u64 bytes;
if (!stripe->dev->can_discard)
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
stripe->physical,
- stripe->length);
+ stripe->length,
+ &bytes);
if (!ret)
- discarded_bytes += stripe->length;
+ discarded_bytes += bytes;
else if (ret != -EOPNOTSUPP)
break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
@@ -1941,8 +2088,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset,
- int no_quota)
+ u64 root_objectid, u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1954,12 +2100,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ BTRFS_ADD_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ num_bytes, parent, root_objectid,
+ owner, offset, 0,
+ BTRFS_ADD_DELAYED_REF, NULL);
}
return ret;
}
@@ -1980,16 +2126,12 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
- no_quota = 1;
-
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
@@ -2015,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
@@ -2128,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
path, 0, 1);
@@ -2223,8 +2365,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins,
- node->no_quota);
+ ref->level, &ins);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node,
parent, ref_root,
@@ -2277,6 +2418,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(root->fs_info,
+ head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
@@ -2365,7 +2511,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2760,11 +2920,15 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
+ bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
return 0;
+ if (root->fs_info->creating_free_space_tree)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
@@ -2776,6 +2940,7 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
+ trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, root, count);
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
@@ -2825,6 +2990,7 @@ again:
}
out:
assert_qgroups_uptodate(trans);
+ trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -2841,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return -ENOMEM;
extent_op->flags_to_set = flags;
- extent_op->update_flags = 1;
- extent_op->update_key = 0;
- extent_op->is_data = is_data ? 1 : 0;
+ extent_op->update_flags = true;
+ extent_op->update_key = false;
+ extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
@@ -3038,7 +3204,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, int);
+ u64, u64, u64, u64, u64, u64);
if (btrfs_test_is_dummy_root(root))
@@ -3079,15 +3245,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, 1);
+ key.offset);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0,
- 1);
+ parent, ref_root, level - 1, 0);
if (ret)
goto fail;
}
@@ -3182,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * 1024 * 1024)) {
+ if (block_group->key.offset < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -3268,28 +3433,47 @@ again:
spin_unlock(&block_group->lock);
/*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, SZ_256M);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
if (!ret)
dcs = BTRFS_DC_SETUP;
- btrfs_free_reserved_data_space(inode, num_pages);
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+ btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -3519,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
/*
- * We don't need the lock here since we are protected by the transaction
- * commit. We want to do the cache_save_setup first and then run the
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
+ spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
@@ -3535,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
/*
@@ -3547,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* on any pending IO
*/
list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
@@ -3571,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+ * a very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = write_one_cache_group(trans, root, path,
+ cache);
+ }
if (ret)
btrfs_abort_transaction(trans, root, ret);
}
@@ -3578,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
+ spin_unlock(&cur_trans->dirty_bgs_lock);
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -3674,10 +3892,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- if (total_bytes > 0)
- found->full = 0;
- else
- found->full = 1;
+ found->full = 0;
+ found->max_extent_size = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3754,7 +3970,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
- u64 tmp;
+ u64 raid_type;
+ u64 allowed = 0;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3772,31 +3989,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
spin_unlock(&root->fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
- if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5);
- if (num_devices < 3)
- flags &= ~BTRFS_BLOCK_GROUP_RAID6;
- if (num_devices < 4)
- flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
- tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
- flags &= ~tmp;
-
- if (tmp & BTRFS_BLOCK_GROUP_RAID6)
- tmp = BTRFS_BLOCK_GROUP_RAID6;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
- tmp = BTRFS_BLOCK_GROUP_RAID5;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
- tmp = BTRFS_BLOCK_GROUP_RAID10;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
- tmp = BTRFS_BLOCK_GROUP_RAID1;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
- tmp = BTRFS_BLOCK_GROUP_RAID0;
-
- return extended_to_chunk(flags | tmp);
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_group[raid_type];
+ }
+ allowed &= flags;
+
+ if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3835,11 +4047,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3938,7 +4146,8 @@ commit_trans:
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
- trans->transaction->have_free_bgs ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
@@ -3960,38 +4169,86 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- ret = btrfs_qgroup_reserve(root, write_bytes);
- if (ret)
- goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
-out:
spin_unlock(&data_sinfo->lock);
return ret;
}
/*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precious data reservation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Use new btrfs_qgroup_reserve_data to reserve precious data space
+ *
+ * TODO: Find a good method to avoid reserve data space for NOCOW
+ * range, but don't impact performance on quota disable case.
+ */
+ ret = btrfs_qgroup_reserve_data(inode, start, len);
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- WARN_ON(data_sinfo->bytes_may_use < bytes);
- data_sinfo->bytes_may_use -= bytes;
+ if (WARN_ON(data_sinfo->bytes_may_use < len))
+ data_sinfo->bytes_may_use = 0;
+ else
+ data_sinfo->bytes_may_use -= len;
trace_btrfs_space_reservation(root->fs_info, "space_info",
- data_sinfo->flags, bytes, 0);
+ data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, start, len);
+}
+
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -4035,14 +4292,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- thresh = max_t(u64, 64 * 1024 * 1024,
- div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (num_bytes - num_allocated < thresh)
return 1;
}
- if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -4241,7 +4497,8 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ if (trans->can_flush_pending_bgs &&
+ trans->chunk_bytes_reserved >= (u64)SZ_2M) {
btrfs_create_pending_block_groups(trans, trans->root);
btrfs_trans_release_chunk_metadata(trans);
}
@@ -4339,7 +4596,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
return nr;
}
-#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
* shrink metadata reservation for delalloc
@@ -4544,8 +4801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim;
- to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
- 16 * 1024 * 1024);
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL)) {
@@ -4556,8 +4812,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
- if (can_overcommit(root, space_info, 1024 * 1024,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4822,13 +5077,9 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->csum_root && trans->adding_csums)
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->uuid_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ (root == root->fs_info->csum_root && trans->adding_csums) ||
+ (root == root->fs_info->uuid_root))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -5117,7 +5368,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
- block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -5271,7 +5522,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
- ret = btrfs_qgroup_reserve(root, num_bytes);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5289,10 +5540,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
- if (ret) {
- if (*qgroup_reserved)
- btrfs_qgroup_free(root, *qgroup_reserved);
- }
+ if (ret && *qgroup_reserved)
+ btrfs_qgroup_free_meta(root, *qgroup_reserved);
return ret;
}
@@ -5453,15 +5702,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve_meta(root,
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5584,41 +5833,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
* @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
*
* This will do the following things
*
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ * also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
* o add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
*
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
- if (ret) {
- btrfs_free_reserved_data_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, start, len);
+ if (ret < 0)
return ret;
- }
-
- return 0;
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, start, len);
+ return ret;
}
/**
* btrfs_delalloc_release_space - release data and metadata space for delalloc
* @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
*
* This must be matched with a call to btrfs_delalloc_reserve_space. This is
* called in the case that we don't need the metadata AND data reservations
@@ -5627,11 +5883,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
- btrfs_delalloc_release_metadata(inode, num_bytes);
- btrfs_free_reserved_data_space(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -5708,19 +5965,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
- /*
- * No longer have used bytes in this block group, queue
- * it for deletion.
- */
- if (old_val == 0) {
- spin_lock(&info->unused_bgs_lock);
- if (list_empty(&cache->bg_list)) {
- btrfs_get_block_group(cache);
- list_add_tail(&cache->bg_list,
- &info->unused_bgs);
- }
- spin_unlock(&info->unused_bgs_lock);
- }
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5732,6 +5976,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+ * dirty list to avoid races between cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -5996,6 +6256,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+ u64 *empty_cluster)
+{
+ struct btrfs_free_cluster *ret = NULL;
+ bool ssd = btrfs_test_opt(root, SSD);
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (ssd)
+ *empty_cluster = SZ_2M;
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &root->fs_info->meta_alloc_cluster;
+ if (!ssd)
+ *empty_cluster = SZ_64K;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+ ret = &root->fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+}
+
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
const bool return_free_space)
{
@@ -6003,7 +6291,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
@@ -6012,8 +6303,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
+ total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(root,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
}
len = cache->key.objectid + cache->key.offset - start;
@@ -6026,12 +6323,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
}
start += len;
+ total_unpinned += len;
space_info = cache->space_info;
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+ * cluster to be created inside of it go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
@@ -6062,20 +6374,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- if (trans->aborted)
- return 0;
-
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
- while (1) {
+ while (!trans->aborted) {
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
@@ -6094,6 +6405,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
+ /*
+ * Transaction is finished. We don't need the lock anymore. We
+ * do need to clean up the block groups in case of a transaction
+ * abort.
+ */
+ deleted_bgs = &trans->transaction->deleted_bgs;
+ list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+ u64 trimmed = 0;
+
+ ret = -EROFS;
+ if (!trans->aborted)
+ ret = btrfs_discard_extent(root,
+ block_group->key.objectid,
+ block_group->key.offset,
+ &trimmed);
+
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group_trimming(block_group);
+ btrfs_put_block_group(block_group);
+
+ if (ret) {
+ const char *errstr = btrfs_decode_error(ret);
+ btrfs_warn(fs_info,
+ "Discard failed while removing blockgroup: errno=%d %s\n",
+ ret, errstr);
+ }
+ }
+
return 0;
}
@@ -6137,7 +6476,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
- int no_quota = node->no_quota;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
@@ -6146,14 +6484,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
- if (!info->quota_enabled || !is_fstree(root_objectid))
- no_quota = 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -6349,7 +6684,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(root, path, iref));
+ extent_data_ref_count(path, iref));
if (iref) {
BUG_ON(path->slots[0] != extent_slot);
} else {
@@ -6376,6 +6711,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+ num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
@@ -6474,7 +6816,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
buf->start, buf->len,
parent, root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret); /* -ENOMEM */
}
@@ -6522,7 +6864,7 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota)
+ u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6545,13 +6887,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+ BTRFS_DROP_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF,
- NULL, no_quota);
+ offset, 0,
+ BTRFS_DROP_DELAYED_REF, NULL);
}
return ret;
}
@@ -6737,7 +7079,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
- int empty_cluster = 2 * 1024 * 1024;
+ u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
@@ -6747,6 +7089,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
+ bool orig_have_caching_bg = false;
+ bool full_search = false;
WARN_ON(num_bytes < root->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6762,36 +7106,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
/*
- * If the space info is for both data and metadata it means we have a
- * small filesystem and we can't use the clustering stuff.
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
*/
- if (btrfs_mixed_space_info(space_info))
- use_cluster = false;
-
- if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
- last_ptr = &root->fs_info->meta_alloc_cluster;
- if (!btrfs_test_opt(root, SSD))
- empty_cluster = 64 * 1024;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
- btrfs_test_opt(root, SSD)) {
- last_ptr = &root->fs_info->data_alloc_cluster;
+ if (unlikely(space_info->max_extent_size)) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
}
+ last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ hint_byte = last_ptr->window_start;
+ use_cluster = false;
+ }
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
-
- if (!last_ptr)
- empty_cluster = 0;
-
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
@@ -6826,6 +7181,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
search:
have_caching_bg = false;
+ if (index == 0 || index == __get_raid_index(flags))
+ full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
@@ -6859,6 +7216,7 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
+ have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -6873,7 +7231,7 @@ have_block_group:
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
- if (last_ptr) {
+ if (last_ptr && use_cluster) {
struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
@@ -6999,6 +7357,16 @@ refill_cluster:
}
unclustered_alloc:
+ /*
+ * We are doing an unclustered alloc, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get
+ * more space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
@@ -7031,8 +7399,6 @@ unclustered_alloc:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
- if (!cached)
- have_caching_bg = true;
goto loop;
}
checks:
@@ -7073,6 +7439,10 @@ loop:
}
up_read(&space_info->groups_sem);
+ if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+ && !orig_have_caching_bg)
+ orig_have_caching_bg = true;
+
if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
goto search;
@@ -7089,7 +7459,20 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- loop++;
+ if (loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any unached bgs and we've alrelady done a
+ * full search through.
+ */
+ if (orig_have_caching_bg || !full_search)
+ loop = LOOP_CACHING_WAIT;
+ else
+ loop = LOOP_ALLOC_CHUNK;
+ } else {
+ loop++;
+ }
+
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
@@ -7107,6 +7490,15 @@ loop:
ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ loop = LOOP_NO_EMPTY_SIZE;
+
/*
* Do not bail out on ENOSPC since we
* can do more things.
@@ -7123,6 +7515,15 @@ loop:
}
if (loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (empty_size == 0 &&
+ empty_cluster == 0) {
+ ret = -ENOSPC;
+ goto out;
+ }
empty_size = 0;
empty_cluster = 0;
}
@@ -7131,11 +7532,20 @@ loop:
} else if (!ins->objectid) {
ret = -ENOSPC;
} else if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
ret = 0;
}
out:
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = max_extent_size;
+ spin_unlock(&space_info->lock);
ins->offset = max_extent_size;
+ }
return ret;
}
@@ -7184,7 +7594,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
- bool final_tried = false;
+ bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
@@ -7319,6 +7729,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ins->offset);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7333,8 +7748,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota)
+ int level, struct btrfs_key *ins)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7400,6 +7814,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ num_bytes);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7415,7 +7834,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins)
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins)
{
int ret;
@@ -7424,7 +7844,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
ins->offset, 0,
root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+ ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+ NULL);
return ret;
}
@@ -7480,7 +7901,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
- btrfs_set_buffer_uptodate(buf);
+ set_extent_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
@@ -7567,9 +7988,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
/*
* finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
* returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -7629,19 +8047,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- if (skinny_metadata)
- extent_op->update_key = 0;
- else
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
+ extent_op->update_key = skinny_metadata ? false : true;
+ extent_op->update_flags = true;
+ extent_op->is_data = false;
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
ins.objectid, ins.offset,
parent, root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
+ extent_op);
if (ret)
goto out_free_delayed;
}
@@ -7757,21 +8172,47 @@ reada:
}
/*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
*/
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_qgroup_extent_record *qrecord;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+ if (!qrecord)
+ return -ENOMEM;
+
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ kfree(qrecord);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
+
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type;
+ int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
+ /* We can be called directly from walk_up_proc() */
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
@@ -7790,6 +8231,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -7858,8 +8303,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7937,6 +8380,11 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = record_one_subtree_extent(trans, root, child_bytenr,
+ root->nodesize);
+ if (ret)
+ goto out;
}
if (level == 0) {
@@ -8182,14 +8630,15 @@ skip:
ret = account_shared_subtree(trans, root, next,
generation, level - 1);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "Error "
"%d accounting shared subtree. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0, 0);
+ root->root_key.objectid, level - 1, 0);
BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
@@ -8274,10 +8723,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = account_leaf_items(trans, root, eb);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "error "
"%d accounting leaf items. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
/* make block locked assertion in clean_tree_block happy */
@@ -8576,7 +9026,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
- btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+ btrfs_add_dropped_root(trans, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
@@ -8599,7 +9049,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err);
+ btrfs_std_error(root->fs_info, err, NULL);
return err;
}
@@ -8723,14 +9173,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 min_allocable_bytes;
int ret = -ENOSPC;
-
/*
* We need some metadata space and system metadata space for
* allocating chunks in some corner cases until we force to set
@@ -8739,7 +9188,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
if ((sinfo->flags &
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
!force)
- min_allocable_bytes = 1 * 1024 * 1024;
+ min_allocable_bytes = SZ_1M;
else
min_allocable_bytes = 0;
@@ -8747,6 +9196,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
spin_lock(&cache->lock);
if (cache->ro) {
+ cache->ro++;
ret = 0;
goto out;
}
@@ -8758,7 +9208,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- cache->ro = 1;
+ cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
@@ -8768,7 +9218,7 @@ out:
return ret;
}
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
@@ -8776,8 +9226,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
u64 alloc_flags;
int ret;
- BUG_ON(cache->ro);
-
again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -8789,7 +9237,7 @@ again:
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (trans->transaction->dirty_bg_run) {
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -8820,7 +9268,7 @@ again:
goto out;
}
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8828,7 +9276,7 @@ again:
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8891,7 +9339,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -8901,11 +9349,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo->bytes_readonly -= num_bytes;
- cache->ro = 0;
- list_del_init(&cache->ro_list);
+ if (!--cache->ro) {
+ num_bytes = cache->key.offset - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ list_del_init(&cache->ro_list);
+ }
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -9270,6 +9720,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
cache->full_stripe_len = btrfs_full_stripe_len(root,
&root->fs_info->mapping_tree,
start);
+ set_free_space_tree_thresholds(cache);
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -9281,6 +9733,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
+ mutex_init(&cache->free_space_lock);
return cache;
}
@@ -9305,7 +9758,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (btrfs_test_opt(root, SPACE_CACHE) &&
@@ -9421,7 +9874,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid)) {
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
} else if (btrfs_block_group_used(&cache->item) == 0) {
spin_lock(&info->unused_bgs_lock);
/* Should always be true but just in case. */
@@ -9449,11 +9902,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_RAID0],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_SINGLE],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -9471,7 +9924,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
+ bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
+ trans->can_flush_pending_bgs = false;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
if (ret)
goto next;
@@ -9489,9 +9944,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ add_block_group_free_space(trans, root->fs_info, block_group);
+ /* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
}
+ trans->can_flush_pending_bgs = can_flush_pending_bgs;
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -9518,6 +9976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->needs_free_space = 1;
ret = exclude_super_stripes(root, cache);
if (ret) {
/*
@@ -9534,6 +9993,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(root, cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(root, cache);
+ }
+#endif
/*
* Call to ensure the corresponding space_info object is created and
* assigned to our block group, but don't update its counters just yet.
@@ -9834,6 +10301,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
* physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
*/
remove_em = (atomic_read(&block_group->trimming) == 0);
/*
@@ -9875,6 +10347,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
unlock_chunks(root);
+ ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+ if (ret)
+ goto out;
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -9890,6 +10366,47 @@ out:
return ret;
}
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = (struct map_lookup *)em->bdev;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -9908,26 +10425,30 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
u64 start, end;
+ int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
- space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
- block_group->ro) {
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -9941,7 +10462,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
/* We don't want to force the issue, only flip if it's ok. */
- ret = set_block_group_ro(block_group, 0);
+ ret = inc_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
@@ -9952,10 +10473,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- /* 1 for btrfs_orphan_reserve_metadata() */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
if (IS_ERR(trans)) {
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
goto next;
}
@@ -9982,14 +10503,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10007,16 +10528,47 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(root, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10066,10 +10618,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
return unpin_extent_range(root, start, end, false);
}
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space. Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time. We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses. For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+ u64 minlen, u64 *trimmed)
+{
+ u64 start = 0, len = 0;
+ int ret;
+
+ *trimmed = 0;
+
+ /* Not writeable = nothing to do. */
+ if (!device->writeable)
+ return 0;
+
+ /* No free space = nothing to do. */
+ if (device->total_bytes <= device->bytes_used)
+ return 0;
+
+ ret = 0;
+
+ while (1) {
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_transaction *trans;
+ u64 bytes;
+
+ ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+ if (ret)
+ return ret;
+
+ down_read(&fs_info->commit_root_sem);
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ret = find_free_dev_extent_start(trans, device, minlen, start,
+ &start, &len);
+ if (trans)
+ btrfs_put_transaction(trans);
+
+ if (ret) {
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (ret)
+ break;
+
+ start += len;
+ *trimmed += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_device *device;
+ struct list_head *devices;
u64 group_trimmed;
u64 start;
u64 end;
@@ -10124,6 +10765,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
cache = next_block_group(fs_info->tree_root, cache);
}
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ devices = &root->fs_info->fs_devices->alloc_list;
+ list_for_each_entry(device, devices, dev_alloc_list) {
+ ret = btrfs_trim_free_extents(device, range->minlen,
+ &group_trimmed);
+ if (ret)
+ break;
+
+ trimmed += group_trimmed;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
range->len = trimmed;
return ret;
}
@@ -10140,8 +10793,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
- * Make sure counter is updated before we wake up
- * waiters.
+ * Make sure counter is updated before we wake up waiters.
*/
smp_mb();
if (waitqueue_active(&root->subv_writers->wait))
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
deleted file mode 100644
index e69de29..0000000
--- a/fs/btrfs/extent-tree.h
+++ /dev/null
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 02d0581..2e7c97a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
inode = tree->mapping->host;
isize = i_size_read(inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- printk_ratelimited(KERN_DEBUG
- "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
caller, btrfs_ino(inode), isize, start, end);
}
}
@@ -131,6 +131,25 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return;
+ if (set && (state->state & bits) == bits)
+ return;
+ if (!set && (state->state & bits) == 0)
+ return;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ /* ENOMEM */
+ BUG_ON(ret < 0);
+}
+
static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, unsigned *bits,
+ struct extent_changeset *changeset);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
state->start = start;
state->end = end;
- set_state_bits(tree, state, bits);
+ set_state_bits(tree, state, bits, changeset);
node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake)
+ unsigned *bits, int wake,
+ struct extent_changeset *changeset)
{
struct extent_state *next;
unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
+ add_extent_changeset(state, bits_to_clear, changeset, 0);
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -594,7 +616,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -671,7 +693,8 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake,
+ changeset);
goto next;
}
goto search_again;
@@ -692,13 +715,13 @@ hit_next:
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
@@ -718,7 +741,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -789,7 +812,7 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
+ add_extent_changeset(state, bits_to_set, changeset, 1);
state->state |= bits_to_set;
}
@@ -835,7 +859,7 @@ static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask)
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -850,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
prealloc = alloc_extent_state(mask);
BUG_ON(!prealloc);
}
@@ -873,7 +897,7 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -899,7 +923,7 @@ hit_next:
goto out;
}
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -1008,7 +1032,7 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
@@ -1028,7 +1052,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask);
+ cached_state, mask, NULL);
}
@@ -1076,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1111,7 +1135,7 @@ again:
goto out;
}
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0,
+ NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0);
+ clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
@@ -1253,81 +1278,49 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
first_iteration = false;
goto again;
}
/* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, bits, NULL,
- NULL, mask);
-}
-
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
-}
-
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE,
- NULL, cached_state, mask);
-}
-
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, mask);
-}
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+ changeset);
}
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
- NULL, mask);
+ return __clear_extent_bit(tree, start, end, bits, wake, delete,
+ cached, mask, NULL);
}
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, mask);
+ return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+ changeset);
}
/*
@@ -1335,15 +1328,15 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached_state)
+ struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS);
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1354,18 +1347,13 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, 0, NULL);
-}
-
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS);
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1375,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
return 1;
}
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- mask);
-}
-
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- GFP_NOFS);
-}
-
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1401,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1418,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
/*
* helper function to set both pages and extents in the tree writeback
*/
-static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
unsigned long index = start >> PAGE_CACHE_SHIFT;
unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1437,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- return 0;
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1737,7 +1709,7 @@ again:
BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1757,7 +1729,7 @@ out_failed:
return found;
}
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
@@ -1772,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (page_ops == 0)
- return 0;
+ return;
if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
mapping_set_error(inode->i_mapping, -EIO);
@@ -1806,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
index += ret;
cond_resched();
}
- return 0;
}
/*
@@ -2078,8 +2049,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
- printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
btrfs_ino(inode), start,
rcu_str_deref(dev->name), sector);
bio_put(bio);
@@ -2453,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
/* lots and lots of room for performance fixes in the end_bio funcs */
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
@@ -2474,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
ret = ret < 0 ? ret : -EIO;
mapping_set_error(page->mapping, ret);
}
- return 0;
}
/*
@@ -2486,7 +2456,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct bio *bio, int err)
+static void end_bio_extent_writepage(struct bio *bio)
{
struct bio_vec *bvec;
u64 start;
@@ -2516,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (end_extent_writepage(page, err, start, end))
- continue;
-
+ end_extent_writepage(page, bio->bi_error, start, end);
end_page_writeback(page);
}
@@ -2548,10 +2516,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct bio *bio, int err)
+static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ int uptodate = !bio->bi_error;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree;
u64 offset = 0;
@@ -2564,16 +2532,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
int ret;
int i;
- if (err)
- uptodate = 0;
-
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
- io_bio->mirror_num);
+ "mirror=%u\n", (u64)bio->bi_iter.bi_sector,
+ bio->bi_error, io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
/* We always issue full-page reads, but if some block
@@ -2614,8 +2579,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (tree->ops && tree->ops->readpage_io_failed_hook) {
ret = tree->ops->readpage_io_failed_hook(page, mirror);
- if (!ret && !err &&
- test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (!ret && !bio->bi_error)
uptodate = 1;
} else {
/*
@@ -2631,10 +2595,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
ret = bio_readpage_error(bio, offset, page, start, end,
mirror);
if (ret == 0) {
- uptodate =
- test_bit(BIO_UPTODATE, &bio->bi_flags);
- if (err)
- uptodate = 0;
+ uptodate = !bio->bi_error;
offset += len;
continue;
}
@@ -2684,7 +2645,7 @@ readpage_ok:
endio_readpage_release_extent(tree, extent_start, extent_len,
uptodate);
if (io_bio->end_io)
- io_bio->end_io(io_bio, err);
+ io_bio->end_io(io_bio, bio->bi_error);
bio_put(bio);
}
@@ -2730,6 +2691,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
btrfs_bio->csum = NULL;
btrfs_bio->csum_allocated = NULL;
btrfs_bio->end_io = NULL;
+
+#ifdef CONFIG_BLK_CGROUP
+ /* FIXME, put this into bio_clone_bioset */
+ if (bio->bi_css)
+ bio_associate_blkcg(new, bio->bi_css);
+#endif
}
return new;
}
@@ -2790,6 +2757,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
}
static int submit_extent_page(int rw, struct extent_io_tree *tree,
+ struct writeback_control *wbc,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
struct block_device *bdev,
@@ -2798,13 +2766,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_end_io_t end_io_func,
int mirror_num,
unsigned long prev_bio_flags,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ bool force_bio_submit)
{
int ret = 0;
struct bio *bio;
- int nr;
int contig = 0;
- int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
@@ -2816,6 +2783,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
contig = bio_end_sector(bio) == sector;
if (prev_bio_flags != bio_flags || !contig ||
+ force_bio_submit ||
merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
@@ -2826,21 +2794,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
}
bio = NULL;
} else {
+ if (wbc)
+ wbc_account_io(wbc, page, page_size);
return 0;
}
}
- if (this_compressed)
- nr = BIO_MAX_PAGES;
- else
- nr = bio_get_nr_vecs(bdev);
- bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+ bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
+ GFP_NOFS | __GFP_HIGH);
if (!bio)
return -ENOMEM;
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, page, page_size);
+ }
if (bio_ret)
*bio_ret = bio;
@@ -2909,7 +2880,8 @@ static int __do_readpage(struct extent_io_tree *tree,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int rw,
+ u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -2957,6 +2929,7 @@ static int __do_readpage(struct extent_io_tree *tree,
}
while (cur <= end) {
unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ bool force_bio_submit = false;
if (cur >= last_byte) {
char *userpage;
@@ -3007,6 +2980,49 @@ static int __do_readpage(struct extent_io_tree *tree,
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
+
+ /*
+ * If we have a file range that points to a compressed extent
+ * and it's followed by a consecutive file range that points to
+ * to the same compressed extent (possibly with a different
+ * offset and/or length, so it either points to the whole extent
+ * or only part of it), we must make sure we do not submit a
+ * single bio to populate the pages for the 2 ranges because
+ * this makes the compressed extent read zero out the pages
+ * belonging to the 2nd range. Imagine the following scenario:
+ *
+ * File layout
+ * [0 - 8K] [8K - 24K]
+ * | |
+ * | |
+ * points to extent X, points to extent X,
+ * offset 4K, length of 8K offset 0, length 16K
+ *
+ * [extent X, compressed length = 4K uncompressed length = 16K]
+ *
+ * If the bio to read the compressed extent covers both ranges,
+ * it will decompress extent X into the pages belonging to the
+ * first range and then it will stop, zeroing out the remaining
+ * pages that belong to the other range that points to extent X.
+ * So here we make sure we submit 2 bios, one for the first
+ * range and another one for the third range. Both will target
+ * the same physical extent from disk, but we can't currently
+ * make the compressed bio endio callback populate the pages
+ * for both ranges because each compressed bio is tightly
+ * coupled with a single extent map, and each range can have
+ * an extent map with a different offset value relative to the
+ * uncompressed data of our extent and different lengths. This
+ * is a corner case so we prioritize correctness over
+ * non-optimal behavior (submitting 2 bios for the same extent).
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ prev_em_start && *prev_em_start != (u64)-1 &&
+ *prev_em_start != em->orig_start)
+ force_bio_submit = true;
+
+ if (prev_em_start)
+ *prev_em_start = em->orig_start;
+
free_extent_map(em);
em = NULL;
@@ -3022,8 +3038,12 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
+ if (parent_locked)
+ free_extent_state(cached);
+ else
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3051,12 +3071,13 @@ static int __do_readpage(struct extent_io_tree *tree,
}
pnr -= page->index;
- ret = submit_extent_page(rw, tree, page,
+ ret = submit_extent_page(rw, tree, NULL, page,
sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
- this_bio_flag);
+ this_bio_flag,
+ force_bio_submit);
if (!ret) {
nr++;
*bio_flags = this_bio_flag;
@@ -3083,7 +3104,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int rw,
+ u64 *prev_em_start)
{
struct inode *inode;
struct btrfs_ordered_extent *ordered;
@@ -3103,7 +3125,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], get_extent, em_cached, bio,
- mirror_num, bio_flags, rw);
+ mirror_num, bio_flags, rw, prev_em_start);
page_cache_release(pages[index]);
}
}
@@ -3113,7 +3135,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
int nr_pages, get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int rw,
+ u64 *prev_em_start)
{
u64 start = 0;
u64 end = 0;
@@ -3134,7 +3157,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
index - first_index, start,
end, get_extent, em_cached,
bio, mirror_num, bio_flags,
- rw);
+ rw, prev_em_start);
start = page_start;
end = start + PAGE_CACHE_SIZE - 1;
first_index = index;
@@ -3145,7 +3168,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
end, get_extent, em_cached, bio,
- mirror_num, bio_flags, rw);
+ mirror_num, bio_flags, rw,
+ prev_em_start);
}
static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -3171,7 +3195,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
- bio_flags, rw);
+ bio_flags, rw, NULL);
return ret;
}
@@ -3197,7 +3221,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
int ret;
ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
- &bio_flags, READ);
+ &bio_flags, READ, NULL);
if (bio)
ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
@@ -3446,11 +3470,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(write_flags, tree, page,
+ ret = submit_extent_page(write_flags, tree, wbc, page,
sector, iosize, pg_offset,
bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
- 0, 0, 0);
+ 0, 0, 0, false);
if (ret)
SetPageError(page);
}
@@ -3696,7 +3720,7 @@ static void set_btree_ioerr(struct page *page)
}
}
-static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+static void end_bio_extent_buffer_writepage(struct bio *bio)
{
struct bio_vec *bvec;
struct extent_buffer *eb;
@@ -3709,7 +3733,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ if (bio->bi_error ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page);
}
@@ -3749,10 +3774,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(rw, tree, p, offset >> 9,
+ ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
- 0, epd->bio_flags, bio_flags);
+ 0, epd->bio_flags, bio_flags, false);
epd->bio_flags = bio_flags;
if (ret) {
set_btree_ioerr(p);
@@ -4156,6 +4181,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct page *page;
struct extent_map *em_cached = NULL;
int nr = 0;
+ u64 prev_em_start = (u64)-1;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
page = list_entry(pages->prev, struct page, lru);
@@ -4172,12 +4198,12 @@ int extent_readpages(struct extent_io_tree *tree,
if (nr < ARRAY_SIZE(pagepool))
continue;
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, READ);
+ &bio, 0, &bio_flags, READ, &prev_em_start);
nr = 0;
}
if (nr)
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, READ);
+ &bio, 0, &bio_flags, READ, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
@@ -4205,7 +4231,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state);
+ lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -4265,8 +4291,8 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
- if ((mask & __GFP_WAIT) &&
- page->mapping->host->i_size > 16 * 1024 * 1024) {
+ if (gfpflags_allow_blocking(mask) &&
+ page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
len = end - start + 1;
@@ -4415,7 +4441,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4614,9 +4640,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
{
struct extent_buffer *eb = NULL;
- eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
- if (eb == NULL)
- return NULL;
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
eb->len = len;
eb->fs_info = fs_info;
@@ -4678,24 +4702,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
{
struct extent_buffer *eb;
- unsigned long len;
unsigned long num_pages;
unsigned long i;
- if (!fs_info) {
- /*
- * Called only from tests that don't always have a fs_info
- * available, but we know that nodesize is 4096
- */
- len = 4096;
- } else {
- len = fs_info->tree_root->nodesize;
- }
- num_pages = num_extent_pages(0, len);
+ num_pages = num_extent_pages(start, len);
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
@@ -4718,6 +4732,24 @@ err:
return NULL;
}
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ unsigned long len;
+
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+
+ return __alloc_dummy_extent_buffer(fs_info, start, len);
+}
+
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -4874,7 +4906,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return NULL;
for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS);
+ p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p)
goto free_eb;
@@ -5108,7 +5140,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
return was_dirty;
}
-int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5121,10 +5153,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
if (page)
ClearPageUptodate(page);
}
- return 0;
}
-int set_extent_buffer_uptodate(struct extent_buffer *eb)
+void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5136,7 +5167,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
page = eb->pages[i];
SetPageUptodate(page);
}
- return 0;
}
int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5475,6 +5505,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
+/*
+ * The extent buffer bitmap operations are done with byte granularity because
+ * bitmap items are not guaranteed to be aligned to a word and therefore a
+ * single word in a bitmap may straddle two pages in the extent buffer.
+ */
+#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BITMAP_FIRST_BYTE_MASK(start) \
+ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+#define BITMAP_LAST_BYTE_MASK(nbits) \
+ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+
+/*
+ * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+ * given bit number
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains the
+ * given bit number
+ * @page_offset: return offset into the page given by page_index
+ *
+ * This helper hides the ugliness of finding the byte in an extent buffer which
+ * contains a given bit.
+ */
+static inline void eb_bitmap_offset(struct extent_buffer *eb,
+ unsigned long start, unsigned long nr,
+ unsigned long *page_index,
+ size_t *page_offset)
+{
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t byte_offset = BIT_BYTE(nr);
+ size_t offset;
+
+ /*
+ * The byte we want is the offset of the extent buffer + the offset of
+ * the bitmap item in the extent buffer + the offset of the byte in the
+ * bitmap item.
+ */
+ offset = start_offset + start + byte_offset;
+
+ *page_index = offset >> PAGE_CACHE_SHIFT;
+ *page_offset = offset & (PAGE_CACHE_SIZE - 1);
+}
+
+/**
+ * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
+ */
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+
+ eb_bitmap_offset(eb, start, nr, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
+}
+
+/**
+ * extent_buffer_bitmap_set - set an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
+ */
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len;
+ int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+ unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_set) {
+ kaddr[offset] |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE;
+ mask_to_set = ~0U;
+ if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) {
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] |= mask_to_set;
+ }
+}
+
+
+/**
+ * extent_buffer_bitmap_clear - clear an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
+ */
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len;
+ int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
+ unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_clear) {
+ kaddr[offset] &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_BYTE;
+ mask_to_clear = ~0U;
+ if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) {
+ mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] &= ~mask_to_clear;
+ }
+}
+
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5514,13 +5693,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu dst len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus src_offset %lu move "
+ "len %lu dst len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -5560,13 +5741,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+ "len %lu len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+ "len %lu len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset < src_offset) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36..0377413 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include "ulist.h"
/* bits for the extent state */
#define EXTENT_DIRTY (1U << 0)
@@ -18,6 +19,7 @@
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -161,6 +163,17 @@ struct extent_buffer {
#endif
};
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+ /* How many bytes are set/cleared in this operation */
+ u64 bytes_changed;
+
+ /* Changed ranges */
+ struct ulist *range_changed;
+};
+
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
@@ -186,12 +199,14 @@ int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask);
+ struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL);
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
@@ -208,33 +223,105 @@ void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ GFP_NOFS);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+}
+
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, mask);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+}
+
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
+ NULL, cached_state, mask);
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
@@ -263,8 +350,10 @@ void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -309,19 +398,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+void set_extent_buffer_uptodate(struct extent_buffer *eb);
+void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
unsigned long *map_len);
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
@@ -338,7 +433,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
int mirror_num);
int clean_io_failure(struct inode *inode, u64 start, struct page *page,
unsigned int pg_offset);
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 58ece65..a67e1c8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
}
if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
- path->reada = 2;
+ path->reada = READA_FORWARD;
WARN_ON(bio->bi_vcnt <= 0);
@@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (search_commit) {
path->skip_locking = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b823fac9..83d7859 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -756,8 +756,16 @@ next_slot:
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid > ino ||
- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(del_nr == 0);
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -776,8 +784,8 @@ next_slot:
btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
} else {
- WARN_ON(1);
- extent_end = search_start;
+ /* can't happen */
+ BUG();
}
/*
@@ -847,7 +855,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 1);
+ start - extent_offset);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -925,7 +933,7 @@ delete_extent_item:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset, 0);
+ extent_offset);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -1204,7 +1212,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 1);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1231,7 +1239,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
@@ -1248,7 +1256,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
@@ -1283,7 +1291,8 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos,
+static int prepare_uptodate_page(struct inode *inode,
+ struct page *page, u64 pos,
bool force_uptodate)
{
int ret = 0;
@@ -1298,6 +1307,10 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
unlock_page(page);
return -EIO;
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
}
return 0;
}
@@ -1316,6 +1329,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
int faili;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
if (!pages[i]) {
@@ -1325,13 +1339,17 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos,
+ err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (i == num_pages - 1)
- err = prepare_uptodate_page(pages[i],
+ if (!err && i == num_pages - 1)
+ err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
+ if (err == -EAGAIN) {
+ err = 0;
+ goto again;
+ }
faili = i - 1;
goto fail;
}
@@ -1376,7 +1394,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos, 0, cached_state);
+ start_pos, last_pos, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
@@ -1469,7 +1487,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
- unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
@@ -1485,8 +1502,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!pages)
return -ENOMEM;
- first_index = pos >> PAGE_CACHE_SHIFT;
-
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1525,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
- if (ret == -ENOSPC &&
- (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+ if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret < 0)
+ break;
if (ret > 0) {
+ /*
+ * For nodata cow case, no need to reserve
+ * data space.
+ */
only_release_metadata = true;
/*
* our prealloc extent may be smaller than
@@ -1524,20 +1544,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = 0;
- } else {
- ret = -ENOSPC;
+ goto reserve_metadata;
}
}
-
- if (ret)
+ ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ if (ret < 0)
break;
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
- reserve_bytes);
+ btrfs_free_reserved_data_space(inode, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1603,12 +1622,17 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- if (only_release_metadata)
+ if (only_release_metadata) {
btrfs_delalloc_release_metadata(inode,
release_bytes);
- else
- btrfs_delalloc_release_space(inode,
+ } else {
+ u64 __pos;
+
+ __pos = round_down(pos, root->sectorsize) +
+ (dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode, __pos,
release_bytes);
+ }
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1684,7 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, release_bytes);
+ btrfs_delalloc_release_space(inode, pos, release_bytes);
}
}
@@ -1868,8 +1892,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
- const u64 len = end - start + 1;
+ u64 len;
+ /*
+ * The range length can be represented by u64, we have to do the typecasts
+ * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
+ */
+ len = (u64)end - (u64)start + 1;
trace_btrfs_sync_file(file, datasync);
/*
@@ -2057,8 +2086,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
}
if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start,
- end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
btrfs_end_transaction(trans, root);
goto out;
@@ -2266,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 drop_end;
int ret = 0;
int err = 0;
- int rsv_count;
+ unsigned int rsv_count;
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
@@ -2370,7 +2398,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
/*
@@ -2488,6 +2516,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
/*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= lockend)
+ drop_end = lockend + 1;
+ /*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
@@ -2541,17 +2582,61 @@ out_only_mutex:
return err;
}
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+ struct falloc_range *prev = NULL;
+ struct falloc_range *range = NULL;
+
+ if (list_empty(head))
+ goto insert;
+
+ /*
+ * As fallocate iterate by bytenr order, we only need to check
+ * the last range.
+ */
+ prev = list_entry(head->prev, struct falloc_range, list);
+ if (prev->start + prev->len == start) {
+ prev->len += len;
+ return 0;
+ }
+insert:
+ range = kmalloc(sizeof(*range), GFP_NOFS);
+ if (!range)
+ return -ENOMEM;
+ range->start = start;
+ range->len = len;
+ list_add_tail(&range->list, head);
+ return 0;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct falloc_range *range;
+ struct falloc_range *tmp;
+ struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+ u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2567,11 +2652,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
/*
- * Make sure we have enough space before we do the
- * allocation.
+ * Only trigger disk allocation, don't trigger qgroup reserve
+ *
+ * For qgroup space, it will be checked later.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
- if (ret)
+ ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+ if (ret < 0)
return ret;
mutex_lock(&inode->i_mutex);
@@ -2579,12 +2665,19 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ /*
+ * TODO: Move these two operations after we have checked
+ * accurate reserved space, or fallocate can still fail but
+ * with page truncated or size expanded.
+ *
+ * But that's a minor problem and won't do much harm BTW.
+ */
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
if (ret)
goto out;
- } else {
+ } else if (offset + len > inode->i_size) {
/*
* If we are fallocating from the end of the file onward we
* need to zero out the end of the page if i_size lands in the
@@ -2612,7 +2705,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state);
+ locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -2637,10 +2730,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
+ /* First, check if we exceed the qgroup limit */
+ INIT_LIST_HEAD(&reserve_list);
cur_offset = alloc_start;
while (1) {
- u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2746,82 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- } else if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end,
- NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans, root);
- else
- ret = btrfs_end_transaction(trans,
- root);
+ ret = add_falloc_range(&reserve_list, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
}
+ ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0)
+ break;
}
free_extent_map(em);
- if (ret < 0)
- break;
-
cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
+ if (cur_offset >= alloc_end)
break;
+ }
+
+ /*
+ * If ret is still 0, means we're OK to fallocate.
+ * Or just cleanup the list and exit.
+ */
+ list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+ if (!ret)
+ ret = btrfs_prealloc_file_range(inode, mode,
+ range->start,
+ range->len, 1 << inode->i_blkbits,
+ offset + len, &alloc_hint);
+ list_del(&range->list);
+ kfree(range);
+ }
+ if (ret < 0)
+ goto out_unlock;
+
+ if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size and the inode item.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans, root);
}
}
+out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
out:
+ /*
+ * As we waited the extent range, the data_rsv_map must be empty
+ * in the range, as written data range will be released from it.
+ * And for prealloacted extent, it will also be released when
+ * its metadata is written.
+ * So this is completely used as cleanup.
+ */
+ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - alloc_start);
return ret;
}
@@ -2734,7 +2852,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
while (start < inode->i_size) {
@@ -2816,6 +2934,9 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .copy_file_range = btrfs_copy_file_range,
+ .clone_file_range = btrfs_clone_file_range,
+ .dedupe_file_range = btrfs_dedupe_file_range,
};
void btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fb5a6b1..8f835bf 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -30,7 +30,7 @@
#include "volumes.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+#define MAX_CACHE_BYTES_PER_GIG SZ_32K
struct btrfs_trim_range {
u64 start;
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) &
- ~(__GFP_FS | __GFP_HIGHMEM));
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
gen = io_ctl->cur;
if (le64_to_cpu(*gen) != generation) {
- printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
- "(%Lu) does not match inode (%Lu)\n", *gen,
- generation);
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "space cache generation (%llu) does not match inode (%llu)",
+ *gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
- printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
- "space cache\n");
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "csum mismatch on free space cache");
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -891,7 +891,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now",
block_group->key.objectid);
}
@@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root,
static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
int ret;
/* Write out the bitmaps */
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
-
+ list_for_each_entry_safe(entry, next, bitmap_list, list) {
ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
return -ENOSPC;
@@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode)
static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
+ list_for_each_entry_safe(entry, next, bitmap_list, list)
list_del_init(&entry->list);
- }
}
static void noinline_for_stack
@@ -1215,7 +1209,7 @@ out:
* @offset - the offset for the key we'll insert
*
* This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
+ * on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
* at or below 32k, so we need to adjust how much memory we allow to be
* used by extent based free space tracking
*/
- if (size < 1024 * 1024 * 1024)
+ if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
- max_bytes = MAX_CACHE_BYTES_PER_GIG *
- div_u64(size, 1024 * 1024 * 1024);
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
/*
* we want to account for 1 more bitmap than what we have so we can make
@@ -1730,7 +1723,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
*/
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
- u64 *bytes)
+ u64 *bytes, bool for_alloc)
{
unsigned long found_bits = 0;
unsigned long max_bits = 0;
@@ -1738,11 +1731,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
unsigned long next_zero;
unsigned long extent_bits;
+ /*
+ * Skip searching the bitmap if we don't have a contiguous section that
+ * is large enough for this allocation.
+ */
+ if (for_alloc &&
+ bitmap_info->max_extent_size &&
+ bitmap_info->max_extent_size < *bytes) {
+ *bytes = bitmap_info->max_extent_size;
+ return -1;
+ }
+
i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+ if (for_alloc && bits == 1) {
+ found_bits = 1;
+ break;
+ }
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
extent_bits = next_zero - i;
@@ -1762,6 +1770,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
*bytes = (u64)(max_bits) * ctl->unit;
+ bitmap_info->max_extent_size = *bytes;
return -1;
}
@@ -1813,7 +1822,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (entry->bitmap) {
u64 size = *bytes;
- ret = search_bitmap(ctl, entry, &tmp, &size);
+ ret = search_bitmap(ctl, entry, &tmp, &size, true);
if (!ret) {
*offset = tmp;
*bytes = size;
@@ -1874,7 +1883,8 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
- ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+ false);
if (ret < 0 || search_start != *offset)
return -EINVAL;
@@ -1919,7 +1929,7 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
- &search_bytes);
+ &search_bytes, false);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
@@ -1943,6 +1953,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
+ /*
+ * We set some bytes, we have no idea what the max extent size is
+ * anymore.
+ */
+ info->max_extent_size = 0;
+
return bytes_to_set;
}
@@ -1951,12 +1967,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group_cache *block_group = ctl->private;
+ bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
+ block_group))
+ forced = true;
+#endif
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
- if (ctl->free_extents < ctl->extents_thresh) {
+ if (!forced && ctl->free_extents < ctl->extents_thresh) {
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
@@ -1986,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_space_op = {
+static const struct btrfs_free_space_op free_space_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2459,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
* track of free space, and if we pass 1/2 of that we want to
* start converting things over to using bitmaps
*/
- ctl->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
+ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
/*
@@ -2661,7 +2683,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
if (search_bytes > *max_extent_size)
*max_extent_size = search_bytes;
@@ -2775,6 +2797,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long want_bits;
unsigned long min_bits;
unsigned long found_bits;
+ unsigned long max_bits = 0;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
@@ -2784,6 +2807,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
want_bits = bytes_to_bits(bytes, ctl->unit);
min_bits = bytes_to_bits(min_bytes, ctl->unit);
+ /*
+ * Don't bother looking for a cluster in this bitmap if it's heavily
+ * fragmented.
+ */
+ if (entry->max_extent_size &&
+ entry->max_extent_size < cont1_bytes)
+ return -ENOSPC;
again:
found_bits = 0;
for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2791,13 +2821,19 @@ again:
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
+ if (found_bits > max_bits)
+ max_bits = found_bits;
break;
}
+ if (next_zero - i > max_bits)
+ max_bits = next_zero - i;
i = next_zero;
}
- if (!found_bits)
+ if (!found_bits) {
+ entry->max_extent_size = (u64)max_bits * ctl->unit;
return -ENOSPC;
+ }
if (!total_found) {
start = i;
@@ -2928,7 +2964,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry;
+ struct btrfs_free_space *entry = NULL;
int ret = -ENOSPC;
u64 bitmap_offset = offset_to_bitmap(ctl, offset);
@@ -2939,8 +2975,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* The bitmap that covers offset won't be in the list unless offset
* is just its start offset.
*/
- entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
- if (entry->offset != bitmap_offset) {
+ if (!list_empty(bitmaps))
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+
+ if (!entry || entry->offset != bitmap_offset) {
entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (entry && list_empty(&entry->list))
list_add(&entry->list, bitmaps);
@@ -3056,6 +3094,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root = RB_ROOT;
cluster->max_size = 0;
+ cluster->fragmented = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
@@ -3223,7 +3262,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
}
bytes = minlen;
- ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3272,35 +3311,23 @@ next:
return ret;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
{
- int ret;
+ atomic_inc(&cache->trimming);
+}
- *trimmed = 0;
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
spin_lock(&block_group->lock);
- if (block_group->removed) {
- spin_unlock(&block_group->lock);
- return 0;
- }
- atomic_inc(&block_group->trimming);
+ cleanup = (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed);
spin_unlock(&block_group->lock);
- ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
- if (ret)
- goto out;
-
- ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-out:
- spin_lock(&block_group->lock);
- if (atomic_dec_and_test(&block_group->trimming) &&
- block_group->removed) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- spin_unlock(&block_group->lock);
-
+ if (cleanup) {
lock_chunks(block_group->fs_info->chunk_root);
em_tree = &block_group->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
@@ -3324,10 +3351,31 @@ out:
* this block group have left 1 entry each one. Free them.
*/
__btrfs_remove_free_space_cache(block_group->free_space_ctl);
- } else {
+ }
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
spin_unlock(&block_group->lock);
+ return 0;
}
+ btrfs_get_block_group_trimming(block_group);
+ spin_unlock(&block_group->lock);
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ goto out;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+ btrfs_put_block_group_trimming(block_group);
return ret;
}
@@ -3367,7 +3415,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
u64 count = 1;
int ret;
- ret = search_bitmap(ctl, entry, &offset, &count);
+ ret = search_bitmap(ctl, entry, &offset, &count, true);
/* Logic error; Should be empty if it can't find anything */
ASSERT(!ret);
@@ -3523,6 +3571,7 @@ again:
spin_lock(&ctl->tree_lock);
info->offset = offset;
info->bytes = bytes;
+ info->max_extent_size = 0;
ret = link_free_space(ctl, info);
spin_unlock(&ctl->tree_lock);
if (ret)
@@ -3550,6 +3599,7 @@ again:
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+
bytes -= bytes_added;
offset += bytes_added;
spin_unlock(&ctl->tree_lock);
@@ -3593,7 +3643,7 @@ have_info:
bit_off = offset;
bit_bytes = ctl->unit;
- ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
if (!ret) {
if (bit_off == offset) {
ret = 1;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a16a029..33178c4 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -23,6 +23,7 @@ struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
u64 bytes;
+ u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
};
@@ -36,7 +37,7 @@ struct btrfs_free_space_ctl {
int total_bitmaps;
int unit;
u64 start;
- struct btrfs_free_space_op *op;
+ const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
new file mode 100644
index 0000000..393e36b
--- /dev/null
+++ b/fs/btrfs/free-space-tree.c
@@ -0,0 +1,1591 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "free-space-tree.h"
+#include "transaction.h"
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+{
+ u32 bitmap_range;
+ size_t bitmap_size;
+ u64 num_bitmaps, total_bitmap_size;
+
+ /*
+ * We convert to bitmaps when the disk space required for using extents
+ * exceeds that required for using bitmaps.
+ */
+ bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
+ bitmap_range);
+ bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
+ total_bitmap_size = num_bitmaps * bitmap_size;
+ cache->bitmap_high_thresh = div_u64(total_bitmap_size,
+ sizeof(struct btrfs_item));
+
+ /*
+ * We allow for a small buffer between the high threshold and low
+ * threshold to avoid thrashing back and forth between the two formats.
+ */
+ if (cache->bitmap_high_thresh > 100)
+ cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100;
+ else
+ cache->bitmap_low_thresh = 0;
+}
+
+static int add_new_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_FREE_SPACE_INFO_KEY;
+ key.offset = block_group->key.offset;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ info = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_info);
+ btrfs_set_free_space_extent_count(leaf, info, 0);
+ btrfs_set_free_space_flags(leaf, info, 0);
+ btrfs_mark_buffer_dirty(leaf);
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+}
+
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_FREE_SPACE_INFO_KEY;
+ key.offset = block_group->key.offset;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret != 0) {
+ btrfs_warn(fs_info, "missing free space info for %llu\n",
+ block_group->key.objectid);
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_free_space_info);
+}
+
+/*
+ * btrfs_search_slot() but we're looking for the greatest key less than the
+ * passed key.
+ */
+static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_key *key, struct btrfs_path *p,
+ int ins_len, int cow)
+{
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) {
+ ASSERT(0);
+ return -EIO;
+ }
+
+ if (p->slots[0] == 0) {
+ ASSERT(0);
+ return -EIO;
+ }
+ p->slots[0]--;
+
+ return 0;
+}
+
+static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+{
+ return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
+}
+
+static unsigned long *alloc_bitmap(u32 bitmap_size)
+{
+ return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+}
+
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap;
+ char *bitmap_cursor;
+ u64 start, end;
+ u64 bitmap_range, i;
+ u32 bitmap_size, flags, expected_extent_count;
+ u32 extent_count = 0;
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ block_group->sectorsize);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+ u64 first, last;
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ first = div_u64(found_key.objectid - start,
+ block_group->sectorsize);
+ last = div_u64(found_key.objectid + found_key.offset - start,
+ block_group->sectorsize);
+ bitmap_set(bitmap, first, last - first);
+
+ extent_count++;
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ bitmap_cursor = (char *)bitmap;
+ bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+ i = start;
+ while (i < end) {
+ unsigned long ptr;
+ u64 extent_size;
+ u32 data_size;
+
+ extent_size = min(end - i, bitmap_range);
+ data_size = free_space_bitmap_size(extent_size,
+ block_group->sectorsize);
+
+ key.objectid = i;
+ key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
+ key.offset = extent_size;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ data_size);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ write_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ i += extent_size;
+ bitmap_cursor += data_size;
+ }
+
+ ret = 0;
+out:
+ vfree(bitmap);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ unsigned long *bitmap;
+ u64 start, end;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 offset;
+ u32 bitmap_size, flags, expected_extent_count;
+ int prev_bit = 0, bit, bitnr;
+ u32 extent_count = 0;
+ int done = 0, nr;
+ int ret;
+
+ bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ block_group->sectorsize);
+ bitmap = alloc_bitmap(bitmap_size);
+ if (!bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ unsigned long ptr;
+ char *bitmap_cursor;
+ u32 bitmap_pos, data_size;
+
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+
+ bitmap_pos = div_u64(found_key.objectid - start,
+ block_group->sectorsize *
+ BITS_PER_BYTE);
+ bitmap_cursor = ((char *)bitmap) + bitmap_pos;
+ data_size = free_space_bitmap_size(found_key.offset,
+ block_group->sectorsize);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ read_extent_buffer(leaf, bitmap_cursor, ptr,
+ data_size);
+
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ leaf = path->nodes[0];
+ flags = btrfs_free_space_flags(leaf, info);
+ flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ btrfs_set_free_space_flags(leaf, info, flags);
+ expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ offset = start;
+ bitnr = 0;
+ while (offset < end) {
+ bit = !!test_bit(bitnr, bitmap);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ key.objectid = extent_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = offset - extent_start;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += block_group->sectorsize;
+ bitnr++;
+ }
+ if (prev_bit == 1) {
+ key.objectid = extent_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = end - extent_start;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ vfree(bitmap);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ int new_extents)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ u32 extent_count;
+ int ret = 0;
+
+ if (new_extents == 0)
+ return 0;
+
+ info = search_free_space_info(trans, fs_info, block_group, path, 1);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+ extent_count += new_extents;
+ btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_release_path(path);
+
+ if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+ extent_count > block_group->bitmap_high_thresh) {
+ ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
+ path);
+ } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+ extent_count < block_group->bitmap_low_thresh) {
+ ret = convert_free_space_to_extents(trans, fs_info, block_group,
+ path);
+ }
+
+out:
+ return ret;
+}
+
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ unsigned long ptr, i;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(offset >= found_start && offset < found_end);
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ i = div_u64(offset - found_start, block_group->sectorsize);
+ return !!extent_buffer_test_bit(leaf, ptr, i);
+}
+
+static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ int bit)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u64 end = *start + *size;
+ u64 found_start, found_end;
+ unsigned long ptr, first, last;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(*start >= found_start && *start < found_end);
+ ASSERT(end > found_start);
+
+ if (end > found_end)
+ end = found_end;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ first = div_u64(*start - found_start, block_group->sectorsize);
+ last = div_u64(end - found_start, block_group->sectorsize);
+ if (bit)
+ extent_buffer_bitmap_set(leaf, ptr, first, last - first);
+ else
+ extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
+ btrfs_mark_buffer_dirty(leaf);
+
+ *size -= end - *start;
+ *start = end;
+}
+
+/*
+ * We can't use btrfs_next_item() in modify_free_space_bitmap() because
+ * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy
+ * tree walking in btrfs_next_leaf() anyways because we know exactly what we're
+ * looking for.
+ */
+static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *p)
+{
+ struct btrfs_key key;
+
+ if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) {
+ p->slots[0]++;
+ return 0;
+ }
+
+ btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]);
+ btrfs_release_path(p);
+
+ key.objectid += key.offset;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ return btrfs_search_prev_slot(trans, root, &key, p, 0, 1);
+}
+
+/*
+ * If remove is 1, then we are removing free space, thus clearing bits in the
+ * bitmap. If remove is 0, then we are adding free space, thus setting bits in
+ * the bitmap.
+ */
+static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size, int remove)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 end = start + size;
+ u64 cur_start, cur_size;
+ int prev_bit, next_bit;
+ int new_extents;
+ int ret;
+
+ /*
+ * Read the bit for the block immediately before the extent of space if
+ * that block is within the block group.
+ */
+ if (start > block_group->key.objectid) {
+ u64 prev_block = start - block_group->sectorsize;
+
+ key.objectid = prev_block;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = free_space_test_bit(block_group, path, prev_block);
+
+ /* The previous block may have been in the previous bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (start >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+ } else {
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = -1;
+ }
+
+ /*
+ * Iterate over all of the bitmaps overlapped by the extent of space,
+ * clearing/setting bits as required.
+ */
+ cur_start = start;
+ cur_size = size;
+ while (1) {
+ free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ !remove);
+ if (cur_size == 0)
+ break;
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * Read the bit for the block immediately after the extent of space if
+ * that block is within the block group.
+ */
+ if (end < block_group->key.objectid + block_group->key.offset) {
+ /* The next block may be in the next bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (end >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ next_bit = free_space_test_bit(block_group, path, end);
+ } else {
+ next_bit = -1;
+ }
+
+ if (remove) {
+ new_extents = -1;
+ if (prev_bit == 1) {
+ /* Leftover on the left. */
+ new_extents++;
+ }
+ if (next_bit == 1) {
+ /* Leftover on the right. */
+ new_extents++;
+ }
+ } else {
+ new_extents = 1;
+ if (prev_bit == 1) {
+ /* Merging with neighbor on the left. */
+ new_extents--;
+ }
+ if (next_bit == 1) {
+ /* Merging with neighbor on the right. */
+ new_extents--;
+ }
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+static int remove_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = -1;
+ int ret;
+
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(start >= found_start && end <= found_end);
+
+ /*
+ * Okay, now that we've found the free space extent which contains the
+ * free space that we are removing, there are four cases:
+ *
+ * 1. We're using the whole extent: delete the key we found and
+ * decrement the free space extent count.
+ * 2. We are using part of the extent starting at the beginning: delete
+ * the key we found and insert a new key representing the leftover at
+ * the end. There is no net change in the number of extents.
+ * 3. We are using part of the extent ending at the end: delete the key
+ * we found and insert a new key representing the leftover at the
+ * beginning. There is no net change in the number of extents.
+ * 4. We are using part of the extent in the middle: delete the key we
+ * found and insert two new keys representing the leftovers on each
+ * side. Where we used to have one extent, we now have two, so increment
+ * the extent count. We may need to convert the block group to bitmaps
+ * as a result.
+ */
+
+ /* Delete the existing key (cases 1-4). */
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ /* Add a key for leftovers at the beginning (cases 3 and 4). */
+ if (start > found_start) {
+ key.objectid = found_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = start - found_start;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ /* Add a key for leftovers at the end (cases 2 and 4). */
+ if (end < found_end) {
+ key.objectid = end;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = found_end - end;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ if (block_group->needs_free_space) {
+ ret = __add_block_group_free_space(trans, fs_info, block_group,
+ path);
+ if (ret)
+ return ret;
+ }
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ return modify_free_space_bitmap(trans, fs_info, block_group,
+ path, start, size, 1);
+ } else {
+ return remove_free_space_extent(trans, fs_info, block_group,
+ path, start, size);
+ }
+}
+
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ block_group = btrfs_lookup_block_group(fs_info, start);
+ if (!block_group) {
+ ASSERT(0);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mutex_lock(&block_group->free_space_lock);
+ ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
+ start, size);
+ mutex_unlock(&block_group->free_space_lock);
+
+ btrfs_put_block_group(block_group);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+static int add_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key, new_key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = 1;
+ int ret;
+
+ /*
+ * We are adding a new extent of free space, but we need to merge
+ * extents. There are four cases here:
+ *
+ * 1. The new extent does not have any immediate neighbors to merge
+ * with: add the new key and increment the free space extent count. We
+ * may need to convert the block group to bitmaps as a result.
+ * 2. The new extent has an immediate neighbor before it: remove the
+ * previous key and insert a new key combining both of them. There is no
+ * net change in the number of extents.
+ * 3. The new extent has an immediate neighbor after it: remove the next
+ * key and insert a new key combining both of them. There is no net
+ * change in the number of extents.
+ * 4. The new extent has immediate neighbors on both sides: remove both
+ * of the keys and insert a new key combining all of them. Where we used
+ * to have two extents, we now have one, so decrement the extent count.
+ */
+
+ new_key.objectid = start;
+ new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ new_key.offset = size;
+
+ /* Search for a neighbor on the left. */
+ if (start == block_group->key.objectid)
+ goto right;
+ key.objectid = start - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto right;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT(found_start < start && found_end <= start);
+
+ /*
+ * Delete the neighbor on the left and absorb it into the new key (cases
+ * 2 and 4).
+ */
+ if (found_end == start) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.objectid = found_start;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+right:
+ /* Search for a neighbor on the right. */
+ if (end == block_group->key.objectid + block_group->key.offset)
+ goto insert;
+ key.objectid = end;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto insert;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT((found_start < start && found_end <= start) ||
+ (found_start >= end && found_end > end));
+
+ /*
+ * Delete the neighbor on the right and absorb it into the new key
+ * (cases 3 and 4).
+ */
+ if (found_start == end) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+insert:
+ /* Insert the new key (cases 1-4). */
+ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ if (block_group->needs_free_space) {
+ ret = __add_block_group_free_space(trans, fs_info, block_group,
+ path);
+ if (ret)
+ return ret;
+ }
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ return modify_free_space_bitmap(trans, fs_info, block_group,
+ path, start, size, 0);
+ } else {
+ return add_free_space_extent(trans, fs_info, block_group, path,
+ start, size);
+ }
+}
+
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_path *path;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ block_group = btrfs_lookup_block_group(fs_info, start);
+ if (!block_group) {
+ ASSERT(0);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mutex_lock(&block_group->free_space_lock);
+ ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
+ size);
+ mutex_unlock(&block_group->free_space_lock);
+
+ btrfs_put_block_group(block_group);
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+/*
+ * Populate the free space tree by walking the extent tree. Operations on the
+ * extent tree that happen as a result of writes to the free space tree will go
+ * through the normal add/remove hooks.
+ */
+static int populate_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path, *path2;
+ struct btrfs_key key;
+ u64 start, end;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+ path2 = btrfs_alloc_path();
+ if (!path2) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+ if (ret)
+ goto out;
+
+ mutex_lock(&block_group->free_space_lock);
+
+ /*
+ * Iterate through all of the extent and metadata items in this block
+ * group, adding the free space between them and the free space at the
+ * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
+ * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
+ * contained in.
+ */
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out_locked;
+ ASSERT(ret == 0);
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+ while (1) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (key.objectid >= end)
+ break;
+
+ if (start < key.objectid) {
+ ret = __add_to_free_space_tree(trans, fs_info,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
+ if (ret)
+ goto out_locked;
+ }
+ start = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ start += fs_info->tree_root->nodesize;
+ else
+ start += key.offset;
+ } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ if (key.objectid != block_group->key.objectid)
+ break;
+ }
+
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ goto out_locked;
+ if (ret)
+ break;
+ }
+ if (start < end) {
+ ret = __add_to_free_space_tree(trans, fs_info, block_group,
+ path2, start, end - start);
+ if (ret)
+ goto out_locked;
+ }
+
+ ret = 0;
+out_locked:
+ mutex_unlock(&block_group->free_space_lock);
+out:
+ btrfs_free_path(path2);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *free_space_root;
+ struct btrfs_block_group_cache *block_group;
+ struct rb_node *node;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ fs_info->creating_free_space_tree = 1;
+ free_space_root = btrfs_create_tree(trans, fs_info,
+ BTRFS_FREE_SPACE_TREE_OBJECTID);
+ if (IS_ERR(free_space_root)) {
+ ret = PTR_ERR(free_space_root);
+ goto abort;
+ }
+ fs_info->free_space_root = free_space_root;
+
+ node = rb_first(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group_cache,
+ cache_node);
+ ret = populate_free_space_tree(trans, fs_info, block_group);
+ if (ret)
+ goto abort;
+ node = rb_next(node);
+ }
+
+ btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ fs_info->creating_free_space_tree = 0;
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ return 0;
+
+abort:
+ fs_info->creating_free_space_tree = 0;
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
+ return ret;
+}
+
+static int clear_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int nr;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->leave_spinning = 1;
+
+ key.objectid = 0;
+ key.type = 0;
+ key.offset = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+
+ nr = btrfs_header_nritems(path->nodes[0]);
+ if (!nr)
+ break;
+
+ path->slots[0] = 0;
+ ret = btrfs_del_items(trans, root, path, 0, nr);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *free_space_root = fs_info->free_space_root;
+ int ret;
+
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+ fs_info->free_space_root = NULL;
+
+ ret = clear_free_space_tree(trans, free_space_root);
+ if (ret)
+ goto abort;
+
+ ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key);
+ if (ret)
+ goto abort;
+
+ list_del(&free_space_root->dirty_list);
+
+ btrfs_tree_lock(free_space_root->node);
+ clean_tree_block(trans, tree_root->fs_info, free_space_root->node);
+ btrfs_tree_unlock(free_space_root->node);
+ btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
+ 0, 1);
+
+ free_extent_buffer(free_space_root->node);
+ free_extent_buffer(free_space_root->commit_root);
+ kfree(free_space_root);
+
+ ret = btrfs_commit_transaction(trans, tree_root);
+ if (ret)
+ return ret;
+
+ return 0;
+
+abort:
+ btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_end_transaction(trans, tree_root);
+ return ret;
+}
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ u64 start, end;
+ int ret;
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ block_group->needs_free_space = 0;
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path);
+ if (ret)
+ return ret;
+
+ return __add_to_free_space_tree(trans, fs_info, block_group, path,
+ block_group->key.objectid,
+ block_group->key.offset);
+}
+
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ mutex_lock(&block_group->free_space_lock);
+ if (!block_group->needs_free_space)
+ goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+
+out:
+ btrfs_free_path(path);
+ mutex_unlock(&block_group->free_space_lock);
+ if (ret)
+ btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ return ret;
+}
+
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_path *path;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ u64 start, end;
+ int done = 0, nr;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ if (block_group->needs_free_space) {
+ /* We never added this block group to the free space tree. */
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ nr++;
+ path->slots[0]--;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
+ found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+ nr++;
+ path->slots[0]--;
+ } else {
+ ASSERT(0);
+ }
+ }
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ /* Initialize to silence GCC. */
+ u64 extent_start = 0;
+ u64 end, offset;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = fs_info->free_space_root;
+
+ end = block_group->key.objectid + block_group->key.offset;
+
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ caching_ctl->progress = key.objectid;
+
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(block_group, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ total_found += add_new_free_space(block_group,
+ fs_info,
+ extent_start,
+ offset);
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+ prev_bit = bit;
+ offset += block_group->sectorsize;
+ }
+ }
+ if (prev_bit == 1) {
+ total_found += add_new_free_space(block_group, fs_info,
+ extent_start, end);
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ caching_ctl->progress = (u64)-1;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
+ struct btrfs_path *path,
+ u32 expected_extent_count)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root;
+ struct btrfs_key key;
+ u64 end;
+ u64 total_found = 0;
+ u32 extent_count = 0;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ root = fs_info->free_space_root;
+
+ end = block_group->key.objectid + block_group->key.offset;
+
+ while (1) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret)
+ break;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+ break;
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+ ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+ caching_ctl->progress = key.objectid;
+
+ total_found += add_new_free_space(block_group, fs_info,
+ key.objectid,
+ key.objectid + key.offset);
+ if (total_found > CACHING_CTL_WAKE_UP) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
+ extent_count++;
+ }
+
+ if (extent_count != expected_extent_count) {
+ btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+ block_group->key.objectid, extent_count,
+ expected_extent_count);
+ ASSERT(0);
+ ret = -EIO;
+ goto out;
+ }
+
+ caching_ctl->progress = (u64)-1;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_free_space_info *info;
+ struct btrfs_path *path;
+ u32 extent_count, flags;
+ int ret;
+
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * Just like caching_thread() doesn't want to deadlock on the extent
+ * tree, we don't want to deadlock on the free space tree.
+ */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = 1;
+
+ info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+ if (IS_ERR(info)) {
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+
+ /*
+ * We left path pointing to the free space info item, so now
+ * load_free_space_foo can just iterate through the free space tree from
+ * there.
+ */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
+ ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ else
+ ret = load_free_space_extents(caching_ctl, path, extent_count);
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
new file mode 100644
index 0000000..54ffced
--- /dev/null
+++ b/fs/btrfs/free-space-tree.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_TREE
+#define __BTRFS_FREE_SPACE_TREE
+
+/*
+ * The default size for new free space bitmap items. The last bitmap in a block
+ * group may be truncated, and none of the free space tree code assumes that
+ * existing bitmaps are this size.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
+#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+
+/* Exposed for testing. */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow);
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset);
+
+#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 265e03c..be4d22a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
name, name_len, &extref)) {
- btrfs_std_error(root->fs_info, -ENOENT);
+ btrfs_std_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582a..8b57c17 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -48,7 +48,7 @@ static int caching_kthread(void *data)
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.offset = 0;
@@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
}
}
-#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
/*
@@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_ino_op = {
+static const struct btrfs_free_space_op free_ino_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
return false;
}
-static struct btrfs_free_space_op pinned_free_ino_op = {
+static const struct btrfs_free_space_op pinned_free_ino_op = {
.recalc_thresholds = pinned_recalc_thresholds,
.use_bitmap = pinned_use_bitmap,
};
@@ -488,17 +488,17 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_space(inode, prealloc);
+ btrfs_delalloc_release_space(inode, 0, prealloc);
goto out_put;
}
- btrfs_free_reserved_data_space(inode, prealloc);
+ btrfs_free_reserved_data_space(inode, 0, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e33dff3..2478301 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
+struct btrfs_dio_data {
+ u64 outstanding_extents;
+ u64 reserve;
+ u64 unsubmitted_oe_range_start;
+ u64 unsubmitted_oe_range_end;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
-static struct extent_io_ops btrfs_extent_io_ops;
+static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
-static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
-static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
@@ -310,6 +316,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
+ /*
+ * Don't forget to free the reserved space, as for inlined extent
+ * it won't count as data extent, free them directly here.
+ * And at reserve time, it's always aligned to page size, so
+ * just free one page here.
+ */
+ btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -407,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode,
unsigned long nr_pages_ret = 0;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned long max_compressed = 128 * 1024;
- unsigned long max_uncompressed = 128 * 1024;
+ unsigned long max_compressed = SZ_128K;
+ unsigned long max_uncompressed = SZ_128K;
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
int redirty = 0;
/* if this is a small write inside eof, kick off a defrag */
- if ((end - start + 1) < 16 * 1024 &&
+ if ((end - start + 1) < SZ_16K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -423,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode,
again:
will_compress = 0;
nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
- nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+ nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
/*
* we don't want to send crud past the end of i_size through
@@ -937,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes;
/* if this is a small write inside eof, kick off defrag */
- if (num_bytes < 64 * 1024 &&
+ if (num_bytes < SZ_64K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -1096,8 +1109,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
+ /*
+ * atomic_sub_return implies a barrier for waitqueue_active
+ */
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
- 5 * 1024 * 1024 &&
+ 5 * SZ_1M &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1122,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1024;
+ int limit = 10 * SZ_1M;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
@@ -1138,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
!btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
- cur_end = min(end, start + 512 * 1024 - 1);
+ cur_end = min(end, start + SZ_512K - 1);
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
@@ -1294,8 +1310,14 @@ next_slot:
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid > ino ||
- found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ if (found_key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(found_key.objectid < ino) ||
+ found_key.type < BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
@@ -1766,7 +1788,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space_noquota(inode,
+ state->start, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
root->fs_info->delalloc_batch);
@@ -1845,8 +1868,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
int ret;
ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
- if (ret)
- bio_endio(bio, ret);
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
return ret;
}
@@ -1859,15 +1884,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
int ret = 0;
int skip_sum;
- int metadata = 0;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
if (btrfs_is_free_space_inode(inode))
- metadata = 2;
+ metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
if (!(rw & REQ_WRITE)) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1906,8 +1931,10 @@ mapit:
ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
out:
- if (ret < 0)
- bio_endio(bio, ret);
+ if (ret < 0) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
return ret;
}
@@ -1968,7 +1995,7 @@ again:
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
@@ -1985,7 +2012,8 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -2111,7 +2139,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
- btrfs_ino(inode), file_pos, &ins);
+ btrfs_ino(inode), file_pos,
+ ram_bytes, &ins);
+ /*
+ * Release the reserved range from inode dirty range map, as it is
+ * already moved into delayed_ref_head
+ */
+ btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
out:
btrfs_free_path(path);
@@ -2454,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
lock_start = backref->file_pos;
lock_end = backref->file_pos + backref->num_bytes - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- 0, &cached);
+ &cached);
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
if (ordered) {
@@ -2569,7 +2603,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
new->disk_len, 0,
backref->root_id, backref->inum,
- new->file_pos, 0); /* start - extent_offset */
+ new->file_pos); /* start - extent_offset */
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_free_path;
@@ -2595,7 +2629,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
return;
list_for_each_entry_safe(old, tmp, &new->head, list) {
- list_del(&old->list);
kfree(old);
}
kfree(new);
@@ -2820,6 +2853,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+ /*
+ * For mwrite(mmap + memset to write) case, we still reserve
+ * space for NOCOW range.
+ * As NOCOW won't cause a new delayed ref, just free the space
+ */
+ btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
@@ -2839,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state);
+ &cached_state);
ret = test_range_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
@@ -3014,8 +3055,6 @@ static int __readpage_endio_check(struct inode *inode,
char *kaddr;
u32 csum_expected;
u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
csum_expected = *(((u32 *)io_bio->csum) + icsum);
@@ -3028,9 +3067,8 @@ static int __readpage_endio_check(struct inode *inode,
kunmap_atomic(kaddr);
return 0;
zeroit:
- if (__ratelimit(&_rs))
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -3074,55 +3112,47 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
start, (size_t)(end - start + 1));
}
-struct delayed_iput {
- struct list_head list;
- struct inode *inode;
-};
-
-/* JDM: If this is fs-wide, why can't we add a pointer to
- * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct delayed_iput *delayed;
+ struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
- delayed->inode = inode;
-
spin_lock(&fs_info->delayed_iput_lock);
- list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ if (binode->delayed_iput_count == 0) {
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ } else {
+ binode->delayed_iput_count++;
+ }
spin_unlock(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_root *root)
{
- LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- struct delayed_iput *delayed;
- int empty;
-
- spin_lock(&fs_info->delayed_iput_lock);
- empty = list_empty(&fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
- if (empty)
- return;
down_read(&fs_info->delayed_iput_sem);
-
spin_lock(&fs_info->delayed_iput_lock);
- list_splice_init(&fs_info->delayed_iputs, &list);
- spin_unlock(&fs_info->delayed_iput_lock);
-
- while (!list_empty(&list)) {
- delayed = list_entry(list.next, struct delayed_iput, list);
- list_del(&delayed->list);
- iput(delayed->inode);
- kfree(delayed);
+ while (!list_empty(&fs_info->delayed_iputs)) {
+ struct btrfs_inode *inode;
+
+ inode = list_first_entry(&fs_info->delayed_iputs,
+ struct btrfs_inode, delayed_iput);
+ if (inode->delayed_iput_count) {
+ inode->delayed_iput_count--;
+ list_move_tail(&inode->delayed_iput,
+ &fs_info->delayed_iputs);
+ } else {
+ list_del_init(&inode->delayed_iput);
+ }
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ spin_lock(&fs_info->delayed_iput_lock);
}
-
+ spin_unlock(&fs_info->delayed_iput_lock);
up_read(&root->fs_info->delayed_iput_sem);
}
@@ -3319,7 +3349,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = -ENOMEM;
goto out;
}
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -3518,10 +3548,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
int scanned = 0;
if (!xattr_access) {
- xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
- xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT));
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
@@ -3654,6 +3684,35 @@ cache_index:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * We don't persist the id of the transaction where an unlink operation
+ * against the inode was last made. So here we assume the inode might
+ * have been evicted, and therefore the exact value of last_unlink_trans
+ * lost, and set it to last_trans to avoid metadata inconsistencies
+ * between the inode and its parent if the inode is fsync'ed and the log
+ * replayed. For example, in the scenario:
+ *
+ * touch mydir/foo
+ * ln mydir/foo mydir/bar
+ * sync
+ * unlink mydir/bar
+ * echo 2 > /proc/sys/vm/drop_caches # evicts inode
+ * xfs_io -c fsync mydir/foo
+ * <power failure>
+ * mount fs, triggers fsync log replay
+ *
+ * We must make sure that when we fsync our inode foo we also log its
+ * parent inode, otherwise after log replay the parent still has the
+ * dentry with the "bar" name but our inode foo has a link count of 1
+ * and doesn't have an inode ref with the name "bar" anymore.
+ *
+ * Setting last_unlink_trans to last_trans is a pessimistic approach,
+ * but it guarantees correctness at the expense of ocassional full
+ * transaction commits on fsync if our inode is a directory, or if our
+ * inode is not a directory, logging its parent unnecessarily.
+ */
+ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -3713,6 +3772,7 @@ cache_acl:
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
break;
default:
@@ -3985,9 +4045,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
*/
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- int ret;
/*
* 1 for the possible orphan item
@@ -3996,27 +4054,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- trans = btrfs_start_transaction(root, 5);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- if (PTR_ERR(trans) == -ENOSPC) {
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
- ret = btrfs_cond_migrate_bytes(root->fs_info,
- &root->fs_info->trans_block_rsv,
- num_bytes, 5);
- if (ret) {
- btrfs_end_transaction(trans, root);
- return ERR_PTR(ret);
- }
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- }
- return trans;
+ return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4184,6 +4222,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
}
+static int truncate_inline_extent(struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_key *found_key,
+ const u64 item_end,
+ const u64 new_size)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ u32 size = (u32)(new_size - found_key->offset);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
+ loff_t offset = new_size;
+ loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+
+ /*
+ * Zero out the remaining of the last page of our inline extent,
+ * instead of directly truncating our inline extent here - that
+ * would be much more complex (decompressing all the data, then
+ * compressing the truncated data, which might be bigger than
+ * the size of the inline extent, resize the extent, etc).
+ * We release the path because to get the page we might need to
+ * read the extent item from disk (data not in the page cache).
+ */
+ btrfs_release_path(path);
+ return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+ }
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(root, path, size, 1);
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ inode_sub_bytes(inode, item_end + 1 - new_size);
+
+ return 0;
+}
+
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -4237,7 +4316,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
/*
* We want to drop from the next block forward in case this new size is
@@ -4268,7 +4347,7 @@ search_again:
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
if (btrfs_should_end_transaction(trans, root)) {
err = -EAGAIN;
goto error;
@@ -4378,27 +4457,40 @@ search_again:
* special encodings
*/
if (!del_item &&
- btrfs_file_extent_compression(leaf, fi) == 0 &&
btrfs_file_extent_encryption(leaf, fi) == 0 &&
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
- u32 size = new_size - found_key.offset;
-
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- inode_sub_bytes(inode, item_end + 1 -
- new_size);
/*
- * update the ram bytes to properly reflect
- * the new size of our item
+ * Need to release path in order to truncate a
+ * compressed extent. So delete any accumulated
+ * extent items so far.
*/
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size =
- btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(root, path, size, 1);
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE && pending_del_nr) {
+ err = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans,
+ root,
+ err);
+ goto error;
+ }
+ pending_del_nr = 0;
+ }
+
+ err = truncate_inline_extent(inode, path,
+ &found_key,
+ item_end,
+ new_size);
+ if (err) {
+ btrfs_abort_transaction(trans,
+ root, err);
+ goto error;
+ }
} else if (test_bit(BTRFS_ROOT_REF_COWS,
&root->state)) {
- inode_sub_bytes(inode, item_end + 1 -
- found_key.offset);
+ inode_sub_bytes(inode, item_end + 1 - new_size);
}
}
delete:
@@ -4428,7 +4520,7 @@ delete:
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- ino, extent_offset, 0);
+ ino, extent_offset);
BUG_ON(ret);
if (btrfs_should_throttle_delayed_refs(trans, root))
btrfs_async_run_delayed_refs(root,
@@ -4498,7 +4590,7 @@ error:
btrfs_free_path(path);
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
@@ -4542,14 +4634,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode,
+ round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode,
+ round_down(from, PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -4572,7 +4667,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -4617,7 +4712,8 @@ again:
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -4702,7 +4798,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ lock_extent_bits(io_tree, hole_start, block_end - 1,
&cached_state);
ordered = btrfs_lookup_ordered_range(inode, hole_start,
block_end - hole_start);
@@ -5014,7 +5110,19 @@ static void evict_inode_truncate_pages(struct inode *inode)
end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ lock_extent_bits(io_tree, start, end, &cached_state);
+
+ /*
+ * If still has DELALLOC flag, the extent didn't reach disk,
+ * and its reserved space won't be freed by delayed_ref.
+ * So we need to free its reserved space here.
+ * (Refer to comment in btrfs_invalidatepage, case 2)
+ *
+ * Note, end is the bytenr of last byte, so we need + 1 here.
+ */
+ if (state->state & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(inode, start, end - start + 1);
+
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -5051,7 +5159,8 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (!special_file(inode->i_mode))
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(inode, 0, (u64)-1);
@@ -5194,7 +5303,6 @@ void btrfs_evict_inode(struct inode *inode)
no_delete:
btrfs_remove_delayed_node(inode);
clear_inode(inode);
- return;
}
/*
@@ -5643,7 +5751,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
@@ -6234,9 +6342,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -6374,7 +6479,7 @@ out_unlock_inode:
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
u64 index;
@@ -6400,6 +6505,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
goto fail;
}
@@ -6433,9 +6539,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
fail:
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -6580,7 +6687,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct inode *inode, struct page *page,
+ struct page *page,
size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
@@ -6677,7 +6784,7 @@ again:
* Chances are we'll be called again, so go ahead and do
* readahead
*/
- path->reada = 1;
+ path->reada = READA_FORWARD;
}
ret = btrfs_lookup_file_extent(trans, root, path,
@@ -6776,8 +6883,7 @@ next:
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, inode, page,
- pg_offset,
+ ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
if (ret) {
err = ret;
@@ -6876,8 +6982,7 @@ out:
trace_btrfs_get_extent(root, em);
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
if (trans) {
ret = btrfs_end_transaction(trans, root);
if (!err)
@@ -7274,7 +7379,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, cached_state);
+ cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
@@ -7302,25 +7407,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
} else {
- /* Screw you mmap */
- ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
- if (ret)
- break;
- ret = filemap_fdatawait_range(inode->i_mapping,
- lockstart,
- lockend);
- if (ret)
- break;
-
/*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readpages() (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not have
+ * yet a corresponding bio submitted (whence it can not
+ * complete), which makes readpages() wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
*/
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret)
- break;
+ ret = -ENOTBLK;
+ break;
}
cond_resched();
@@ -7376,6 +7477,27 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em;
}
+static void adjust_dio_outstanding_extents(struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 len)
+{
+ unsigned num_extents;
+
+ num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (dio_data->outstanding_extents) {
+ dio_data->outstanding_extents -= num_extents;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents += num_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -7383,10 +7505,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
+ struct btrfs_dio_data *dio_data = NULL;
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
- u64 *outstanding_extents = NULL;
int unlock_bits = EXTENT_LOCKED;
int ret = 0;
@@ -7404,7 +7526,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* that anything that needs to check if there's a transction doesn't get
* confused.
*/
- outstanding_extents = current->journal_info;
+ dio_data = current->journal_info;
current->journal_info = NULL;
}
@@ -7412,8 +7534,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
- return -ENOTBLK;
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+ create)) {
+ ret = -ENOTBLK;
+ goto err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
@@ -7531,22 +7656,12 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- /*
- * If we have an outstanding_extents count still set then we're
- * within our reservation, otherwise we need to adjust our inode
- * counter appropriately.
- */
- if (*outstanding_extents) {
- (*outstanding_extents)--;
- } else {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
-
- current->journal_info = outstanding_extents;
- btrfs_free_reserved_data_space(inode, len);
- set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+ btrfs_free_reserved_data_space(inode, start, len);
+ WARN_ON(dio_data->reserve < len);
+ dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
+ current->journal_info = dio_data;
}
/*
@@ -7569,8 +7684,17 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
- if (outstanding_extents)
- current->journal_info = outstanding_extents;
+err:
+ if (dio_data)
+ current->journal_info = dio_data;
+ /*
+ * Compensate the delalloc release we do in btrfs_direct_IO() when we
+ * write less data then expected, so that we don't underflow our inode's
+ * outstanding extents counter.
+ */
+ if (create && dio_data)
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+
return ret;
}
@@ -7688,13 +7812,13 @@ struct btrfs_retry_complete {
int uptodate;
};
-static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct bio_vec *bvec;
int i;
- if (err)
+ if (bio->bi_error)
goto end;
done->uptodate = 1;
@@ -7743,7 +7867,7 @@ try_again:
return 0;
}
-static void btrfs_retry_endio(struct bio *bio, int err)
+static void btrfs_retry_endio(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
@@ -7752,7 +7876,7 @@ static void btrfs_retry_endio(struct bio *bio, int err)
int ret;
int i;
- if (err)
+ if (bio->bi_error)
goto end;
uptodate = 1;
@@ -7835,12 +7959,13 @@ static int btrfs_subio_endio_read(struct inode *inode,
}
}
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static void btrfs_endio_direct_read(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct inode *inode = dip->inode;
struct bio *dio_bio;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ int err = bio->bi_error;
if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
err = btrfs_subio_endio_read(inode, io_bio, err);
@@ -7851,31 +7976,29 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
kfree(dip);
- /* If we had a csum failure make sure to clear the uptodate flag */
- if (err)
- clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
- dio_end_io(dio_bio, err);
+ dio_end_io(dio_bio, bio->bi_error);
if (io_bio->end_io)
io_bio->end_io(io_bio, err);
bio_put(bio);
}
-static void btrfs_endio_direct_write(struct bio *bio, int err)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+ const u64 offset,
+ const u64 bytes,
+ const int uptodate)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered = NULL;
- u64 ordered_offset = dip->logical_offset;
- u64 ordered_bytes = dip->bytes;
- struct bio *dio_bio;
+ u64 ordered_offset = offset;
+ u64 ordered_bytes = bytes;
int ret;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
- ordered_bytes, !err);
+ ordered_bytes,
+ uptodate);
if (!ret)
goto out_test;
@@ -7888,20 +8011,26 @@ out_test:
* our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again
*/
- if (ordered_offset < dip->logical_offset + dip->bytes) {
- ordered_bytes = dip->logical_offset + dip->bytes -
- ordered_offset;
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
ordered = NULL;
goto again;
}
- dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct bio *dio_bio = dip->dio_bio;
+
+ btrfs_endio_direct_write_update_ordered(dip->inode,
+ dip->logical_offset,
+ dip->bytes,
+ !bio->bi_error);
kfree(dip);
- /* If we had an error make sure to clear the uptodate flag */
- if (err)
- clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
- dio_end_io(dio_bio, err);
+ dio_end_io(dio_bio, bio->bi_error);
bio_put(bio);
}
@@ -7916,9 +8045,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
return 0;
}
-static void btrfs_end_dio_bio(struct bio *bio, int err)
+static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ int err = bio->bi_error;
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -7947,8 +8077,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
if (dip->errors) {
bio_io_error(dip->orig_bio);
} else {
- set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
- bio_endio(dip->orig_bio, 0);
+ dip->dio_bio->bi_error = 0;
+ bio_endio(dip->orig_bio);
}
out:
bio_put(bio);
@@ -7957,8 +8087,11 @@ out:
static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
u64 first_sector, gfp_t gfp_flags)
{
- int nr_vecs = bio_get_nr_vecs(bdev);
- return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+ struct bio *bio;
+ bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+ if (bio)
+ bio_associate_current(bio);
+ return bio;
}
static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
@@ -8201,6 +8334,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->subio_endio = btrfs_subio_endio_read;
}
+ /*
+ * Reset the range for unsubmitted ordered extents (to a 0 length range)
+ * even if we fail to submit a bio, because in such case we do the
+ * corresponding error handling below and it must not be done a second
+ * time by btrfs_direct_IO().
+ */
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+ }
+
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
@@ -8219,7 +8367,8 @@ free_ordered:
* callbacks - they require an allocated dip and a clone of dio_bio.
*/
if (io_bio && dip) {
- bio_endio(io_bio, ret);
+ io_bio->bi_error = -EIO;
+ bio_endio(io_bio);
/*
* The end io callbacks free our dip, do the final put on io_bio
* and all the cleanup and final put for dio_bio (through
@@ -8228,25 +8377,16 @@ free_ordered:
dip = NULL;
io_bio = NULL;
} else {
- if (write) {
- struct btrfs_ordered_extent *ordered;
-
- ordered = btrfs_lookup_ordered_extent(inode,
- file_offset);
- set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- /*
- * Decrements our ref on the ordered extent and removes
- * the ordered extent from the inode's ordered tree,
- * doing all the proper resource cleanup such as for the
- * reserved space and waking up any waiters for this
- * ordered extent (through btrfs_remove_ordered_extent).
- */
- btrfs_finish_ordered_io(ordered);
- } else {
+ if (write)
+ btrfs_endio_direct_write_update_ordered(inode,
+ file_offset,
+ dio_bio->bi_iter.bi_size,
+ 0);
+ else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- }
- clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+
+ dio_bio->bi_error = -EIO;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
* nor bio_endio()/bio_io_error() against dio_bio.
@@ -8296,7 +8436,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- u64 outstanding_extents = 0;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_dio_data dio_data = { 0 };
size_t count = 0;
int flags = 0;
bool wakeup = true;
@@ -8331,10 +8472,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
mutex_unlock(&inode->i_mutex);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, count);
+ ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
- outstanding_extents = div64_u64(count +
+ dio_data.outstanding_extents = div64_u64(count +
BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
@@ -8343,7 +8484,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* do the accounting properly if we go over the number we
* originally calculated. Abuse current->journal_info for this.
*/
- current->journal_info = &outstanding_extents;
+ dio_data.reserve = round_up(count, root->sectorsize);
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
+ current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
inode_dio_end(inode);
@@ -8358,18 +8502,24 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
+ if (dio_data.reserve)
+ btrfs_delalloc_release_space(inode, offset,
+ dio_data.reserve);
/*
- * If the error comes from submitting stage,
- * btrfs_get_blocsk_direct() has free'd data space,
- * and metadata space will be handled by
- * finish_ordered_fn, don't do that again to make
- * sure bytes_may_use is correct.
+ * On error we might have left some ordered extents
+ * without submitting corresponding bios for them, so
+ * cleanup them up to avoid other tasks getting them
+ * and waiting for them to complete forever.
*/
- if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
- &BTRFS_I(inode)->runtime_flags))
- btrfs_delalloc_release_space(inode, count);
+ if (dio_data.unsubmitted_oe_range_start <
+ dio_data.unsubmitted_oe_range_end)
+ btrfs_endio_direct_write_update_ordered(inode,
+ dio_data.unsubmitted_oe_range_start,
+ dio_data.unsubmitted_oe_range_end -
+ dio_data.unsubmitted_oe_range_start,
+ 0);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
}
out:
@@ -8405,15 +8555,28 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
+ struct inode *inode = page->mapping->host;
+ int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
+ /*
+ * If we are under memory pressure we will call this directly from the
+ * VM, we need to make sure we have the inode referenced for the ordered
+ * extent. If not just return like we didn't do anything.
+ */
+ if (!igrab(inode)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ btrfs_add_delayed_iput(inode);
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -8485,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
/*
@@ -8523,11 +8686,23 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0,
+ lock_extent_bits(tree, page_start, page_end,
&cached_state);
}
}
+ /*
+ * Qgroup reserved space handler
+ * Page here will be either
+ * 1) Already written to disk
+ * In this case, its reserved space is released from data rsv map
+ * and will be freed by delayed_ref handler finally.
+ * So even we call qgroup_free_data(), it won't decrease reserved
+ * space.
+ * 2) Not written to disk
+ * This means the reserved space should be freed here.
+ */
+ btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8578,7 +8753,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_end;
sb_start_pagefault(inode->i_sb);
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8597,8 +8776,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
again:
lock_page(page);
size = i_size_read(inode);
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
@@ -8607,7 +8784,7 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
/*
@@ -8675,7 +8852,7 @@ out_unlock:
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -8881,6 +9058,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
+ ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -8905,6 +9083,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
+ INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
@@ -8964,6 +9143,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
@@ -9008,15 +9188,14 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_path_cachep);
if (btrfs_free_space_cachep)
kmem_cache_destroy(btrfs_free_space_cachep);
- if (btrfs_delalloc_work_cachep)
- kmem_cache_destroy(btrfs_delalloc_work_cachep);
}
int btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -9044,13 +9223,6 @@ int btrfs_init_cachep(void)
if (!btrfs_free_space_cachep)
goto fail;
- btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
- sizeof(struct btrfs_delalloc_work), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_delalloc_work_cachep)
- goto fail;
-
return 0;
fail:
btrfs_destroy_cachep();
@@ -9274,14 +9446,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
inode = delalloc_work->inode;
- if (delalloc_work->wait) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
if (delalloc_work->delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9291,18 +9459,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
}
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput)
+ int delay_iput)
{
struct btrfs_delalloc_work *work;
- work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ work = kmalloc(sizeof(*work), GFP_NOFS);
if (!work)
return NULL;
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->wait = wait;
work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
@@ -9314,7 +9481,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
wait_for_completion(&work->completion);
- kmem_cache_free(btrfs_delalloc_work_cachep, work);
+ kfree(work);
}
/*
@@ -9350,7 +9517,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode, delay_iput);
if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9492,9 +9659,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
/*
* 2 items for inode item and ref
* 2 items for dir items
+ * 1 item for updating parent inode item
+ * 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -9525,10 +9694,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock_inode;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
- if (err)
- goto out_unlock_inode;
-
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -9561,10 +9726,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
+ /*
+ * Last step, add directory indexes for our symlink inode. This is the
+ * last step to avoid extra cleanup of these indexes if an error happens
+ * elsewhere above.
+ */
+ if (!err)
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err) {
drop_inode = 1;
goto out_unlock_inode;
@@ -9600,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 cur_offset = start;
u64 i_size;
u64 cur_bytes;
+ u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
@@ -9614,8 +9788,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
+ /*
+ * If we are severely fragmented we could end up with really
+ * small allocations, so if the allocator is returning small
+ * chunks lets make its job easier by only searching for those
+ * sized chunks.
+ */
+ cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
*alloc_hint, &ins, 1, 0);
if (ret) {
@@ -9624,6 +9805,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
+ last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
@@ -9841,7 +10023,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
@@ -9870,7 +10052,7 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static struct extent_io_ops btrfs_extent_io_ops = {
+static const struct extent_io_ops btrfs_extent_io_ops = {
.fill_delalloc = run_delalloc_range,
.submit_bio_hook = btrfs_submit_bio_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
@@ -9918,7 +10100,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
@@ -9932,7 +10114,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
@@ -9941,13 +10123,12 @@ static const struct inode_operations btrfs_special_inode_operations = {
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.update_time = btrfs_update_time,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0770c91..2a47a31 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -655,22 +655,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+ GFP_NOFS);
+ pending_snapshot->path = btrfs_alloc_path();
+ if (!pending_snapshot->root_item || !pending_snapshot->path) {
+ ret = -ENOMEM;
+ goto free_pending;
+ }
+
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- goto out;
+ goto dec_and_free;
btrfs_wait_ordered_extents(root, -1);
- pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- goto out;
- }
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
@@ -686,7 +692,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto free;
+ goto dec_and_free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -737,11 +743,14 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-free:
- kfree(pending_snapshot);
-out:
+dec_and_free:
if (atomic_dec_and_test(&root->will_be_snapshoted))
wake_up_atomic_t(&root->will_be_snapshoted);
+free_pending:
+ kfree(pending_snapshot->root_item);
+ btrfs_free_path(pending_snapshot->path);
+ kfree(pending_snapshot);
+
return ret;
}
@@ -992,7 +1001,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
u64 end = start + len - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, 0, &cached);
+ lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
@@ -1016,7 +1025,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+ (em->block_len > SZ_128K && next->block_len > SZ_128K))
ret = false;
free_extent_map(next);
@@ -1030,6 +1039,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
struct extent_map *em;
int ret = 1;
bool next_mergeable = true;
+ bool prev_mergeable = true;
/*
* make sure that once we start defragging an extent, we keep on
@@ -1050,13 +1060,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
goto out;
}
+ if (!*defrag_end)
+ prev_mergeable = false;
+
next_mergeable = defrag_check_next_extent(inode, em);
/*
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
*/
if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || !next_mergeable))
+ (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
ret = 0;
out:
/*
@@ -1116,7 +1129,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1135,7 +1149,7 @@ again:
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
@@ -1195,7 +1209,7 @@ again:
page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state);
+ page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
@@ -1206,7 +1220,8 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
}
@@ -1231,7 +1246,9 @@ out:
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode,
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
return ret;
}
@@ -1254,9 +1271,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)128 * 1024 - 1);
+ u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
if (isize == 0)
@@ -1273,7 +1290,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
if (extent_thresh == 0)
- extent_thresh = 256 * 1024;
+ extent_thresh = SZ_256K;
/*
* if we were not given a file, allocate a readahead
@@ -1305,7 +1322,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (newer_than) {
ret = find_new_extents(root, inode, newer_than,
- &newer_off, 64 * 1024);
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
/*
@@ -1338,7 +1355,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (btrfs_defrag_cancelled(root->fs_info)) {
- printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+ btrfs_debug(root->fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
@@ -1395,9 +1412,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
newer_off = max(newer_off + 1,
(u64)i << PAGE_CACHE_SHIFT);
- ret = find_new_extents(root, inode,
- newer_than, &newer_off,
- 64 * 1024);
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
@@ -1563,7 +1579,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = old_size + new_size;
}
- if (new_size < 256 * 1024 * 1024) {
+ if (new_size < SZ_256M) {
ret = -EINVAL;
goto out_free;
}
@@ -1575,7 +1591,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = div_u64(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+ btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
@@ -1933,6 +1949,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
u64 found_transid;
struct extent_buffer *leaf;
struct btrfs_ioctl_search_header sh;
+ struct btrfs_key test;
unsigned long item_off;
unsigned long item_len;
int nritems;
@@ -2016,12 +2033,17 @@ static noinline int copy_to_sk(struct btrfs_root *root,
}
advance_key:
ret = 0;
- if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+ if (btrfs_comp_cpu_keys(key, &test) >= 0)
+ ret = 1;
+ else if (key->offset < (u64)-1)
key->offset++;
- else if (key->type < (u8)-1 && key->type < sk->max_type) {
+ else if (key->type < (u8)-1) {
key->offset = 0;
key->type++;
- } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+ } else if (key->objectid < (u64)-1) {
key->offset = 0;
key->type = 0;
key->objectid++;
@@ -2071,7 +2093,7 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n",
+ btrfs_err(info, "could not find root %llu",
sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
@@ -2146,7 +2168,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
struct inode *inode;
int ret;
size_t buf_size;
- const size_t buf_limit = 16 * 1024 * 1024;
+ const size_t buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2211,7 +2233,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+ btrfs_err(info, "could not find root %llu", tree_id);
ret = -ENOENT;
goto out;
}
@@ -2689,7 +2711,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
- struct btrfs_device *next;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
@@ -2701,7 +2722,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
fi_args->num_devices = fs_devices->num_devices;
memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
- list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
@@ -2842,8 +2863,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
swap(inode1, inode2);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- if (inode1 != inode2)
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -2861,8 +2881,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
swap(loff1, loff2);
}
lock_extent_range(inode1, loff1, len);
- if (inode1 != inode2)
- lock_extent_range(inode2, loff2, len);
+ lock_extent_range(inode2, loff2, len);
}
struct cmp_pages {
@@ -2951,7 +2970,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
flush_dcache_page(dst_page);
if (memcmp(addr, dst_addr, cmp_len))
- ret = BTRFS_SAME_DATA_DIFFERS;
+ ret = -EBADE;
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
@@ -3085,55 +3104,18 @@ out_unlock:
return ret;
}
-#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-static long btrfs_ioctl_file_extent_same(struct file *file,
- struct btrfs_ioctl_same_args __user *argp)
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff)
{
- struct btrfs_ioctl_same_args *same = NULL;
- struct btrfs_ioctl_same_extent_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- unsigned long size;
+ struct inode *src = file_inode(src_file);
+ struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- bool is_admin = capable(CAP_SYS_ADMIN);
- u16 count;
+ ssize_t res;
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- if (get_user(count, &argp->dest_count)) {
- ret = -EFAULT;
- goto out;
- }
-
- size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
-
- same = memdup_user(argp, size);
-
- if (IS_ERR(same)) {
- ret = PTR_ERR(same);
- same = NULL;
- goto out;
- }
-
- off = same->logical_offset;
- len = same->length;
-
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > BTRFS_MAX_DEDUPE_LEN)
- len = BTRFS_MAX_DEDUPE_LEN;
+ if (olen > BTRFS_MAX_DEDUPE_LEN)
+ olen = BTRFS_MAX_DEDUPE_LEN;
if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
/*
@@ -3141,93 +3123,13 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
- ret = -EINVAL;
- goto out;
- }
-
- ret = -EISDIR;
- if (S_ISDIR(src->i_mode))
- goto out;
-
- ret = -EACCES;
- if (!S_ISREG(src->i_mode))
- goto out;
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = 0;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct inode *dst;
- struct fd dst_file = fdget(info->fd);
- if (!dst_file.file) {
- info->status = -EBADF;
- continue;
- }
- dst = file_inode(dst_file.file);
-
- if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
- info->status = -EINVAL;
- } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
- info->status = -EXDEV;
- } else if (S_ISDIR(dst->i_mode)) {
- info->status = -EISDIR;
- } else if (!S_ISREG(dst->i_mode)) {
- info->status = -EACCES;
- } else {
- info->status = btrfs_extent_same(src, off, len, dst,
- info->logical_offset);
- if (info->status == 0)
- info->bytes_deduped += len;
- }
- fdput(dst_file);
+ return -EINVAL;
}
- ret = copy_to_user(argp, same, size);
- if (ret)
- ret = -EFAULT;
-
-out:
- mnt_drop_write_file(file);
- kfree(same);
- return ret;
-}
-
-/* Helper to check and see if this root currently has a ref on the given disk
- * bytenr. If it does then we need to update the quota for this root. This
- * doesn't do anything if quotas aren't enabled.
- */
-static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 disko)
-{
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
- struct ulist *roots;
- struct ulist_iterator uiter;
- struct ulist_node *root_node = NULL;
- int ret;
-
- if (!root->fs_info->quota_enabled)
- return 1;
-
- btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
- ret = btrfs_find_all_roots(trans, root->fs_info, disko,
- tree_mod_seq_elem.seq, &roots);
- if (ret < 0)
- goto out;
- ret = 0;
- ULIST_ITER_INIT(&uiter);
- while ((root_node = ulist_next(roots, &uiter))) {
- if (root_node->val == root->objectid) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
-out:
- btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
- return ret;
+ res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
+ if (res)
+ return res;
+ return olen;
}
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
@@ -3320,6 +3222,150 @@ static void clone_update_extent_map(struct inode *inode,
&BTRFS_I(inode)->runtime_flags);
}
+/*
+ * Make sure we do not end up inserting an inline extent into a file that has
+ * already other (non-inline) extents. If a file has an inline extent it can
+ * not have any other extents and the (single) inline extent must start at the
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need turn the inline extent into a non-inline one (even
+ * with NO_HOLES enabled). If we find our destination inode only has one inline
+ * extent, just overwrite it with the source inline extent if its size is less
+ * than the source extent's size, or we could copy the source inline extent's
+ * data into the destination inode's inline extent if the later is greater then
+ * the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+ struct inode *dst,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 skip,
+ const u64 size,
+ char *inline_data)
+{
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ root->sectorsize);
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0)
+ return -EOPNOTSUPP;
+
+ key.objectid = btrfs_ino(dst);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(dst) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(key.offset > 0);
+ return -EOPNOTSUPP;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+ u64 ext_len;
+
+ /*
+ * If the file size is <= datal, make sure there are no other
+ * extents following (can happen do to an fallocate call with
+ * the flag FALLOC_FL_KEEP_SIZE).
+ */
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent, it can not have other extents
+ * following it.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+ if (ext_len > aligned_end)
+ return -EOPNOTSUPP;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == btrfs_ino(dst) &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ return -EOPNOTSUPP;
+ }
+ }
+
+copy_inline_extent:
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * If the destination inode has an inline extent...
+ * This would require copying the data from the source inline
+ * extent into the beginning of the destination's inline extent.
+ * But this is really complex, both extents can be compressed
+ * or just one of them, which would require decompressing and
+ * re-compressing data (which could increase the new compressed
+ * size, not allowing the compressed data to fit anymore in an
+ * inline extent).
+ * So just don't support this case for now (it should be rare,
+ * we are not really saving space when cloning inline extents).
+ */
+ return -EOPNOTSUPP;
+ }
+
+ btrfs_release_path(path);
+ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ if (ret)
+ return ret;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ return ret;
+
+ if (skip) {
+ const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+ memmove(inline_data + start, inline_data + start + skip, datal);
+ }
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ inode_add_bytes(dst, datal);
+
+ return 0;
+}
+
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -3344,9 +3390,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u32 nritems;
int slot;
int ret;
- int no_quota;
const u64 len = olen_aligned;
- u64 last_disko = 0;
u64 last_dest_end = destoff;
ret = -ENOMEM;
@@ -3360,7 +3404,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -3392,7 +3436,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
- no_quota = 1;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
@@ -3544,35 +3587,13 @@ process_slot:
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
- /*
- * We need to look up the roots that point at
- * this bytenr and see if the new root does. If
- * it does not we need to make sure we update
- * quotas appropriately.
- */
- if (disko && root != BTRFS_I(src)->root &&
- disko != last_disko) {
- no_quota = check_ref(trans, root,
- disko);
- if (no_quota < 0) {
- btrfs_abort_transaction(trans,
- root,
- ret);
- btrfs_end_transaction(trans,
- root);
- ret = no_quota;
- goto out;
- }
- }
-
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(inode),
- new_key.offset - datao,
- no_quota);
+ new_key.offset - datao);
if (ret) {
btrfs_abort_transaction(trans,
root,
@@ -3586,21 +3607,6 @@ process_slot:
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
- u64 aligned_end = 0;
-
- /*
- * Don't copy an inline extent into an offset
- * greater than zero. Having an inline extent
- * at such an offset results in chaos as btrfs
- * isn't prepared for such cases. Just skip
- * this case for the same reasons as commented
- * at btrfs_ioctl_clone().
- */
- if (last_dest_end > 0) {
- ret = -EOPNOTSUPP;
- btrfs_end_transaction(trans, root);
- goto out;
- }
if (off > key.offset) {
skip = off - key.offset;
@@ -3618,42 +3624,22 @@ process_slot:
size -= skip + trim;
datal -= skip + trim;
- aligned_end = ALIGN(new_key.offset + datal,
- root->sectorsize);
- ret = btrfs_drop_extents(trans, root, inode,
- drop_start,
- aligned_end,
- 1);
+ ret = clone_copy_inline_extent(src, inode,
+ trans, path,
+ &new_key,
+ drop_start,
+ datal,
+ skip, size, buf);
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root, ret);
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
- ret = btrfs_insert_empty_item(trans, root, path,
- &new_key, size);
- if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ root,
+ ret);
btrfs_end_transaction(trans, root);
goto out;
}
-
- if (skip) {
- u32 start =
- btrfs_file_extent_calc_inline_size(0);
- memmove(buf+start, buf+start+skip,
- datal);
- }
-
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
- inode_add_bytes(inode, datal);
}
/* If we have an implicit hole (NO_HOLES feature). */
@@ -3719,17 +3705,16 @@ out:
return ret;
}
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct fd src_file;
- struct inode *src;
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- int same_inode = 0;
+ int same_inode = src == inode;
/*
* TODO:
@@ -3742,58 +3727,23 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* be either compressed or non-compressed.
*/
- /* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
- return -EINVAL;
-
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- src_file = fdget(srcfd);
- if (!src_file.file) {
- ret = -EBADF;
- goto out_drop_write;
- }
-
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != file->f_path.mnt)
- goto out_fput;
-
- src = file_inode(src_file.file);
-
- ret = -EINVAL;
- if (src == inode)
- same_inode = 1;
-
- /* the src must be open for reading */
- if (!(src_file.file->f_mode & FMODE_READ))
- goto out_fput;
+ if (file_src->f_path.mnt != file->f_path.mnt ||
+ src->i_sb != inode->i_sb)
+ return -EXDEV;
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- goto out_fput;
+ return -EINVAL;
- ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- goto out_fput;
-
- ret = -EXDEV;
- if (src->i_sb != inode->i_sb)
- goto out_fput;
+ return -EISDIR;
if (!same_inode) {
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- }
+ btrfs_double_inode_lock(src, inode);
} else {
mutex_lock(&src->i_mutex);
}
@@ -3843,8 +3793,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
lock_extent_range(src, lock_start, lock_len);
} else {
- lock_extent_range(src, off, len);
- lock_extent_range(inode, destoff, len);
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
}
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
@@ -3855,9 +3804,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
} else {
- unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
- destoff + len - 1);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
}
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -3866,32 +3813,29 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
truncate_inode_pages_range(&inode->i_data, destoff,
PAGE_CACHE_ALIGN(destoff + len) - 1);
out_unlock:
- if (!same_inode) {
- if (inode < src) {
- mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- mutex_unlock(&inode->i_mutex);
- mutex_unlock(&src->i_mutex);
- }
- } else {
+ if (!same_inode)
+ btrfs_double_inode_unlock(src, inode);
+ else
mutex_unlock(&src->i_mutex);
- }
-out_fput:
- fdput(src_file);
-out_drop_write:
- mnt_drop_write_file(file);
return ret;
}
-static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
{
- struct btrfs_ioctl_clone_range_args args;
+ ssize_t ret;
- if (copy_from_user(&args, argp, sizeof(args)))
- return -EFAULT;
- return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
- args.src_length, args.dest_offset);
+ ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
+ if (ret == 0)
+ ret = len;
+ return ret;
+}
+
+int btrfs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
/*
@@ -4103,7 +4047,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
return -ENOMEM;
space_args.total_spaces = 0;
- dest = kmalloc(alloc_size, GFP_NOFS);
+ dest = kmalloc(alloc_size, GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest_orig = dest;
@@ -4480,7 +4424,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- size = min_t(u32, loi->size, 64 * 1024);
+ size = min_t(u32, loi->size, SZ_64K);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -4629,7 +4573,7 @@ locked:
goto out_bargs;
}
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
@@ -4647,6 +4591,11 @@ locked:
bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
}
+ if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_bctl;
+ }
+
do_balance:
/*
* Ownership of bctl and mutually_exclusive_operation_running
@@ -4658,12 +4607,15 @@ do_balance:
need_unlock = false;
ret = btrfs_balance(bctl, bargs);
+ bctl = NULL;
if (arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
+out_bctl:
+ kfree(bctl);
out_bargs:
kfree(bargs);
out_unlock:
@@ -4707,7 +4659,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
goto out;
}
- bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
if (!bargs) {
ret = -ENOMEM;
goto out;
@@ -4814,7 +4766,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"failed to update qgroup status and info\n");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
@@ -4967,7 +4919,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
if (!qsa)
return -ENOMEM;
@@ -5097,7 +5049,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
goto out;
}
- args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
ret = -ENOMEM;
goto out;
@@ -5234,7 +5186,7 @@ out_unlock:
static int btrfs_ioctl_get_supported_features(struct file *file,
void __user *arg)
{
- static struct btrfs_ioctl_feature_flags features[3] = {
+ static const struct btrfs_ioctl_feature_flags features[3] = {
INIT_FEATURE_FLAGS(SUPP),
INIT_FEATURE_FLAGS(SAFE_SET),
INIT_FEATURE_FLAGS(SAFE_CLEAR)
@@ -5433,10 +5385,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_CLONE:
- return btrfs_ioctl_clone(file, arg, 0, 0, 0);
- case BTRFS_IOC_CLONE_RANGE:
- return btrfs_ioctl_clone_range(file, argp);
case BTRFS_IOC_TRANS_START:
return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END:
@@ -5514,8 +5462,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
- case BTRFS_IOC_FILE_EXTENT_SAME:
- return btrfs_ioctl_file_extent_same(file, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(file, argp);
case BTRFS_IOC_GET_FEATURES:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f8229ef..d13128c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
atomic_dec(&eb->spinning_readers);
read_unlock(&eb->lock);
}
- return;
}
/*
@@ -79,6 +78,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_writers) &&
waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
@@ -86,11 +88,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
}
- return;
}
/*
@@ -229,6 +233,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
@@ -241,6 +248,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*/
void btrfs_tree_lock(struct extent_buffer *eb)
{
+ WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
@@ -279,6 +287,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52170cf..8c27292 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -409,6 +412,9 @@ have_entry:
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
spin_lock_irq(&log->log_extents_lock[index]);
while (!list_empty(&log->logged_list[index])) {
+ struct inode *inode;
ordered = list_first_entry(&log->logged_list[index],
struct btrfs_ordered_extent,
log_list);
list_del_init(&ordered->log_list);
+ inode = ordered->inode;
spin_unlock_irq(&log->log_extents_lock[index]);
if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- struct inode *inode = ordered->inode;
u64 start = ordered->file_offset;
u64 end = ordered->file_offset + ordered->len - 1;
@@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
&ordered->flags));
/*
- * If our ordered extent completed it means it updated the
- * fs/subvol and csum trees already, so no need to make the
- * current transaction's commit wait for it, as we end up
- * holding memory unnecessarily and delaying the inode's iput
- * until the transaction commit (we schedule an iput for the
- * inode when the ordered extent's refcount drops to 0), which
- * prevents it from being evictable until the transaction
- * commits.
+ * In order to keep us from losing our ordered extent
+ * information when committing the transaction we have to make
+ * sure that any logged extents are completed when we go to
+ * commit the transaction. To do this we simply increase the
+ * current transactions pending_ordered counter and decrement it
+ * when the ordered extent completes.
*/
- if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
- btrfs_put_ordered_extent(ordered);
- else
- list_add_tail(&ordered->trans_list, &trans->ordered);
-
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&tree->lock);
+ }
+ btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
+ bool dec_pending_ordered = false;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
+ dec_pending_ordered = true;
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (dec_pending_ordered) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&root->fs_info->trans_lock);
+ trans = root->fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0..23c9605 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
* in the logging code. */
+#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
+ * complete in the current transaction. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dca137b..f9e6023 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
.extract = prop_compression_extract,
.inheritable = 1
},
- {
- .xattr_name = NULL
- }
};
void __init btrfs_props_init(void)
{
- struct prop_handler *p;
+ int i;
hash_init(prop_handlers_ht);
- for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ struct prop_handler *p = &prop_handlers[i];
u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *parent)
{
- const struct prop_handler *h;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ int i;
if (!test_bit(BTRFS_INODE_HAS_PROPS,
&BTRFS_I(parent)->runtime_flags))
return 0;
- for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ const struct prop_handler *h = &prop_handlers[i];
const char *value;
u64 num_bytes;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 8a82029..5279fda 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -376,7 +376,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- btrfs_err(fs_info, "inconsitent qgroup config");
+ btrfs_err(fs_info, "inconsistent qgroup config");
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
if (!qgroup) {
@@ -993,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
- spin_lock(&fs_info->qgroup_lock);
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ btrfs_qgroup_wait_for_completion(fs_info);
+ spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -1461,6 +1462,8 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ assert_spin_locked(&delayed_refs->lock);
+
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
@@ -1652,10 +1655,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
}
}
- /* For exclusive extent, free its reserved bytes too */
- if (nr_old_roots == 0 && nr_new_roots == 1 &&
- cur_new_count == nr_new_roots)
- qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
@@ -2035,7 +2034,7 @@ out:
return ret;
}
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -2116,14 +2115,13 @@ out:
return ret;
}
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
- u64 ref_root = root->root_key.objectid;
int ret = 0;
if (!is_fstree(ref_root))
@@ -2169,6 +2167,11 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+ num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2188,17 +2191,16 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans)
{
struct btrfs_key found;
+ struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
int slot;
int ret;
- path->leave_spinning = 1;
mutex_lock(&fs_info->qgroup_rescan_lock);
ret = btrfs_search_slot_for_read(fs_info->extent_root,
&fs_info->qgroup_rescan_progress,
@@ -2229,7 +2231,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!scratch_leaf) {
+ ret = -ENOMEM;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto out;
+ }
+ extent_buffer_get(scratch_leaf);
+ btrfs_tree_read_lock(scratch_leaf);
+ btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2255,6 +2265,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
goto out;
}
out:
+ if (scratch_leaf) {
+ btrfs_tree_read_unlock_blocking(scratch_leaf);
+ free_extent_buffer(scratch_leaf);
+ }
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
return ret;
@@ -2266,19 +2280,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
goto out;
- scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
- if (!scratch_leaf)
- goto out;
err = 0;
- while (!err) {
+ while (!err && !btrfs_fs_closing(fs_info)) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -2287,8 +2297,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
if (!fs_info->quota_enabled) {
err = -EINTR;
} else {
- err = qgroup_rescan_leaf(fs_info, path, trans,
- scratch_leaf);
+ err = qgroup_rescan_leaf(fs_info, path, trans);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2297,11 +2306,11 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
}
out:
- kfree(scratch_leaf);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2330,7 +2339,9 @@ out:
}
btrfs_end_transaction(trans, fs_info->quota_root);
- if (err >= 0) {
+ if (btrfs_fs_closing(fs_info)) {
+ btrfs_info(fs_info, "qgroup scan paused");
+ } else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
} else {
@@ -2378,12 +2389,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- init_completion(&fs_info->qgroup_rescan_completion);
-
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -2486,3 +2496,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or doing
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for successful reserve
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ len == 0)
+ return 0;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ trace_btrfs_qgroup_reserve_data(inode, start, len,
+ changeset.bytes_changed,
+ QGROUP_RESERVE);
+ if (ret < 0)
+ goto cleanup;
+ ret = qgroup_reserve(root, changeset.bytes_changed);
+ if (ret < 0)
+ goto cleanup;
+
+ ulist_free(changeset.range_changed);
+ return ret;
+
+cleanup:
+ /* cleanup already reserved ranges */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(changeset.range_changed, &uiter)))
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+ unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+ GFP_NOFS);
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+ int free)
+{
+ struct extent_changeset changeset;
+ int trace_op = QGROUP_RELEASE;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (!changeset.range_changed)
+ return -ENOMEM;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ if (ret < 0)
+ goto out;
+
+ if (free) {
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ trace_op = QGROUP_FREE;
+ }
+ trace_btrfs_qgroup_release_data(inode, start, len,
+ changeset.bytes_changed, trace_op);
+out:
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages get invalidated before reaching disk.
+ * Or for error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages get written to disk and corresponding
+ * FILE_EXTENT is inserted into corresponding root.
+ *
+ * Since new qgroup accounting framework will only update qgroup numbers at
+ * commit_transaction() time, its reserved space shouldn't be freed from
+ * related qgroups.
+ *
+ * But we should release the range from io_tree, to allow further write to be
+ * COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ num_bytes == 0)
+ return 0;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ ret = qgroup_reserve(root, num_bytes);
+ if (ret < 0)
+ return ret;
+ atomic_add(num_bytes, &root->qgroup_meta_rsv);
+ return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+ int reserved;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+ if (reserved == 0)
+ return;
+ qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+ atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+ qgroup_free(root, num_bytes);
+}
+
+/*
+ * Check qgroup reserved space leaking, normally at destory inode
+ * time
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator iter;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (WARN_ON(!changeset.range_changed))
+ return;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+ WARN_ON(ret < 0);
+ if (WARN_ON(changeset.bytes_changed)) {
+ ULIST_ITER_INIT(&iter);
+ while ((unode = ulist_next(changeset.range_changed, &iter))) {
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
+ "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+ inode->i_ino, unode->val, unode->aux);
+ }
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ }
+ ulist_free(changeset.range_changed);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcf..ecb2c14 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
};
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE (1<<0)
+#define QGROUP_RELEASE (1<<1)
+#define QGROUP_FREE (1<<2)
+
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes);
+/*
+ * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
+ * called by everywhere, can't provide good trace for delayed ref case.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
+{
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+ trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fa72068..6d70754 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -61,9 +61,10 @@
#define RBIO_CACHE_SIZE 1024
enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE = 0,
- BTRFS_RBIO_READ_REBUILD = 1,
- BTRFS_RBIO_PARITY_SCRUB = 2,
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
@@ -502,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
}
spin_unlock_irqrestore(&table->cache_lock, flags);
- return;
}
/*
@@ -602,6 +602,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
cur->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+ cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+ return 0;
+
return 1;
}
@@ -793,7 +797,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
if (next->operation == BTRFS_RBIO_READ_REBUILD)
async_read_rebuild(next);
- else if (next->operation == BTRFS_RBIO_WRITE) {
+ else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ steal_rbio(rbio, next);
+ async_read_rebuild(next);
+ } else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
async_rmw_stripe(next);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
@@ -802,7 +809,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
goto done_nolock;
- } else if (waitqueue_active(&h->wait)) {
+ /*
+ * The barrier for this waitqueue_active is not needed,
+ * we're protected by h->lock and can't miss a wakeup.
+ */
+ } else if (waitqueue_active(&h->wait)) {
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
wake_up(&h->wait);
@@ -851,7 +862,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *next;
@@ -864,9 +875,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- if (uptodate)
- set_bit(BIO_UPTODATE, &cur->bi_flags);
- bio_endio(cur, err);
+ cur->bi_error = err;
+ bio_endio(cur);
cur = next;
}
}
@@ -875,9 +885,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
-static void raid_write_end_io(struct bio *bio, int err)
+static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
+ int err = bio->bi_error;
if (err)
fail_bio_stripe(rbio, bio);
@@ -893,8 +904,7 @@ static void raid_write_end_io(struct bio *bio, int err)
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
err = -EIO;
- rbio_orig_end_io(rbio, err, 0);
- return;
+ rbio_orig_end_io(rbio, err);
}
/*
@@ -1071,7 +1081,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && stripe->dev->bdev &&
- test_bit(BIO_UPTODATE, &last->bi_flags) &&
+ !last->bi_error &&
last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
if (ret == PAGE_CACHE_SIZE)
@@ -1087,7 +1097,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
bio->bi_iter.bi_size = 0;
bio->bi_bdev = stripe->dev->bdev;
bio->bi_iter.bi_sector = disk_start >> 9;
- set_bit(BIO_UPTODATE, &bio->bi_flags);
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
bio_list_add(bio_list, bio);
@@ -1312,13 +1321,12 @@ write_data:
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(WRITE, bio);
}
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
/*
@@ -1441,11 +1449,11 @@ static void set_bio_pages_uptodate(struct bio *bio)
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid_rmw_end_io(struct bio *bio, int err)
+static void raid_rmw_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -1455,7 +1463,6 @@ static void raid_rmw_end_io(struct bio *bio, int err)
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- err = 0;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
goto cleanup;
@@ -1469,7 +1476,7 @@ static void raid_rmw_end_io(struct bio *bio, int err)
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
@@ -1572,14 +1579,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(READ, bio);
}
/* the actual write will happen once the reads are done */
return 0;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
return -EIO;
finish:
@@ -1809,7 +1815,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
faila = rbio->faila;
failb = rbio->failb;
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
spin_lock_irq(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
@@ -1834,7 +1841,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1943,7 +1951,8 @@ pstripe:
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1964,7 +1973,9 @@ cleanup_io:
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- rbio_orig_end_io(rbio, err, err == 0);
+ rbio_orig_end_io(rbio, err);
+ } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ rbio_orig_end_io(rbio, err);
} else if (err == 0) {
rbio->faila = -1;
rbio->failb = -1;
@@ -1976,7 +1987,7 @@ cleanup_io:
else
BUG();
} else {
- rbio_orig_end_io(rbio, err, 0);
+ rbio_orig_end_io(rbio, err);
}
}
@@ -1984,7 +1995,7 @@ cleanup_io:
* This is called only for stripes we've read from disk to
* reconstruct the parity.
*/
-static void raid_recover_end_io(struct bio *bio, int err)
+static void raid_recover_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
@@ -1992,7 +2003,7 @@ static void raid_recover_end_io(struct bio *bio, int err)
* we only read stripe pages off the disk, set them
* up to date if there were no errors
*/
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2002,7 +2013,7 @@ static void raid_recover_end_io(struct bio *bio, int err)
return;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
else
__raid_recover_end_io(rbio);
}
@@ -2094,15 +2105,15 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(READ, bio);
}
out:
return 0;
cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
- rbio_orig_end_io(rbio, -EIO, 0);
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
+ rbio_orig_end_io(rbio, -EIO);
return -EIO;
}
@@ -2232,8 +2243,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
return rbio;
}
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
- struct page *page, u64 logical)
+/* Used for both parity scrub and missing. */
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ u64 logical)
{
int stripe_offset;
int index;
@@ -2277,11 +2289,12 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
-static void raid_write_parity_end_io(struct bio *bio, int err)
+static void raid_write_parity_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
+ int err = bio->bi_error;
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
bio_put(bio);
@@ -2294,7 +2307,7 @@ static void raid_write_parity_end_io(struct bio *bio, int err)
if (atomic_read(&rbio->error))
err = -EIO;
- rbio_orig_end_io(rbio, err, 0);
+ rbio_orig_end_io(rbio, err);
}
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
@@ -2437,7 +2450,7 @@ submit_write:
nr_data = bio_list_size(&bio_list);
if (!nr_data) {
/* Every parity is right */
- rbio_orig_end_io(rbio, 0, 0);
+ rbio_orig_end_io(rbio, 0);
return;
}
@@ -2450,13 +2463,12 @@ submit_write:
bio->bi_private = rbio;
bio->bi_end_io = raid_write_parity_end_io;
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(WRITE, bio);
}
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2524,7 +2536,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
}
/*
@@ -2535,11 +2547,11 @@ cleanup:
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
-static void raid56_parity_scrub_end_io(struct bio *bio, int err)
+static void raid56_parity_scrub_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- if (err)
+ if (bio->bi_error)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
@@ -2632,14 +2644,13 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
btrfs_bio_wq_end_io(rbio->fs_info, bio,
BTRFS_WQ_ENDIO_RAID56);
- BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
submit_bio(READ, bio);
}
/* the actual write will happen once the reads are done */
return;
cleanup:
- rbio_orig_end_io(rbio, -EIO, 0);
+ rbio_orig_end_io(rbio, -EIO);
return;
finish:
@@ -2668,3 +2679,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
if (!lock_stripe_add(rbio))
async_scrub_parity(rbio);
}
+
+/* The following code is used for dev replace of a missing RAID 5/6 device. */
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 length)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = alloc_rbio(root, bbio, length);
+ if (IS_ERR(rbio))
+ return NULL;
+
+ rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the scrub rbio is similar to the other types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ BUG();
+ kfree(rbio);
+ return NULL;
+ }
+
+ return rbio;
+}
+
+static void missing_raid56_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
+
+static void async_missing_raid56(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ missing_raid56_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+}
+
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ async_missing_raid56(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2b5d797..8b69469 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len);
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ u64 logical);
+
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
- struct page *page, u64 logical);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 length);
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
+
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0e7beea..619f929 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
struct btrfs_device *prev_dev;
u32 blocksize;
u64 length;
+ int real_stripes;
int nzones = 0;
int i;
unsigned long index = logical >> PAGE_CACHE_SHIFT;
@@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
goto error;
}
- for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+ real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
dev = bbio->stripes[nzones].dev;
@@ -567,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
rec = kzalloc(sizeof(*rec), GFP_NOFS);
if (!rec) {
reada_extent_put(root->fs_info, re);
- return -1;
+ return -ENOMEM;
}
rec->rc = rc;
@@ -916,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
u64 start;
u64 generation;
int level;
+ int ret;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
@@ -941,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- if (reada_add_block(rc, start, &max_key, level, generation)) {
+ ret = reada_add_block(rc, start, &max_key, level, generation);
+ if (ret) {
kfree(rc);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
reada_start_machine(root->fs_info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 88cbb59..ef6d8fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
- path1->reada = 1;
- path2->reada = 2;
+ path1->reada = READA_FORWARD;
+ path2->reada = READA_FORWARD;
node = alloc_backref_node(cache);
if (!node) {
@@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
- key.objectid, key.offset, 1);
+ key.objectid, key.offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
break;
@@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
- key.objectid, key.offset, 1);
+ key.objectid, key.offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
break;
@@ -1900,23 +1900,21 @@ again:
ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0,
- 1);
+ src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0, 1);
+ 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0,
- 1);
+ src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0, 1);
+ 0);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -2132,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -2418,7 +2416,7 @@ again:
}
out:
if (ret) {
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -2523,8 +2521,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
* counted. return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
-struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
- struct backref_node *node)
+struct btrfs_root *select_one_root(struct backref_node *node)
{
struct backref_node *next;
struct btrfs_root *root;
@@ -2746,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
- node->level, 0, 1);
+ node->level, 0);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2912,7 +2909,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
return 0;
BUG_ON(node->processed);
- root = select_one_root(trans, node);
+ root = select_one_root(node);
if (root == ERR_PTR(-ENOENT)) {
update_processed_blocks(rc, node);
goto out;
@@ -3035,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
BUG_ON(cluster->start != cluster->boundary[0]);
mutex_lock(&inode->i_mutex);
- ret = btrfs_check_data_free_space(inode, cluster->end +
- 1 - cluster->start, 0);
+ ret = btrfs_check_data_free_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
if (ret)
goto out;
@@ -3057,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
break;
nr++;
}
- btrfs_free_reserved_data_space(inode, cluster->end +
- 1 - cluster->start);
+ btrfs_free_reserved_data_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
@@ -3530,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
@@ -3755,8 +3752,7 @@ out:
* helper to find next unprocessed extent
*/
static noinline_for_stack
-int find_next_extent(struct btrfs_trans_handle *trans,
- struct reloc_control *rc, struct btrfs_path *path,
+int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
struct btrfs_key *extent_key)
{
struct btrfs_key key;
@@ -3921,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
ret = prepare_to_relocate(rc);
if (ret) {
@@ -3951,7 +3947,7 @@ restart:
continue;
}
- ret = find_next_extent(trans, rc, path, &key);
+ ret = find_next_extent(rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
@@ -3976,6 +3972,10 @@ restart:
sizeof(struct btrfs_extent_item_v0));
ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
&path_change);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
else
@@ -4140,7 +4140,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key key;
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+ u64 objectid;
int err = 0;
root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
@@ -4215,14 +4215,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- if (!rc->block_group->ro) {
- ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
- if (ret) {
- err = ret;
- goto out;
- }
- rw = 1;
+ ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
}
+ rw = 1;
path = btrfs_alloc_path();
if (!path) {
@@ -4294,7 +4292,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
if (err && rw)
- btrfs_set_block_group_rw(extent_root, rc->block_group);
+ btrfs_dec_block_group_ro(extent_root, rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
kfree(rc);
@@ -4345,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4594,8 +4592,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
* called before creating snapshot. it calculates metadata reservation
* requried for relocating tree blocks in the snapshot
*/
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
{
struct btrfs_root *root;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728..7cf8509 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
- printk(KERN_WARNING "BTRFS: mismatching "
+ btrfs_warn(eb->fs_info,
+ "mismatching "
"generation and generation_v2 "
"found in root item. This root "
"was probably mounted with an "
"older kernel. Resetting all "
- "new fields.\n");
+ "new fields.");
}
need_reset = 1;
}
@@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
int ret;
int slot;
unsigned long ptr;
- int old_len;
+ u32 old_len;
path = btrfs_alloc_path();
if (!path)
@@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to start trans to delete "
"orphan item");
break;
@@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid);
btrfs_end_transaction(trans, tree_root);
if (err) {
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to delete root orphan "
"item");
break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 94db0fa..0c981eb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -125,6 +125,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
+ struct btrfs_work work;
};
/* Used for the chunks with parity stripe such RAID5/6 */
@@ -247,14 +248,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size);
+ struct scrub_block *sblock,
+ int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
@@ -278,7 +274,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
u64 physical_for_dev_replace);
-static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
@@ -295,7 +291,7 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
-static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
u64 physical_for_dev_replace, struct page *page);
@@ -332,11 +328,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
}
}
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
+}
+static void scrub_pause_off(struct btrfs_fs_info *fs_info)
+{
mutex_lock(&fs_info->scrub_lock);
__scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused);
@@ -345,6 +344,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->scrub_pause_wait);
}
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ scrub_pause_on(fs_info);
+ scrub_pause_off(fs_info);
+}
+
/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
@@ -454,27 +459,14 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
struct scrub_ctx *sctx;
int i;
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
- int pages_per_rd_bio;
int ret;
- /*
- * the setting of pages_per_rd_bio is correct for scrub but might
- * be wrong for the dev_replace code where we might read from
- * different devices in the initial huge bios. However, that
- * code is able to correctly handle the case when adding a page
- * to a bio fails.
- */
- if (dev->bdev)
- pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
- bio_get_nr_vecs(dev->bdev));
- else
- pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
if (!sctx)
goto nomem;
atomic_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
- sctx->pages_per_rd_bio = pages_per_rd_bio;
+ sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->dev_root = dev->dev_root;
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
@@ -583,9 +575,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
- "length %llu, links %u (path: %s)\n", swarn->errstr,
+ "length %llu, links %u (path: %s)", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -595,9 +587,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
return 0;
err:
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
- "resolving failed with ret=%d\n", swarn->errstr,
+ "resolving failed with ret=%d", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset, ret);
@@ -652,10 +644,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
- printk_in_rcu(KERN_WARNING
- "BTRFS: %s at logical %llu on dev %s, "
+ btrfs_warn_in_rcu(fs_info,
+ "%s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
- "%llu\n", errstr, swarn.logical,
+ "%llu", errstr, swarn.logical,
rcu_str_deref(dev->name),
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
@@ -853,8 +845,8 @@ out:
btrfs_dev_replace_stats_inc(
&sctx->dev_root->fs_info->dev_replace.
num_uncorrectable_read_errors);
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "unable to fixup (nodatasum) error at logical %llu on dev %s",
fixup->logical, rcu_str_deref(fixup->dev->name));
}
@@ -892,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
- u64 generation;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- u8 *csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
@@ -921,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
- generation = sblock_to_check->pagev[0]->generation;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
- csum = sblock_to_check->pagev[0]->csum;
dev = sblock_to_check->pagev[0]->dev;
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
@@ -990,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -1104,9 +1091,7 @@ nodatasum_case:
sblock_other = sblocks_for_recheck + mirror_index;
/* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size, 0);
+ scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1218,9 +1203,7 @@ nodatasum_case:
* is verified, but most likely the data comes out
* of the page cache.
*/
- scrub_recheck_block(fs_info, sblock_bad,
- is_metadata, have_csum, csum,
- generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1233,8 +1216,8 @@ corrected_error:
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: fixed up error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
} else {
@@ -1242,8 +1225,8 @@ did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
@@ -1321,6 +1304,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
+ u64 generation = original_sblock->pagev[0]->generation;
+ u64 flags = original_sblock->pagev[0]->flags;
+ u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
@@ -1375,6 +1361,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
+
page = kzalloc(sizeof(*page), GFP_NOFS);
if (!page) {
leave_nomem:
@@ -1386,7 +1373,15 @@ leave_nomem:
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
+ page->sblock = sblock;
+ page->flags = flags;
+ page->generation = generation;
page->logical = logical;
+ page->have_csum = have_csum;
+ if (have_csum)
+ memcpy(page->csum,
+ original_sblock->pagev[0]->csum,
+ sctx->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1429,11 +1424,11 @@ struct scrub_bio_ret {
int error;
};
-static void scrub_bio_wait_endio(struct bio *bio, int error)
+static void scrub_bio_wait_endio(struct bio *bio)
{
struct scrub_bio_ret *ret = bio->bi_private;
- ret->error = error;
+ ret->error = bio->bi_error;
complete(&ret->event);
}
@@ -1477,15 +1472,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
* the pages that are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror)
+ struct scrub_block *sblock,
+ int retry_failed_mirror)
{
int page_num;
sblock->no_io_error_seen = 1;
- sblock->header_error = 0;
- sblock->checksum_error = 0;
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
@@ -1521,11 +1513,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- csum_size);
-
- return;
+ scrub_recheck_block_checksum(sblock);
}
static inline int scrub_check_fsid(u8 fsid[],
@@ -1538,61 +1526,16 @@ static inline int scrub_check_fsid(u8 fsid[],
return !ret;
}
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size)
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
- int page_num;
- u8 calculated_csum[BTRFS_CSUM_SIZE];
- u32 crc = ~(u32)0;
- void *mapped_buffer;
-
- WARN_ON(!sblock->pagev[0]->page);
- if (is_metadata) {
- struct btrfs_header *h;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- h = (struct btrfs_header *)mapped_buffer;
-
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
- memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE)) {
- sblock->header_error = 1;
- } else if (generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
- }
- csum = h->csum;
- } else {
- if (!have_csum)
- return;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- }
-
- for (page_num = 0;;) {
- if (page_num == 0 && is_metadata)
- crc = btrfs_csum_data(
- ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
- crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
- else
- crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
-
- kunmap_atomic(mapped_buffer);
- page_num++;
- if (page_num >= sblock->page_count)
- break;
- WARN_ON(!sblock->pagev[page_num]->page);
-
- mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
- }
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+ sblock->generation_error = 0;
- btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, csum, csum_size))
- sblock->checksum_error = 1;
+ if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ scrub_checksum_data(sblock);
+ else
+ scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
@@ -1629,9 +1572,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
int ret;
if (!page_bad->dev->bdev) {
- printk_ratelimited(KERN_WARNING "BTRFS: "
+ btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) "
- "is unexpected!\n");
+ "is unexpected");
return -EIO;
}
@@ -1790,12 +1733,12 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(WRITE, sbio->bio);
}
-static void scrub_wr_bio_end_io(struct bio *bio, int err)
+static void scrub_wr_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
- sbio->err = err;
+ sbio->err = bio->bi_error;
sbio->bio = bio;
btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
@@ -1836,6 +1779,18 @@ static int scrub_checksum(struct scrub_block *sblock)
u64 flags;
int ret;
+ /*
+ * No need to initialize these stats currently,
+ * because this function only use return value
+ * instead of these stats value.
+ *
+ * Todo:
+ * always use stats
+ */
+ sblock->header_error = 0;
+ sblock->generation_error = 0;
+ sblock->checksum_error = 0;
+
WARN_ON(sblock->page_count < 1);
flags = sblock->pagev[0]->flags;
ret = 0;
@@ -1861,7 +1816,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
- int fail = 0;
u64 len;
int index;
@@ -1892,9 +1846,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
btrfs_csum_final(crc, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
- fail = 1;
+ sblock->checksum_error = 1;
- return fail;
+ return sblock->checksum_error;
}
static int scrub_checksum_tree_block(struct scrub_block *sblock)
@@ -1910,8 +1864,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
u64 len;
int index;
@@ -1926,19 +1878,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
-
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
- ++fail;
+ sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
- ++fail;
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
- ++fail;
+ sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
- ++fail;
+ sblock->header_error = 1;
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1963,9 +1916,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
- ++crc_fail;
+ sblock->checksum_error = 1;
- return fail || crc_fail;
+ return sblock->header_error || sblock->checksum_error;
}
static int scrub_checksum_super(struct scrub_block *sblock)
@@ -2087,21 +2040,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
-
- if (!sbio->bio->bi_bdev) {
- /*
- * this case should not happen. If btrfs_map_block() is
- * wrong, it could happen for dev-replace operations on
- * missing devices when no mirrors are available, but in
- * this case it should already fail the mount.
- * This case is handled correctly (but _very_ slowly).
- */
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
- bio_endio(sbio->bio, -EIO);
- } else {
- btrfsic_submit_bio(READ, sbio->bio);
- }
+ btrfsic_submit_bio(READ, sbio->bio);
}
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
@@ -2178,6 +2117,122 @@ again:
return 0;
}
+static void scrub_missing_raid56_end_io(struct bio *bio)
+{
+ struct scrub_block *sblock = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
+
+ if (bio->bi_error)
+ sblock->no_io_error_seen = 0;
+
+ btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+}
+
+static void scrub_missing_raid56_worker(struct btrfs_work *work)
+{
+ struct scrub_block *sblock = container_of(work, struct scrub_block, work);
+ struct scrub_ctx *sctx = sblock->sctx;
+ u64 logical;
+ struct btrfs_device *dev;
+
+ logical = sblock->pagev[0]->logical;
+ dev = sblock->pagev[0]->dev;
+
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
+
+ if (!sblock->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "IO error rebuilding logical %llu for dev %s",
+ logical, rcu_str_deref(dev->name));
+ } else if (sblock->header_error || sblock->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "failed to rebuild valid logical %llu for dev %s",
+ logical, rcu_str_deref(dev->name));
+ } else {
+ scrub_write_block_to_dev_replace(sblock);
+ }
+
+ scrub_block_put(sblock);
+
+ if (sctx->is_dev_replace &&
+ atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = sblock->page_count * PAGE_SIZE;
+ u64 logical = sblock->pagev[0]->logical;
+ struct btrfs_bio *bbio;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ int ret;
+ int i;
+
+ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
+ goto bbio_out;
+
+ if (WARN_ON(!sctx->is_dev_replace ||
+ !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ /*
+ * We shouldn't be scrubbing a missing device. Even for dev
+ * replace, we should only get here for RAID 5/6. We either
+ * managed to mount something with no mirrors remaining or
+ * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ */
+ goto bbio_out;
+ }
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ goto bbio_out;
+
+ bio->bi_iter.bi_sector = logical >> 9;
+ bio->bi_private = sblock;
+ bio->bi_end_io = scrub_missing_raid56_end_io;
+
+ rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length);
+ if (!rbio)
+ goto rbio_out;
+
+ for (i = 0; i < sblock->page_count; i++) {
+ struct scrub_page *spage = sblock->pagev[i];
+
+ raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ }
+
+ btrfs_init_work(&sblock->work, btrfs_scrub_helper,
+ scrub_missing_raid56_worker, NULL, NULL);
+ scrub_block_get(sblock);
+ scrub_pending_bio_inc(sctx);
+ raid56_submit_missing_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bbio_out:
+ btrfs_put_bbio(bbio);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+}
+
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
@@ -2241,31 +2296,39 @@ leave_nomem:
}
WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
- int ret;
+ if (dev->missing) {
+ /*
+ * This case should only be hit for RAID 5/6 device replace. See
+ * the comment in scrub_missing_raid56_pages() for details.
+ */
+ scrub_missing_raid56_pages(sblock);
+ } else {
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
- if (ret) {
- scrub_block_put(sblock);
- return ret;
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
}
- }
- if (force)
- scrub_submit(sctx);
+ if (force)
+ scrub_submit(sctx);
+ }
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
return 0;
}
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
- sbio->err = err;
+ sbio->err = bio->bi_error;
sbio->bio = bio;
btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
@@ -2381,8 +2444,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
}
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
- u8 *csum)
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
@@ -2446,7 +2508,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
if (sctx->is_dev_replace && !have_csum) {
@@ -2564,6 +2626,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
+ if (dev->missing) {
+ scrub_parity_mark_sectors_error(sparity, logical, len);
+ return 0;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_DATA) {
blocksize = sctx->sectorsize;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
@@ -2579,7 +2646,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
goto skip;
}
@@ -2672,11 +2739,11 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
scrub_pending_bio_dec(sctx);
}
-static void scrub_parity_bio_endio(struct bio *bio, int error)
+static void scrub_parity_bio_endio(struct bio *bio)
{
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
- if (error)
+ if (bio->bi_error)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
@@ -2702,7 +2769,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
sparity->nsectors))
goto out;
- length = sparity->logic_end - sparity->logic_start + 1;
+ length = sparity->logic_end - sparity->logic_start;
ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
sparity->logic_start,
&length, &bbio, 0, 1);
@@ -2725,8 +2792,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto rbio_out;
list_for_each_entry(spage, &sparity->spages, list)
- raid56_parity_add_scrub_pages(rbio, spage->page,
- spage->logical);
+ raid56_add_scrub_pages(rbio, spage->page, spage->logical);
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
@@ -2774,6 +2840,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
+ struct btrfs_bio *bbio = NULL;
u64 flags;
int ret;
int slot;
@@ -2783,6 +2850,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 extent_logical;
u64 extent_physical;
u64 extent_len;
+ u64 mapped_length;
struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
@@ -2856,6 +2924,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
}
btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = root->nodesize;
else
@@ -2864,11 +2936,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
if (key.objectid + bytes <= logic_start)
goto next;
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.objectid > logic_end) {
+ if (key.objectid >= logic_end) {
stop_loop = 1;
break;
}
@@ -2881,11 +2949,15 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
- if (key.objectid < logic_start &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
+ if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ (key.objectid < logic_start ||
+ key.objectid + bytes >
+ logic_start + map->stripe_len)) {
+ btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ key.objectid, logic_start);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
again:
@@ -2905,10 +2977,21 @@ again:
scrub_parity_mark_sectors_data(sparity, extent_logical,
extent_len);
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, READ, extent_logical,
+ &mapped_length, &bbio, 0);
+ if (!ret) {
+ if (!bbio || mapped_length < extent_len)
+ ret = -EIO;
+ }
+ if (ret) {
+ btrfs_put_bbio(bbio);
+ goto out;
+ }
+ extent_physical = bbio->stripes[0].physical;
+ extent_mirror_num = bbio->mirror_num;
+ extent_dev = bbio->stripes[0].dev;
+ btrfs_put_bbio(bbio);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
@@ -2923,10 +3006,12 @@ again:
extent_dev, flags,
generation,
extent_mirror_num);
+
+ scrub_free_csums(sctx);
+
if (ret)
goto out;
- scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
logic_start += map->stripe_len;
@@ -2955,7 +3040,7 @@ next:
out:
if (ret < 0)
scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start + 1);
+ logic_end - logic_start);
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_ctx.wr_lock);
@@ -3104,22 +3189,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
ret = 0;
while (physical < physical_end) {
- /* for raid56, we skip parity stripe */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, num,
- map, &logical, &stripe_logical);
- logical += base;
- if (ret) {
- stripe_logical += base;
- stripe_end = stripe_logical + increment - 1;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- ppath, stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
/*
* canceled?
*/
@@ -3144,6 +3213,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
}
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = get_raid56_logic_offset(physical, num, map,
+ &logical,
+ &stripe_logical);
+ logical += base;
+ if (ret) {
+ /* it is parity strip */
+ stripe_logical += base;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ ppath, stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto skip;
+ }
+ }
+
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
@@ -3188,6 +3275,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = root->nodesize;
else
@@ -3196,10 +3287,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (key.objectid + bytes <= logical)
goto next;
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
if (key.objectid >= logical + map->stripe_len) {
/* out of this device extent */
if (key.objectid >= logic_end)
@@ -3212,12 +3299,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
- if (key.objectid < logical &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ (key.objectid < logical ||
+ key.objectid + bytes >
+ logical + map->stripe_len)) {
btrfs_err(fs_info,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
key.objectid, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
@@ -3247,9 +3339,11 @@ again:
&extent_dev,
&extent_mirror_num);
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sctx->csum_list, 1);
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical +
+ extent_len - 1,
+ &sctx->csum_list, 1);
if (ret)
goto out;
@@ -3257,10 +3351,12 @@ again:
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
extent_logical - logical + physical);
+
+ scrub_free_csums(sctx);
+
if (ret)
goto out;
- scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3278,7 +3374,7 @@ loop:
if (ret && physical < physical_end) {
stripe_logical += base;
stripe_end = stripe_logical +
- increment - 1;
+ increment;
ret = scrub_raid56_parity(sctx,
map, scrub_dev, ppath,
stripe_logical,
@@ -3333,9 +3429,10 @@ out:
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
- u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 length,
- u64 dev_offset, int is_dev_replace)
+ u64 dev_offset,
+ struct btrfs_block_group_cache *cache,
+ int is_dev_replace)
{
struct btrfs_mapping_tree *map_tree =
&sctx->dev_root->fs_info->mapping_tree;
@@ -3348,8 +3445,18 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
- if (!em)
- return -EINVAL;
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed)
+ ret = -EINVAL;
+ spin_unlock(&cache->lock);
+
+ return ret;
+ }
map = (struct map_lookup *)em->bdev;
if (em->start != chunk_offset)
@@ -3384,10 +3491,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 length;
- u64 chunk_tree;
- u64 chunk_objectid;
u64 chunk_offset;
- int ret;
+ int ret = 0;
+ int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -3399,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3415,8 +3521,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
- if (ret)
+ if (ret < 0)
break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ } else {
+ ret = 0;
}
}
@@ -3443,8 +3555,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (found_key.offset + length <= start)
goto skip;
- chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
- chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
/*
@@ -3458,12 +3568,41 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ /*
+ * we need call btrfs_inc_block_group_ro() with scrubs_paused,
+ * to avoid deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+ ret = btrfs_inc_block_group_ro(root, cache);
+ scrub_pause_off(fs_info);
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC) {
+ /*
+ * btrfs_inc_block_group_ro return -ENOSPC when it
+ * failed in creating new chunk for metadata.
+ * It is not a problem for scrub/replace, because
+ * metadata are always cowed, and our scrub paused
+ * commit_transactions.
+ */
+ ro_set = 0;
+ } else {
+ btrfs_warn(fs_info, "failed setting block group ro, ret=%d\n",
+ ret);
+ btrfs_put_block_group(cache);
+ break;
+ }
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
- chunk_offset, length, found_key.offset,
- is_dev_replace);
+ ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
+ found_key.offset, cache, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3483,8 +3622,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_inc(&fs_info->scrubs_paused);
- wake_up(&fs_info->scrub_pause_wait);
+
+ scrub_pause_on(fs_info);
/*
* must be called before we decrease @scrub_paused.
@@ -3495,11 +3634,32 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
atomic_read(&sctx->workers_pending) == 0);
atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
- mutex_lock(&fs_info->scrub_lock);
- __scrub_blocked_if_needed(fs_info);
- atomic_dec(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- wake_up(&fs_info->scrub_pause_wait);
+ scrub_pause_off(fs_info);
+
+ if (ro_set)
+ btrfs_dec_block_group_ro(root, cache);
+
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed && !cache->ro && cache->reserved == 0 &&
+ btrfs_block_group_used(&cache->item) == 0) {
+ spin_unlock(&cache->lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ } else {
+ spin_unlock(&cache->lock);
+ }
btrfs_put_block_group(cache);
if (ret)
@@ -3523,11 +3683,7 @@ skip:
btrfs_free_path(path);
- /*
- * ret can still be 1 from search_slot or next_leaf,
- * that's not an error
- */
- return ret < 0 ? ret : 0;
+ return ret;
}
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
@@ -3577,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
1, 4);
else
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ btrfs_alloc_workqueue("scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
fs_info->scrub_nocow_workers =
- btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
if (!fs_info->scrub_nocow_workers)
goto fail_scrub_nocow_workers;
fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ btrfs_alloc_workqueue("scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
@@ -3896,8 +4052,7 @@ static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
return 0;
WARN_ON(!dev->bdev);
- wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
- bio_get_nr_vecs(dev->bdev));
+ wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
wr_ctx->tgtdev = dev;
atomic_set(&wr_ctx->flush_all_writes, 0);
return 0;
@@ -4054,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
io_tree = &BTRFS_I(inode)->io_tree;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
@@ -4219,8 +4374,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
if (!dev)
return -EIO;
if (!dev->bdev) {
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ btrfs_warn_rl(dev->dev_root->fs_info,
+ "scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index aa72bfd..63a6152 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
- if (compressed != BTRFS_COMPRESS_NONE) {
- /*
- * Offsets given by iterate_extent_inodes() are relative
- * to the start of the extent, we need to add logical
- * offset from the file extent item.
- * (See why at backref.c:check_extent_in_eb())
- */
- cur_clone_root->offset += btrfs_file_extent_offset(eb,
- fi);
- }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1479,7 +1469,21 @@ static int read_symlink(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret);
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between or the subvol was snapshoted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
@@ -1920,10 +1924,12 @@ static int did_overwrite_ref(struct send_ctx *sctx,
/*
* We know that it is or will be overwritten. Check this now.
* The current inode being processed might have been the one that caused
- * inode 'ino' to be orphanized, therefore ow_inode can actually be the
- * same as sctx->send_progress.
+ * inode 'ino' to be orphanized, therefore check if ow_inode matches
+ * the current inode being processed.
*/
- if (ow_inode <= sctx->send_progress)
+ if ((ow_inode < sctx->send_progress) ||
+ (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
+ gen == sctx->cur_inode_gen))
ret = 1;
else
ret = 0;
@@ -2351,8 +2357,14 @@ static int send_subvol_begin(struct send_ctx *sctx)
}
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
- sctx->send_root->root_item.uuid);
+
+ if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
@@ -2562,7 +2574,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
} else if (S_ISSOCK(mode)) {
cmd = BTRFS_SEND_C_MKSOCK;
} else {
- printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
ret = -ENOTSUPP;
goto out;
@@ -4685,6 +4697,171 @@ tlv_put_failure:
return ret;
}
+static int send_extent_data(struct send_ctx *sctx,
+ const u64 offset,
+ const u64 len)
+{
+ u64 sent = 0;
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, len);
+
+ while (sent < len) {
+ u64 size = len - sent;
+ int ret;
+
+ if (size > BTRFS_SEND_READ_SIZE)
+ size = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, offset + sent, size);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ break;
+ sent += ret;
+ }
+ return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+ struct clone_root *clone_root,
+ const u64 disk_byte,
+ u64 data_offset,
+ u64 offset,
+ u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * We can't send a clone operation for the entire range if we find
+ * extent items in the respective range in the source file that
+ * refer to different extents or if we find holes.
+ * So check for that and do a mix of clone and regular write/copy
+ * operations if needed.
+ *
+ * Example:
+ *
+ * mkfs.btrfs -f /dev/sda
+ * mount /dev/sda /mnt
+ * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+ * cp --reflink=always /mnt/foo /mnt/bar
+ * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+ * btrfs subvolume snapshot -r /mnt /mnt/snap
+ *
+ * If when we send the snapshot and we are processing file bar (which
+ * has a higher inode number than foo) we blindly send a clone operation
+ * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+ * a file bar that matches the content of file foo - iow, doesn't match
+ * the content from bar in the original filesystem.
+ */
+ key.objectid = clone_root->ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_root->offset;
+ ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == clone_root->ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u64 ext_len;
+ u64 clone_len;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(clone_root->root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /*
+ * We might have an implicit trailing hole (NO_HOLES feature
+ * enabled). We deal with it after leaving this loop.
+ */
+ if (key.objectid != clone_root->ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+ ext_len = PAGE_CACHE_ALIGN(ext_len);
+ } else {
+ ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+ }
+
+ if (key.offset + ext_len <= clone_root->offset)
+ goto next;
+
+ if (key.offset > clone_root->offset) {
+ /* Implicit hole, NO_HOLES feature enabled. */
+ u64 hole_len = key.offset - clone_root->offset;
+
+ if (hole_len > len)
+ hole_len = len;
+ ret = send_extent_data(sctx, offset, hole_len);
+ if (ret < 0)
+ goto out;
+
+ len -= hole_len;
+ if (len == 0)
+ break;
+ offset += hole_len;
+ clone_root->offset += hole_len;
+ data_offset += hole_len;
+ }
+
+ if (key.offset >= clone_root->offset + len)
+ break;
+
+ clone_len = min_t(u64, ext_len, len);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+ btrfs_file_extent_offset(leaf, ei) == data_offset)
+ ret = send_clone(sctx, offset, clone_len, clone_root);
+ else
+ ret = send_extent_data(sctx, offset, clone_len);
+
+ if (ret < 0)
+ goto out;
+
+ len -= clone_len;
+ if (len == 0)
+ break;
+ offset += clone_len;
+ clone_root->offset += clone_len;
+ data_offset += clone_len;
+next:
+ path->slots[0]++;
+ }
+
+ if (len > 0)
+ ret = send_extent_data(sctx, offset, len);
+ else
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -4693,9 +4870,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
int ret = 0;
struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 pos = 0;
u64 len;
- u32 l;
u8 type;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
@@ -4723,22 +4898,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
}
if (clone_root && IS_ALIGNED(offset + len, bs)) {
- ret = send_clone(sctx, offset, len, clone_root);
- } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
- ret = send_update_extent(sctx, offset, len);
+ u64 disk_byte;
+ u64 data_offset;
+
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+ offset, len);
} else {
- while (pos < len) {
- l = len - pos;
- if (l > BTRFS_SEND_READ_SIZE)
- l = BTRFS_SEND_READ_SIZE;
- ret = send_write(sctx, pos + offset, l);
- if (ret < 0)
- goto out;
- if (!ret)
- break;
- pos += ret;
- }
- ret = 0;
+ ret = send_extent_data(sctx, offset, len);
}
out:
return ret;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 48d425a..02e0016 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -22,8 +22,8 @@
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
#define BTRFS_SEND_STREAM_VERSION 1
-#define BTRFS_SEND_BUF_SIZE (1024 * 64)
-#define BTRFS_SEND_READ_SIZE (1024 * 48)
+#define BTRFS_SEND_BUF_SIZE SZ_64K
+#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cd7ef34..9b9eab6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-static const char *btrfs_decode_error(int errno)
+const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
}
}
-#ifdef CONFIG_PRINTK
/*
* __btrfs_std_error decodes expected errors from the caller and
* invokes the approciate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
const char *errstr;
+#endif
/*
* Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
+#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
+#endif
/* Don't go through full error handling during mount */
save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
btrfs_handle_error(fs_info);
}
+#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_end(args);
}
-
-#else
-
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- struct super_block *sb = fs_info->sb;
-
- /*
- * Special case: if the error is EROFS, and we're already
- * under MS_RDONLY, then it is safe here.
- */
- if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
-
- /* Don't go through full error handling during mount */
- if (sb->s_flags & MS_BORN) {
- save_error_info(fs_info);
- btrfs_handle_error(fs_info);
- }
-}
#endif
/*
@@ -312,18 +295,22 @@ enum {
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
- Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
- Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+ Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+ Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+ Opt_skip_balance, Opt_check_integrity,
+ Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache,
+#ifdef CONFIG_BTRFS_DEBUG
+ Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
{Opt_subvolid, "subvolid=%s"},
@@ -354,6 +341,7 @@ static match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
+ {Opt_space_cache_version, "space_cache=%s"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
@@ -372,6 +360,11 @@ static match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_DEBUG
+ {Opt_fragment_data, "fragment=data"},
+ {Opt_fragment_metadata, "fragment=metadata"},
+ {Opt_fragment_all, "fragment=all"},
+#endif
{Opt_err, NULL},
};
@@ -392,7 +385,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
bool compress_force = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (cache_gen)
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+ else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
if (!options)
@@ -626,15 +621,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"turning off discard");
break;
case Opt_space_cache:
- btrfs_set_and_info(root, SPACE_CACHE,
- "enabling disk space caching");
+ case Opt_space_cache_version:
+ if (token == Opt_space_cache ||
+ strcmp(args[0].from, "v1") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ FREE_SPACE_TREE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ } else if (strcmp(args[0].from, "v2") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ SPACE_CACHE);
+ btrfs_set_and_info(root, FREE_SPACE_TREE,
+ "enabling free space tree");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- btrfs_clear_and_info(root, SPACE_CACHE,
- "disabling disk space caching");
+ if (btrfs_test_opt(root, SPACE_CACHE)) {
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ }
+ if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ "disabling free space tree");
+ }
break;
case Opt_inode_cache:
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
@@ -738,6 +753,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+#ifdef CONFIG_BTRFS_DEBUG
+ case Opt_fragment_all:
+ btrfs_info(root->fs_info, "fragmenting all space");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_metadata:
+ btrfs_info(root->fs_info, "fragmenting metadata");
+ btrfs_set_opt(info->mount_opt,
+ FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_data:
+ btrfs_info(root->fs_info, "fragmenting data");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ break;
+#endif
case Opt_err:
btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
@@ -747,8 +778,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
}
out:
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, CLEAR_CACHE)) {
+ btrfs_err(root->fs_info, "cannot disable free space tree");
+ ret = -EINVAL;
+
+ }
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
+ if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
}
@@ -1033,6 +1073,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
sb->s_flags |= MS_I_VERSION;
+ sb->s_iflags |= SB_I_CGROUPWB;
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
printk(KERN_ERR "BTRFS: open_ctree failed\n");
@@ -1154,6 +1195,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, RESCAN_UUID_TREE))
@@ -1188,6 +1231,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_test_opt(root, FRAGMENT_DATA))
+ seq_puts(seq, ",fragment=data");
+ if (btrfs_test_opt(root, FRAGMENT_METADATA))
+ seq_puts(seq, ",fragment=metadata");
+#endif
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
seq_puts(seq, ",subvol=");
@@ -1500,9 +1549,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if ((flags ^ s->s_flags) & MS_RDONLY)
error = -EBUSY;
} else {
- char b[BDEVNAME_SIZE];
-
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
@@ -1650,6 +1697,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags |= MS_RDONLY;
+ /*
+ * Setting MS_RDONLY will put the cleaner thread to
+ * sleep at the next loop if it's already active.
+ * If it's already asleep, we'll leave unused block
+ * groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
+
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
@@ -1842,7 +1898,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
- skip_space = 1024 * 1024;
+ skip_space = SZ_1M;
/* user can set the offset in fs_info->alloc_start. */
if (fs_info->alloc_start &&
@@ -1933,6 +1989,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* there are other factors that may change the result (like a new metadata
* chunk).
*
+ * If metadata is exhausted, f_bavail will be 0.
+ *
* FIXME: not accurate for mixed block groups, total and free/used are ok,
* available appears slightly larger.
*/
@@ -1944,11 +2002,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
+ u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
+ u64 thresh = 0;
/*
* holding chunk_muext to avoid allocating new chunks, holding
@@ -1974,6 +2034,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ total_free_meta += found->disk_total - found->disk_used;
total_used += found->disk_used;
}
@@ -1996,6 +2058,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
+ /*
+ * We calculate the remaining metadata space minus global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this
+ * does not hold in practice, the exhausted state happens where's still
+ * some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various
+ * operations, so some operations that consume a few metadata will
+ * succeed even if the Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = 4 * 1024 * 1024;
+
+ if (total_free_meta - thresh < block_rsv->size)
+ buf->f_bavail = 0;
+
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
@@ -2163,8 +2243,7 @@ static int btrfs_interface_init(void)
static void btrfs_interface_exit(void)
{
- if (misc_deregister(&btrfs_misc) < 0)
- printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
+ misc_deregister(&btrfs_misc);
}
static void btrfs_print_info(void)
@@ -2203,6 +2282,9 @@ static int btrfs_run_sanity_tests(void)
if (ret)
goto out;
ret = btrfs_test_qgroups();
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree();
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 603b0cc..e0ac859 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
NULL,
};
-static void btrfs_release_super_kobj(struct kobject *kobj)
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
{
struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
- memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = btrfs_release_super_kobj,
+ .release = btrfs_release_fsid_kobj,
};
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+ return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
}
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
}
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
fs_devs->device_dir_kobj = NULL;
}
- if (fs_devs->super_kobj.state_initialized) {
- kobject_del(&fs_devs->super_kobj);
- kobject_put(&fs_devs->super_kobj);
+ if (fs_devs->fsid_kobj.state_initialized) {
+ kobject_del(&fs_devs->fsid_kobj);
+ kobject_put(&fs_devs->fsid_kobj);
wait_for_completion(&fs_devs->kobj_unregister);
}
}
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
btrfs_reset_fs_info_ptr(fs_info);
@@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
kobject_put(fs_info->space_info_kobj);
}
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
- btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
+ sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
/* when one_device is NULL, it removes all device links */
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
if (!fs_devs->device_dir_kobj)
fs_devs->device_dir_kobj = kobject_create_and_add("devices",
- &fs_devs->super_kobj);
+ &fs_devs->fsid_kobj);
if (!fs_devs->device_dir_kobj)
return -ENOMEM;
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
int error = 0;
@@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
int error;
init_completion(&fs_devs->kobj_unregister);
- fs_devs->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->super_kobj,
+ fs_devs->fsid_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
return error;
}
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
int error;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
- struct kobject *super_kobj = &fs_devs->super_kobj;
+ struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
btrfs_set_fs_info_ptr(fs_info);
- error = btrfs_kobj_add_device(fs_devs, NULL);
+ error = btrfs_sysfs_add_device_link(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_files(super_kobj, btrfs_attrs);
+ error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_kobj_rm_device(fs_devs, NULL);
+ btrfs_sysfs_rm_device_link(fs_devs, NULL);
return error;
}
- error = sysfs_create_group(super_kobj,
+ error = sysfs_create_group(fsid_kobj,
&btrfs_feature_attr_group);
if (error)
goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- super_kobj);
+ fsid_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
return 0;
failure:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
return error;
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6392527..9c09522 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9626252..b1d920b 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../free-space-cache.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../qgroup.h"
@@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+ extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
return fs_info;
}
@@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
kfree(root);
}
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+ cache->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!cache->fs_info) {
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = 0;
+ cache->key.offset = length;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
+
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ btrfs_init_free_space_ctl(cache);
+ mutex_init(&cache->free_space_lock);
+
+ return cache;
+}
+
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (!cache)
+ return;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+}
+
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index fd39542..054b8c7 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,17 +24,23 @@
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
struct btrfs_root;
+struct btrfs_trans_handle;
int btrfs_test_free_space_cache(void);
int btrfs_test_extent_buffer_operations(void);
int btrfs_test_extent_io(void);
int btrfs_test_inodes(void);
int btrfs_test_qgroups(void);
+int btrfs_test_free_space_tree(void);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
void btrfs_free_dummy_root(struct btrfs_root *root);
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length);
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
static inline int btrfs_test_free_space_cache(void)
{
@@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void)
{
return 0;
}
+static inline int btrfs_test_free_space_tree(void)
+{
+ return 0;
+}
#endif
#endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e9f236..e29fa29 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
#include "btrfs-tests.h"
#include "../extent_io.h"
@@ -70,12 +72,14 @@ static int test_find_delalloc(void)
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
- u64 total_dirty = 256 * 1024 * 1024;
- u64 max_bytes = 128 * 1024 * 1024;
+ u64 total_dirty = SZ_256M;
+ u64 max_bytes = SZ_128M;
u64 start, end, test_start;
u64 found;
int ret = -EINVAL;
+ test_msg("Running find delalloc tests\n");
+
inode = btrfs_new_test_inode();
if (!inode) {
test_msg("Failed to allocate test inode\n");
@@ -133,7 +137,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- test_start = 64 * 1024 * 1024;
+ test_start = SZ_64M;
locked_page = find_lock_page(inode->i_mapping,
test_start >> PAGE_CACHE_SHIFT);
if (!locked_page) {
@@ -220,8 +224,8 @@ static int test_find_delalloc(void)
* Now to test where we run into a page that is no longer dirty in the
* range we want to find.
*/
- page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
- >> PAGE_CACHE_SHIFT);
+ page = find_get_page(inode->i_mapping,
+ (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT);
if (!page) {
test_msg("Couldn't find our page\n");
goto out_bits;
@@ -268,8 +272,139 @@ out:
return ret;
}
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
+ unsigned long len)
+{
+ unsigned long i, x;
+
+ memset(bitmap, 0, len);
+ memset_extent_buffer(eb, 0, 0, len);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Bitmap was not zeroed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ bitmap_clear(bitmap,
+ (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a wonky pseudo-random bit pattern for the sake of not using
+ * something repetitive that could miss some hypothetical off-by-n bug.
+ */
+ x = 0;
+ for (i = 0; i < len / sizeof(long); i++) {
+ x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL;
+ bitmap[i] = x;
+ }
+ write_extent_buffer(eb, bitmap, 0, len);
+
+ for (i = 0; i < len * BITS_PER_BYTE; i++) {
+ int bit, bit1;
+
+ bit = !!test_bit(i, bitmap);
+ bit1 = !!extent_buffer_test_bit(eb, 0, i);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern failed\n");
+ return -EINVAL;
+ }
+
+ bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern with offset failed\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int test_eb_bitmaps(void)
+{
+ unsigned long len = PAGE_CACHE_SIZE * 4;
+ unsigned long *bitmap;
+ struct extent_buffer *eb;
+ int ret;
+
+ test_msg("Running extent buffer bitmap tests\n");
+
+ bitmap = kmalloc(len, GFP_NOFS);
+ if (!bitmap) {
+ test_msg("Couldn't allocate test bitmap\n");
+ return -ENOMEM;
+ }
+
+ eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+ if (ret)
+ goto out;
+
+ /* Do it over again with an extent buffer which isn't page-aligned. */
+ free_extent_buffer(eb);
+ eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+out:
+ free_extent_buffer(eb);
+ kfree(bitmap);
+ return ret;
+}
+
int btrfs_test_extent_io(void)
{
- test_msg("Running find delalloc tests\n");
- return test_find_delalloc();
+ int ret;
+
+ test_msg("Running extent I/O tests\n");
+
+ ret = test_find_delalloc();
+ if (ret)
+ goto out;
+
+ ret = test_eb_bitmaps();
+out:
+ test_msg("Extent I/O tests finished\n");
+ return ret;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 2299bfd..c9ad97b 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -19,38 +19,10 @@
#include <linux/slab.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../disk-io.h"
#include "../free-space-cache.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-static struct btrfs_block_group_cache *init_test_block_group(void)
-{
- struct btrfs_block_group_cache *cache;
-
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
- if (!cache)
- return NULL;
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- return NULL;
- }
-
- cache->key.objectid = 0;
- cache->key.offset = 1024 * 1024 * 1024;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = 4096;
- cache->full_stripe_len = 4096;
-
- spin_lock_init(&cache->lock);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->bg_list);
-
- btrfs_init_free_space_ctl(cache);
-
- return cache;
-}
/*
* This test just does basic sanity checking, making sure we can add an exten
@@ -64,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache)
test_msg("Running extent only tests\n");
/* First just make sure we can remove an entire entry */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding initial extents %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing extent %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Full remove left some lingering space\n");
return -1;
}
/* Ok edge and middle cases now */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding half extent %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
if (ret) {
test_msg("Error removing tail end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Error removing front end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
if (ret) {
test_msg("Error removing middle piece %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Still have space at the front\n");
return -1;
}
- if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+ if (test_check_exists(cache, SZ_2M, 4096)) {
test_msg("Still have space in the middle\n");
return -1;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
test_msg("Still have space at the end\n");
return -1;
}
@@ -134,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
test_msg("Running bitmap only tests\n");
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't create a bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing bitmap full range %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Left some space in bitmap\n");
return -1;
}
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to our bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove middle chunk %d\n", ret);
return ret;
@@ -170,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
/* Test a bit straddling two bitmaps */
- ret = test_add_free_space_entry(cache, next_bitmap_offset -
- (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
+ SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space that straddles two bitmaps %d\n",
ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, next_bitmap_offset -
- (1 * 1024 * 1024), 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
- 2 * 1024 * 1024)) {
+ if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
test_msg("Left some space when removing overlapping\n");
return -1;
}
@@ -209,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* bitmap, but the free space completely in the extent and then
* completely in the bitmap.
*/
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
if (ret) {
test_msg("Couldn't create bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Couldn't remove extent entry %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Left remnants after our remove\n");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't re-add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
if (ret) {
test_msg("Couldn't remove from bitmap %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_4M, SZ_1M)) {
test_msg("Left remnants in the bitmap\n");
return -1;
}
@@ -254,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* Ok so a little more evil, extent entry and bitmap at the same offset,
* removing an overlapping chunk.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to a bitmap %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
test_msg("Left over pieces after removing overlapping\n");
return -1;
}
@@ -274,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
__btrfs_remove_free_space_cache(cache->free_space_ctl);
/* Now with the extent entry offset into the bitmap */
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space to the bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
if (ret) {
test_msg("Couldn't add extent to the cache %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
if (ret) {
test_msg("Problem removing overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
test_msg("Left something behind when removing space");
return -1;
}
@@ -308,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* [ del ]
*/
__btrfs_remove_free_space_cache(cache->free_space_ctl);
- ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
- 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
- 5 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
+ 5 * SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
if (ret) {
test_msg("Failed to free our space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024)) {
+ if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
test_msg("Left stuff over\n");
return -1;
}
@@ -343,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* to return -EAGAIN back from btrfs_remove_extent, make sure this
* doesn't happen.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
if (ret) {
test_msg("Error removing bitmap and extent overlapping %d\n", ret);
return ret;
@@ -438,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int ret;
u64 offset;
u64 max_extent_size;
-
- bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
- struct btrfs_free_space *);
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
+ .use_bitmap = test_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
test_msg("Running space stealing from bitmap to extent\n");
@@ -462,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that forces use of bitmaps as soon as we have at least 1
* extent entry.
*/
- use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
- cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
/*
* Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
+ SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -495,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 512Kb, 128Mb + 768Kb[
*/
ret = btrfs_remove_free_space(cache,
- 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024);
+ SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -518,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -528,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -538,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
* by the bitmap too, isn't marked as free either.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -549,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* lets make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -574,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
- 4096);
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -594,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb - 256Kb, 128Mb - 128Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -630,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
test_msg("Cache free space is not 1Mb + 4Kb\n");
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ if (offset != (SZ_128M - SZ_256K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -663,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 4096, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ if (offset != (SZ_128M + SZ_16M)) {
test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -684,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
/*
* Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
- ret = test_add_free_space_entry(cache, 0,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -710,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 128b, 128Mb + 256Kb[
* [128Mb - 768Kb, 128Mb - 512Kb[
*/
- ret = btrfs_remove_free_space(cache,
- 0,
- 128 * 1024 * 1024 - 768 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -734,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 0,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -744,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb - 512Kb, 128Mb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -755,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* lets make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -782,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ ret = btrfs_add_free_space(cache, SZ_32M, 8192);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -793,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb + 128Kb, 128Mb + 256Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -827,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
test_msg("Cache free space is not 1Mb + 8Kb\n");
return -EINVAL;
}
- offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ if (offset != (SZ_128M - 768 * SZ_1K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -860,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 8192, 0,
&max_extent_size);
- if (offset != (32 * 1024 * 1024)) {
+ if (offset != SZ_32M) {
test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -870,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
if (ret)
return ret;
- cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ cache->free_space_ctl->op = orig_free_space_ops;
__btrfs_remove_free_space_cache(cache->free_space_ctl);
return 0;
@@ -879,16 +827,30 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
- int ret;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
test_msg("Running btrfs free space cache tests\n");
- cache = init_test_block_group();
+ cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
if (!cache) {
test_msg("Couldn't run the tests\n");
return 0;
}
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info)
+ goto out;
+
+ root->fs_info->extent_root = root;
+ cache->fs_info = root->fs_info;
+
ret = test_extents(cache);
if (ret)
goto out;
@@ -901,9 +863,8 @@ int btrfs_test_free_space_cache(void)
ret = test_steal_space_from_bitmap_to_extent(cache);
out:
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
- kfree(cache->free_space_ctl);
- kfree(cache);
+ btrfs_free_dummy_block_group(cache);
+ btrfs_free_dummy_root(root);
test_msg("Free space cache tests finished\n");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
new file mode 100644
index 0000000..d05fe1a
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+
+struct free_space_extent {
+ u64 start, length;
+};
+
+/*
+ * The test cases align their operations to this in order to hit some of the
+ * edge cases in the bitmap code.
+ */
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+
+static int __check_free_space_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path,
+ struct free_space_extent *extents,
+ unsigned int num_extents)
+{
+ struct btrfs_free_space_info *info;
+ struct btrfs_key key;
+ int prev_bit = 0, bit;
+ u64 extent_start = 0, offset, end;
+ u32 flags, extent_count;
+ unsigned int i;
+ int ret;
+
+ info = search_free_space_info(trans, fs_info, cache, path, 0);
+ if (IS_ERR(info)) {
+ test_msg("Could not find free space info\n");
+ ret = PTR_ERR(info);
+ goto out;
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+ if (extent_count != num_extents) {
+ test_msg("Extent count is wrong\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (path->slots[0] != 0)
+ goto invalid;
+ end = cache->key.objectid + cache->key.offset;
+ i = 0;
+ while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY)
+ goto invalid;
+ offset = key.objectid;
+ while (offset < key.objectid + key.offset) {
+ bit = free_space_test_bit(cache, path, offset);
+ if (prev_bit == 0 && bit == 1) {
+ extent_start = offset;
+ } else if (prev_bit == 1 && bit == 0) {
+ if (i >= num_extents)
+ goto invalid;
+ if (i >= num_extents ||
+ extent_start != extents[i].start ||
+ offset - extent_start != extents[i].length)
+ goto invalid;
+ i++;
+ }
+ prev_bit = bit;
+ offset += cache->sectorsize;
+ }
+ }
+ if (prev_bit == 1) {
+ if (i >= num_extents ||
+ extent_start != extents[i].start ||
+ end - extent_start != extents[i].length)
+ goto invalid;
+ i++;
+ }
+ if (i != num_extents)
+ goto invalid;
+ } else {
+ if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 ||
+ path->slots[0] != 0)
+ goto invalid;
+ for (i = 0; i < num_extents; i++) {
+ path->slots[0]++;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY ||
+ key.objectid != extents[i].start ||
+ key.offset != extents[i].length)
+ goto invalid;
+ }
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
+invalid:
+ test_msg("Free space tree is invalid\n");
+ ret = -EINVAL;
+ goto out;
+}
+
+static int check_free_space_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path,
+ struct free_space_extent *extents,
+ unsigned int num_extents)
+{
+ struct btrfs_free_space_info *info;
+ u32 flags;
+ int ret;
+
+ info = search_free_space_info(trans, fs_info, cache, path, 0);
+ if (IS_ERR(info)) {
+ test_msg("Could not find free space info\n");
+ btrfs_release_path(path);
+ return PTR_ERR(info);
+ }
+ flags = btrfs_free_space_flags(path->nodes[0], info);
+ btrfs_release_path(path);
+
+ ret = __check_free_space_extents(trans, fs_info, cache, path, extents,
+ num_extents);
+ if (ret)
+ return ret;
+
+ /* Flip it to the other format and check that for good measure. */
+ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ ret = convert_free_space_to_extents(trans, fs_info, cache, path);
+ if (ret) {
+ test_msg("Could not convert to extents\n");
+ return ret;
+ }
+ } else {
+ ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+ if (ret) {
+ test_msg("Could not convert to bitmaps\n");
+ return ret;
+ }
+ }
+ return __check_free_space_extents(trans, fs_info, cache, path, extents,
+ num_extents);
+}
+
+static int test_empty_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, cache->key.offset},
+ };
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_all(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {};
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_beginning(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid + BITMAP_RANGE,
+ cache->key.offset - BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+
+}
+
+static int test_remove_end(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, cache->key.offset - BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid +
+ cache->key.offset - BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_remove_middle(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, BITMAP_RANGE},
+ {cache->key.objectid + 2 * BITMAP_RANGE,
+ cache->key.offset - 2 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_left(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, 2 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_right(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 2 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_both(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, 3 * BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 2 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+static int test_merge_none(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *cache,
+ struct btrfs_path *path)
+{
+ struct free_space_extent extents[] = {
+ {cache->key.objectid, BITMAP_RANGE},
+ {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE},
+ {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE},
+ };
+ int ret;
+
+ ret = __remove_from_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid,
+ cache->key.offset);
+ if (ret) {
+ test_msg("Could not remove free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid, BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 4 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ ret = __add_to_free_space_tree(trans, fs_info, cache, path,
+ cache->key.objectid + 2 * BITMAP_RANGE,
+ BITMAP_RANGE);
+ if (ret) {
+ test_msg("Could not add free space\n");
+ return ret;
+ }
+
+ return check_free_space_extents(trans, fs_info, cache, path,
+ extents, ARRAY_SIZE(extents));
+}
+
+typedef int (*test_func_t)(struct btrfs_trans_handle *,
+ struct btrfs_fs_info *,
+ struct btrfs_block_group_cache *,
+ struct btrfs_path *);
+
+static int run_test(test_func_t test_func, int bitmaps)
+{
+ struct btrfs_root *root = NULL;
+ struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_trans_handle trans;
+ struct btrfs_path *path = NULL;
+ int ret;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate dummy root\n");
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
+ root->fs_info->free_space_root = root;
+ root->fs_info->tree_root = root;
+
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 8192;
+
+ cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+ if (!cache) {
+ test_msg("Couldn't allocate dummy block group cache\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ cache->bitmap_low_thresh = 0;
+ cache->bitmap_high_thresh = (u32)-1;
+ cache->needs_free_space = 1;
+
+ btrfs_init_dummy_trans(&trans);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ ret = add_block_group_free_space(&trans, root->fs_info, cache);
+ if (ret) {
+ test_msg("Could not add block group free space\n");
+ goto out;
+ }
+
+ if (bitmaps) {
+ ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
+ cache, path);
+ if (ret) {
+ test_msg("Could not convert block group to bitmaps\n");
+ goto out;
+ }
+ }
+
+ ret = test_func(&trans, root->fs_info, cache, path);
+ if (ret)
+ goto out;
+
+ ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+ if (ret) {
+ test_msg("Could not remove block group free space\n");
+ goto out;
+ }
+
+ if (btrfs_header_nritems(root->node) != 0) {
+ test_msg("Free space tree has leftover items\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ btrfs_free_dummy_block_group(cache);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+static int run_test_both_formats(test_func_t test_func)
+{
+ int ret;
+
+ ret = run_test(test_func, 0);
+ if (ret)
+ return ret;
+ return run_test(test_func, 1);
+}
+
+int btrfs_test_free_space_tree(void)
+{
+ test_func_t tests[] = {
+ test_empty_block_group,
+ test_remove_all,
+ test_remove_beginning,
+ test_remove_end,
+ test_remove_middle,
+ test_merge_left,
+ test_merge_right,
+ test_merge_both,
+ test_merge_none,
+ };
+ int i;
+
+ test_msg("Running free space tree tests\n");
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ int ret = run_test_both_formats(tests[i]);
+ if (ret) {
+ test_msg("%pf failed\n", tests[i]);
+ return ret;
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 054fc0d..5de55fd 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
static void setup_file_extents(struct btrfs_root *root)
{
int slot = 0;
- u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 disk_bytenr = SZ_1M;
u64 offset = 0;
/* First we want a hole */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 846d277..8ea5d34 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -23,14 +23,6 @@
#include "../qgroup.h"
#include "../backref.h"
-static void init_dummy_trans(struct btrfs_trans_handle *trans)
-{
- memset(trans, 0, sizeof(*trans));
- trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
- trans->type = __TRANS_DUMMY;
-}
-
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
{
@@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
ins.objectid = bytenr;
ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
struct btrfs_path *path;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
ret = btrfs_create_qgroup(NULL, fs_info, 5);
@@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup multiple refs test\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f5021fc..b6031ce 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
list_del_init(&em->list);
free_extent_map(em);
}
+ /*
+ * If any block groups are found in ->deleted_bgs then it's
+ * because the transaction was aborted and a commit did not
+ * happen (things failed before writing the new superblock
+ * and calling btrfs_finish_extent_commit()), so we can not
+ * discard the physical locations of the block groups.
+ */
+ while (!list_empty(&transaction->deleted_bgs)) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = list_first_entry(&transaction->deleted_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&cache->bg_list);
+ btrfs_put_block_group_trimming(cache);
+ btrfs_put_block_group(cache);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -82,6 +99,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once clear_btree_io_tree is
+ * called.
+ */
+ smp_mb();
while (!RB_EMPTY_ROOT(&tree->state)) {
struct rb_node *node;
struct extent_state *state;
@@ -117,6 +140,18 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
}
+
+ /* We can free old roots now. */
+ spin_lock(&trans->dropped_roots_lock);
+ while (!list_empty(&trans->dropped_roots)) {
+ root = list_first_entry(&trans->dropped_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+ spin_unlock(&trans->dropped_roots_lock);
+ btrfs_drop_and_free_fs_root(fs_info, root);
+ spin_lock(&trans->dropped_roots_lock);
+ }
+ spin_unlock(&trans->dropped_roots_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -214,25 +249,22 @@ loop:
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
+ init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->have_free_bgs = 0;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
- cur_trans->dirty_bg_run = 0;
+
+ memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.pending_csums = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -252,12 +284,14 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
- INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
+ INIT_LIST_HEAD(&cur_trans->dropped_roots);
mutex_init(&cur_trans->cache_write_mutex);
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
+ INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+ spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -334,6 +368,24 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
}
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ /* Add ourselves to the transaction dropped list */
+ spin_lock(&cur_trans->dropped_roots_lock);
+ list_add_tail(&root->root_list, &cur_trans->dropped_roots);
+ spin_unlock(&cur_trans->dropped_roots_lock);
+
+ /* Make sure we don't try to update the root at commit time */
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ radix_tree_tag_clear(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+}
+
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -413,8 +465,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
- enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+ unsigned int type, enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
@@ -444,13 +496,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
* the appropriate flushing if need be.
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
- if (root->fs_info->quota_enabled &&
- is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->nodesize;
- ret = btrfs_qgroup_reserve(root, qgroup_reserved);
- if (ret)
- return ERR_PTR(ret);
- }
+ qgroup_reserved = num_items * root->nodesize;
+ ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
@@ -468,7 +517,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
goto reserve_fail;
}
again:
- h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h) {
ret = -ENOMEM;
goto alloc_fail;
@@ -509,25 +558,13 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->blocks_used = 0;
- h->bytes_reserved = 0;
- h->chunk_bytes_reserved = 0;
h->root = root;
- h->delayed_ref_updates = 0;
h->use_count = 1;
- h->adding_csums = 0;
- h->block_rsv = NULL;
- h->orig_rsv = NULL;
- h->aborted = 0;
- h->qgroup_reserved = 0;
- h->delayed_ref_elem.seq = 0;
+
h->type = type;
- h->allocating_chunk = false;
- h->reloc_reserved = false;
- h->sync = false;
+ h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
- INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -544,7 +581,6 @@ again:
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
- h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
@@ -562,20 +598,52 @@ alloc_fail:
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
+ btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items)
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL);
}
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor)
+{
+ struct btrfs_trans_handle *trans;
+ u64 num_bytes;
+ int ret;
+
+ trans = btrfs_start_transaction(root, num_items);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return trans;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ ret = btrfs_cond_migrate_bytes(root->fs_info,
+ &root->fs_info->trans_block_rsv,
+ num_bytes,
+ min_factor);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ERR_PTR(ret);
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ trans->bytes_reserved = num_bytes;
+
+ return trans;
+}
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items)
+ struct btrfs_root *root,
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_LIMIT);
@@ -583,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN, 0);
+ return start_transaction(root, 0, TRANS_JOIN,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_USERSPACE, 0);
+ return start_transaction(root, 0, TRANS_USERSPACE,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -611,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
*/
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_ATTACH, 0);
+ return start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -626,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ trans = start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
btrfs_wait_for_commit(root, 0);
@@ -759,12 +832,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- if (!list_empty(&trans->ordered)) {
- spin_lock(&info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
- spin_unlock(&info->trans_lock);
- }
-
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -780,15 +847,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- if (trans->qgroup_reserved) {
- /*
- * the same root has to be passed here between start_transaction
- * and end_transaction. Subvolume quota depends on this.
- */
- btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -821,6 +879,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
@@ -1203,6 +1264,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
+ btrfs_qgroup_free_meta_all(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1279,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 root_flags;
uuid_le new_uuid;
- path = btrfs_alloc_path();
- if (!path) {
- pending->error = -ENOMEM;
- return 0;
- }
+ ASSERT(pending->path);
+ path = pending->path;
- new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
- if (!new_root_item) {
- pending->error = -ENOMEM;
- goto root_item_alloc_fail;
- }
+ ASSERT(pending->root_item);
+ new_root_item = pending->root_item;
pending->error = btrfs_find_free_objectid(tree_root, &objectid);
if (pending->error)
@@ -1301,7 +1357,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
btrfs_set_skip_qgroup(trans, objectid);
- btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+ btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
pending->error = btrfs_block_rsv_add(root,
@@ -1522,8 +1578,10 @@ clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
-root_item_alloc_fail:
+ pending->root_item = NULL;
btrfs_free_path(path);
+ pending->path = NULL;
+
return ret;
}
@@ -1638,9 +1696,7 @@ static void do_async_commit(struct work_struct *work)
* Tell lockdep about it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- rwsem_acquire_read(
- &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
current->journal_info = ac->newtrans;
@@ -1679,9 +1735,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
* async commit thread will be the one to unlock it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- rwsem_release(
- &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
@@ -1764,25 +1818,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
}
static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &ordered->flags));
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1811,10 +1850,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
cur_trans = trans->transaction;
@@ -1834,7 +1869,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- if (!cur_trans->dirty_bg_run) {
+ if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
/* this mutex is also taken before trying to set
@@ -1843,18 +1878,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* after a extents from that block group have been
* allocated for cache files. btrfs_set_block_group_ro
* will wait for the transaction to commit if it
- * finds dirty_bg_run = 1
+ * finds BTRFS_TRANS_DIRTY_BG_RUN set.
*
- * The dirty_bg_run flag is also used to make sure only
- * one process starts all the block group IO. It wouldn't
+ * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+ * only one process starts all the block group IO. It wouldn't
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (!cur_trans->dirty_bg_run) {
+ if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+ &cur_trans->flags))
run_it = 1;
- cur_trans->dirty_bg_run = 1;
- }
mutex_unlock(&root->fs_info->ro_block_group_mutex);
if (run_it)
@@ -1866,7 +1900,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1893,8 +1926,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
spin_unlock(&root->fs_info->trans_lock);
wait_for_commit(root, prev_trans);
+ ret = prev_trans->aborted;
btrfs_put_transaction(prev_trans);
+ if (ret)
+ goto cleanup_transaction;
} else {
spin_unlock(&root->fs_info->trans_lock);
}
@@ -1922,7 +1958,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
- btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans);
btrfs_scrub_pause(root);
/*
@@ -2102,7 +2138,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto scrub_continue;
@@ -2122,7 +2158,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- if (cur_trans->have_free_bgs)
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(root->fs_info);
root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2164,10 +2200,6 @@ cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eb09c20..72be51f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -32,6 +32,10 @@ enum btrfs_trans_state {
TRANS_STATE_MAX = 6,
};
+#define BTRFS_TRANS_HAVE_FREE_BGS 0
+#define BTRFS_TRANS_DIRTY_BG_RUN 1
+#define BTRFS_TRANS_CACHE_ENOSPC 2
+
struct btrfs_transaction {
u64 transid;
/*
@@ -46,11 +50,9 @@ struct btrfs_transaction {
*/
atomic_t num_writers;
atomic_t use_count;
+ atomic_t pending_ordered;
- /*
- * true if there is free bgs operations in this transaction
- */
- int have_free_bgs;
+ unsigned long flags;
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
@@ -59,12 +61,13 @@ struct btrfs_transaction {
unsigned long start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
+ wait_queue_head_t pending_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
- struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
struct list_head io_bgs;
+ struct list_head dropped_roots;
u64 num_dirty_bgs;
/*
@@ -74,9 +77,11 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
+ struct list_head deleted_bgs;
+ spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
- int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -103,7 +108,6 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
- u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
@@ -114,6 +118,7 @@ struct btrfs_trans_handle {
short aborted;
short adding_csums;
bool allocating_chunk;
+ bool can_flush_pending_bgs;
bool reloc_reserved;
bool sync;
unsigned int type;
@@ -124,7 +129,6 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
- struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -133,8 +137,10 @@ struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
struct btrfs_root *root;
+ struct btrfs_root_item *root_item;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
+ struct btrfs_path *path;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
@@ -180,9 +186,14 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items);
+ unsigned int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items);
+ struct btrfs_root *root,
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
@@ -214,5 +225,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
-
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a4b9c8b..cb65089 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
+ * a deadlock (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0;
goto out;
}
- path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- min_trans);
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
&last_ret,
@@ -110,13 +119,24 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
WARN_ON(ret == -EAGAIN);
goto out;
}
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ min_trans);
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
}
out:
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
if (ret == -EAGAIN) {
if (root->defrag_max.objectid > root->defrag_progress.objectid)
goto done;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c45431..323e12c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- int index;
- int ret;
+ int ret = 0;
mutex_lock(&root->log_mutex);
+
if (root->log_root) {
if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
goto out;
}
+
if (!root->log_start_pid) {
- root->log_start_pid = current->pid;
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
} else if (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
+ } else {
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree)
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
- atomic_inc(&root->log_batch);
- atomic_inc(&root->log_writers);
- if (ctx) {
- index = root->log_transid % 2;
- list_add_tail(&ctx->list, &root->log_ctxs[index]);
- ctx->log_transid = root->log_transid;
- }
- mutex_unlock(&root->log_mutex);
- return 0;
- }
-
- ret = 0;
- mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, root->fs_info);
- mutex_unlock(&root->fs_info->tree_log_mutex);
- if (ret)
- goto out;
-
- if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
+
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
}
- clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
- root->log_start_pid = current->pid;
+
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
if (ctx) {
- index = root->log_transid % 2;
+ int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
}
+
out:
mutex_unlock(&root->log_mutex);
return ret;
@@ -238,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
@@ -700,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset, 0);
+ key->objectid, offset);
if (ret)
goto out;
} else {
@@ -731,12 +724,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
&ordered_sums, 0);
if (ret)
goto out;
+ /*
+ * Now delete all existing cums in the csum root that
+ * cover our range. We do this because we can have an
+ * extent that is completely referenced by one file
+ * extent item and partially referenced by another
+ * file extent item (like after using the clone or
+ * extent_same ioctls). In this case if we end up doing
+ * the replay of the one that partially references the
+ * extent first, and we do not do the csum deletion
+ * below, we can get 2 csum items in the csum tree that
+ * overlap each other. For example, imagine our log has
+ * the two following file extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent
+ * that starts at disk byte 12845056, and the log tree
+ * has a single csum item that covers the entire range
+ * of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the
+ * csum tree gets the following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K
+ * of our extent. Now when we replay the second file
+ * extent item, if we do not delete existing csum items
+ * that cover any of its blocks, we end up getting two
+ * csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying
+ * to lookup up for the checksum of any block of our
+ * extent starting at an offset of 40K or higher, will
+ * end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting
+ * at offset 40K or higher of our extent.
+ */
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
+ ret = btrfs_del_csums(trans,
+ root->fs_info->csum_root,
+ sums->bytenr,
+ sums->len);
+ if (!ret)
ret = btrfs_csum_file_blocks(trans,
root->fs_info->csum_root,
sums);
@@ -1549,9 +1596,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
*/
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
u64 dirid, u64 index,
- char *name, int name_len, u8 type,
+ char *name, int name_len,
struct btrfs_key *location)
{
struct inode *inode;
@@ -1613,6 +1659,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
* not exist in the FS, it is skipped. fsyncs on directories
* do not force down inodes inside that directory, just changes to the
* names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
*/
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1631,6 +1680,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
int exists;
int ret = 0;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ bool name_added = false;
dir = read_one_inode(root, key->objectid);
if (!dir)
@@ -1708,6 +1758,8 @@ out:
}
kfree(name);
iput(dir);
+ if (!ret && name_added)
+ ret = 1;
return ret;
insert:
@@ -1719,10 +1771,12 @@ insert:
goto out;
}
btrfs_release_path(path);
- ret = insert_one_name(trans, root, path, key->objectid, key->offset,
- name, name_len, log_type, &log_key);
+ ret = insert_one_name(trans, root, key->objectid, key->offset,
+ name, name_len, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
+ if (!ret)
+ name_added = true;
update_size = false;
ret = 0;
goto out;
@@ -1740,12 +1794,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- int ret;
+ int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
int name_len;
unsigned long ptr;
unsigned long ptr_end;
+ struct btrfs_path *fixup_path = NULL;
ptr = btrfs_item_ptr_offset(eb, slot);
ptr_end = ptr + item_size;
@@ -1755,12 +1810,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret)
- return ret;
+ if (ret < 0)
+ break;
ptr = (unsigned long)(di + 1);
ptr += name_len;
+
+ /*
+ * If this entry refers to a non-directory (directories can not
+ * have a link count > 1) and it was added in the transaction
+ * that was not committed, make sure we fixup the link count of
+ * the inode it the entry points to. Otherwise something like
+ * the following would result in a directory pointing to an
+ * inode with a wrong link that does not account for this dir
+ * entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two
+ * entries pointing to it in the directory testdir. This would
+ * make it impossible to ever delete the parent directory has
+ * it would result in stale dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_key di_key;
+
+ if (!fixup_path) {
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path,
+ di_key.objectid);
+ if (ret)
+ break;
+ }
+ ret = 0;
}
- return 0;
+ btrfs_free_path(fixup_path);
+ return ret;
}
/*
@@ -2535,8 +2637,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static void wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2561,8 +2662,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
atomic_read(&root->log_commit[index]));
}
-static void wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
{
DEFINE_WAIT(wait);
@@ -2642,7 +2742,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, log_transid);
+ wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
@@ -2651,7 +2751,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, log_transid - 1);
+ wait_log_commit(root, log_transid - 1);
while (1) {
int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2762,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
- wait_for_writer(trans, root);
+ wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
@@ -2722,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&log_root_tree->log_writer_wait))
wake_up(&log_root_tree->log_writer_wait);
}
@@ -2759,7 +2861,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
mark);
btrfs_wait_logged_extents(trans, log, log_transid);
- wait_log_commit(trans, log_root_tree,
+ wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
@@ -2770,11 +2872,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
- wait_log_commit(trans, log_root_tree,
+ wait_log_commit(log_root_tree,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(trans, log_root_tree);
+ wait_for_writer(log_root_tree);
/*
* now that we've moved on to the tree of log tree roots,
@@ -2852,6 +2954,9 @@ out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
@@ -2863,6 +2968,9 @@ out:
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -4904,6 +5012,94 @@ next_dir_inode:
return ret;
}
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_log_ctx *ctx)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+ unsigned long ptr;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ struct btrfs_key inode_key;
+ struct inode *dir_inode;
+
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ inode_key.objectid = btrfs_inode_extref_parent(
+ leaf, extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ inode_key.objectid = key.offset;
+ cur_offset = item_size;
+ }
+
+ dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+ root, NULL);
+ /* If parent inode was deleted, skip it. */
+ if (IS_ERR(dir_inode))
+ continue;
+
+ ret = btrfs_log_inode(trans, root, dir_inode,
+ LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ iput(dir_inode);
+ if (ret)
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -4923,9 +5119,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
- const struct dentry * const first_parent = parent;
- const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
- last_committed);
bool log_dentries = false;
struct inode *orig_inode = inode;
@@ -4986,6 +5179,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
log_dentries = true;
+ /*
+ * On unlink we must make sure all our current and old parent directores
+ * inodes are fully logged. This is to prevent leaving dangling
+ * directory index entries in directories that were our parents but are
+ * not anymore. Not doing this results in old parent directory being
+ * impossible to delete after log replay (rmdir will always fail with
+ * error -ENOTEMPTY).
+ *
+ * Example 1:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * ln testdir/foo testdir/bar
+ * sync
+ * unlink testdir/bar
+ * xfs_io -c fsync testdir/foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * If we don't log the parent directory (testdir), after log replay the
+ * directory still has an entry pointing to the file inode using the bar
+ * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+ * the file inode has a link count of 1.
+ *
+ * Example 2:
+ *
+ * mkdir testdir
+ * touch foo
+ * ln foo testdir/foo2
+ * ln foo testdir/foo3
+ * sync
+ * unlink testdir/foo3
+ * xfs_io -c fsync foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * Similar as the first example, after log replay the parent directory
+ * testdir still has an entry pointing to the inode file with name foo3
+ * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+ * and has a link count of 2.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
+
while (1) {
if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
@@ -4994,23 +5234,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
- /*
- * On unlink we must make sure our immediate parent directory
- * inode is fully logged. This is to prevent leaving dangling
- * directory index entries and a wrong directory inode's i_size.
- * Not doing so can result in a directory being impossible to
- * delete after log replay (rmdir will always fail with error
- * -ENOTEMPTY).
- */
- if (did_unlink && parent == first_parent)
- inode_only = LOG_INODE_ALL;
- else
- inode_only = LOG_INODE_EXISTS;
-
- if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed ||
- inode_only == LOG_INODE_ALL) {
- ret = btrfs_log_inode(trans, root, inode, inode_only,
+ if (BTRFS_I(inode)->generation > last_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS,
0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
@@ -5098,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5112,7 +5338,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5130,7 +5356,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5145,7 +5371,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_error(fs_info, ret, "Couldn't read target root "
+ btrfs_std_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fbe7c10..c32abbc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,82 @@
#include "dev-replace.h"
#include "sysfs.h"
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
+};
+
+const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
+ [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
+ [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
+ [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
+ [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
+};
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
@@ -49,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static void btrfs_close_one_device(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -156,8 +233,8 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
return dev;
}
@@ -198,7 +275,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
- printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
@@ -211,8 +287,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
- if (!*bh) {
- ret = -EINVAL;
+ if (IS_ERR(*bh)) {
+ ret = PTR_ERR(*bh);
blkdev_put(*bdev, flags);
goto error;
}
@@ -345,6 +421,9 @@ loop_lock:
pending = pending->bi_next;
cur->bi_next = NULL;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -765,36 +844,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
-
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- list_del_init(&device->dev_alloc_list);
- fs_devices->rw_devices--;
- }
-
- if (device->missing)
- fs_devices->missing_devices--;
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
- }
-
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
-
- call_rcu(&device->rcu, free_device);
+ btrfs_close_one_device(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -1053,7 +1103,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = device->devid;
key.offset = start;
@@ -1116,15 +1166,18 @@ out:
return ret;
}
-static int contains_pending_extent(struct btrfs_trans_handle *trans,
+static int contains_pending_extent(struct btrfs_transaction *transaction,
struct btrfs_device *device,
u64 *start, u64 len)
{
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
struct extent_map *em;
- struct list_head *search_list = &trans->transaction->pending_chunks;
+ struct list_head *search_list = &fs_info->pinned_chunks;
int ret = 0;
u64 physical_start = *start;
+ if (transaction)
+ search_list = &transaction->pending_chunks;
again:
list_for_each_entry(em, search_list, list) {
struct map_lookup *map;
@@ -1159,8 +1212,8 @@ again:
}
}
}
- if (search_list == &trans->transaction->pending_chunks) {
- search_list = &trans->root->fs_info->pinned_chunks;
+ if (search_list != &fs_info->pinned_chunks) {
+ search_list = &fs_info->pinned_chunks;
goto again;
}
@@ -1169,12 +1222,13 @@ again:
/*
- * find_free_dev_extent - find free space in the specified device
- * @device: the device which we search the free space in
- * @num_bytes: the size of the free space that we need
- * @start: store the start of the free space.
- * @len: the size of the free space. that we find, or the size of the max
- * free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start: store the start of the free space.
+ * @len: the size of the free space. that we find, or the size
+ * of the max free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
@@ -1188,9 +1242,9 @@ again:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -1200,18 +1254,19 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
- u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
+ u64 min_search_start;
- /* FIXME use last free of some kind */
-
- /* we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
+ /*
+ * We don't want to overwrite the superblock on the drive nor any area
+ * used by the boot loader (grub for example), so we make sure to start
+ * at an offset of at least 1MB.
*/
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ search_start = max(search_start, min_search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1226,7 +1281,7 @@ again:
goto out;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -1273,7 +1328,7 @@ again:
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
- if (contains_pending_extent(trans, device,
+ if (contains_pending_extent(transaction, device,
&search_start,
hole_size)) {
if (key.offset >= search_start) {
@@ -1322,7 +1377,7 @@ next:
if (search_end > search_start) {
hole_size = search_end - search_start;
- if (contains_pending_extent(trans, device, &search_start,
+ if (contains_pending_extent(transaction, device, &search_start,
hole_size)) {
btrfs_release_path(path);
goto again;
@@ -1348,6 +1403,15 @@ out:
return ret;
}
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
+{
+ /* FIXME use last free of some kind */
+ return find_free_dev_extent_start(trans->transaction, device,
+ num_bytes, 0, start, len);
+}
+
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
@@ -1388,7 +1452,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_error(root->fs_info, ret, "Slot search failed");
+ btrfs_std_error(root->fs_info, ret, "Slot search failed");
goto out;
}
@@ -1396,10 +1460,10 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
- trans->transaction->have_free_bgs = 1;
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
}
out:
btrfs_free_path(path);
@@ -1579,7 +1643,6 @@ static void update_dev_time(char *path_name)
return;
file_update_time(filp);
filp_close(filp, NULL);
- return;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
@@ -1787,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1910,7 +1973,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->writeable) {
fs_devices->rw_devices--;
/* zero out the old super if it is writable */
- btrfs_scratch_superblock(srcdev);
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
if (srcdev->bdev)
@@ -1957,10 +2020,10 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
if (tgtdev->bdev) {
- btrfs_scratch_superblock(tgtdev);
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
@@ -2027,10 +2090,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
}
- if (!*device) {
- btrfs_err(root->fs_info, "no missing device found");
- return -ENOENT;
- }
+ if (!*device)
+ return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
return 0;
} else {
@@ -2295,7 +2356,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2336,9 +2397,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
fsid_buf))
- pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
+ btrfs_warn(root->fs_info,
+ "sysfs: failed to create fsid for sprout");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2354,7 +2416,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
@@ -2374,7 +2436,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2599,7 +2661,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_error(root->fs_info, -ENOENT,
+ btrfs_std_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
@@ -2607,7 +2669,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
@@ -2755,9 +2817,7 @@ out:
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_objectid,
- u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
{
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
@@ -2785,14 +2845,17 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
+ btrfs_scrub_pause(root);
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ btrfs_scrub_continue(root);
if (ret)
return ret;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_trans_remove_block_group(root->fs_info,
+ chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
@@ -2855,7 +2918,6 @@ again:
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root,
- found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
failed++;
@@ -2996,16 +3058,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
@@ -3057,17 +3122,50 @@ static int chunk_profiles_filter(u64 chunk_type,
return 1;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
+ u64 chunk_used;
+ u64 user_thresh_min;
+ u64 user_thresh_max;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ if (bargs->usage_min == 0)
+ user_thresh_min = 0;
+ else
+ user_thresh_min = div_factor_fine(cache->key.offset,
+ bargs->usage_min);
+
+ if (bargs->usage_max == 0)
+ user_thresh_max = 1;
+ else if (bargs->usage_max > 100)
+ user_thresh_max = cache->key.offset;
+ else
+ user_thresh_max = div_factor_fine(cache->key.offset,
+ bargs->usage_max);
+
+ if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- if (bargs->usage == 0)
+ if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
@@ -3157,6 +3255,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
return 1;
}
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ if (bargs->stripes_min <= num_stripes
+ && num_stripes <= bargs->stripes_max)
+ return 0;
+
+ return 1;
+}
+
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
@@ -3203,6 +3314,9 @@ static int should_balance_chunk(struct btrfs_root *root,
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
return 0;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
}
/* devid filter */
@@ -3223,6 +3337,12 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /* stripes filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+ chunk_stripes_range_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
@@ -3237,6 +3357,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
else
bargs->limit--;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+ /*
+ * Same logic as the 'limit' filter; the minimum cannot be
+ * determined here because we do not have the global informatoin
+ * about the count of all chunks that satisfy the filters.
+ */
+ if (bargs->limit_max == 0)
+ return 0;
+ else
+ bargs->limit_max--;
}
return 1;
@@ -3251,6 +3381,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
@@ -3261,16 +3392,21 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ /* The single value limit and min/max limits use the same bytes in the */
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
+ u32 count_data = 0;
+ u32 count_meta = 0;
+ u32 count_sys = 0;
+ int chunk_reserved = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
- size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ size_to_free = min_t(u64, size_to_free, SZ_1M);
if (!device->writeable ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
@@ -3304,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
+ /*
+ * The single value limit and min/max limits use the same bytes
+ * in the
+ */
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
@@ -3351,6 +3491,7 @@ again:
}
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
@@ -3360,6 +3501,7 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
+
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3371,11 +3513,50 @@ again:
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ count_data++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ count_sys++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ count_meta++;
+
+ goto loop;
+ }
+
+ /*
+ * Apply limit_min filter, no need to check if the LIMITS
+ * filter is used, limit_min is 0 by default
+ */
+ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ count_data < bctl->data.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+ count_meta < bctl->meta.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ count_sys < bctl->sys.limit_min)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
}
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ trans = btrfs_start_transaction(chunk_root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ ret = btrfs_force_chunk_alloc(trans, chunk_root,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans, chunk_root);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto error;
+ }
+ chunk_reserved = 1;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
- found_key.objectid,
found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
@@ -3449,11 +3630,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret);
+ btrfs_std_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
+/* Non-zero return value signifies invalidity */
+static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
+ u64 allowed)
+{
+ return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl_arg->target, 1) ||
+ (bctl_arg->target & ~allowed)));
+}
+
/*
* Should be called with both balance and volume mutexes held
*/
@@ -3511,27 +3701,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (num_devices > 3)
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
- if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->data.target, 1) ||
- (bctl->data.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->data, allowed)) {
btrfs_err(fs_info, "unable to start balance with target "
"data profile %llu",
bctl->data.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->meta.target, 1) ||
- (bctl->meta.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->meta, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target metadata profile %llu",
bctl->meta.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->sys.target, 1) ||
- (bctl->sys.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->sys, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target system profile %llu",
bctl->sys.target);
@@ -3539,14 +3723,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
goto out;
}
- /* allow dup'ed data chunks only in mixed mode */
- if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
- btrfs_err(fs_info, "dup for data is not allowed");
- ret = -EINVAL;
- goto out;
- }
-
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
@@ -3572,24 +3748,18 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- int num_tolerated_disk_barrier_failures;
- u64 target = bctl->sys.target;
-
- num_tolerated_disk_barrier_failures =
- btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
- if (num_tolerated_disk_barrier_failures > 0 &&
- (target &
- (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
- num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1 &&
- (target &
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
- num_tolerated_disk_barrier_failures = 1;
+ if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ btrfs_warn(fs_info,
+ "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx",
+ bctl->meta.target, bctl->data.target);
+ }
- fs_info->num_tolerated_disk_barrier_failures =
- num_tolerated_disk_barrier_failures;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ fs_info->num_tolerated_disk_barrier_failures = min(
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ bctl->sys.target));
}
ret = insert_balance_item(fs_info->tree_root, bctl);
@@ -4077,7 +4247,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
- u64 chunk_objectid;
u64 chunk_offset;
int ret;
int slot;
@@ -4098,7 +4267,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
lock_chunks(root);
@@ -4154,11 +4323,10 @@ again:
break;
}
- chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
- ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+ ret = btrfs_relocate_chunk(root, chunk_offset);
mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
@@ -4200,7 +4368,8 @@ again:
u64 start = new_size;
u64 len = old_size - new_size;
- if (contains_pending_extent(trans, device, &start, len)) {
+ if (contains_pending_extent(trans->transaction, device,
+ &start, len)) {
unlock_chunks(root);
checked_pending_chunks = true;
failed = 0;
@@ -4287,69 +4456,10 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = {
- .sub_stripes = 2,
- .dev_stripes = 1,
- .devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID1] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 2,
- .devs_min = 2,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_DUP] = {
- .sub_stripes = 1,
- .dev_stripes = 2,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID0] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_SINGLE] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_RAID5] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID6] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 3,
- .devs_increment = 1,
- .ncopies = 3,
- },
-};
-
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
- return 64 * 1024;
+ return SZ_64K;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
@@ -4417,21 +4527,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = 1024 * 1024 * 1024;
+ max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
- max_stripe_size = 1024 * 1024 * 1024;
+ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ max_stripe_size = SZ_1G;
else
- max_stripe_size = 256 * 1024 * 1024;
+ max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = 32 * 1024 * 1024;
+ max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
@@ -4682,7 +4792,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 dev_offset;
u64 stripe_size;
int i = 0;
- int ret;
+ int ret = 0;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
@@ -4713,20 +4823,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()).
+ */
+ mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out;
+ break;
ret = btrfs_alloc_dev_extent(trans, device,
chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
chunk_offset, dev_offset,
stripe_size);
if (ret)
- goto out;
+ break;
+ }
+ if (ret) {
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ goto out;
}
stripe = &chunk->stripe;
@@ -4739,6 +4861,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -5071,9 +5194,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
* and the stripes
*/
sizeof(u64) * (total_stripes),
- GFP_NOFS);
- if (!bbio)
- return NULL;
+ GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
atomic_set(&bbio->refs, 1);
@@ -5741,23 +5862,23 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
- bio_endio(bio, err);
+ bio_endio(bio);
btrfs_put_bbio(bbio);
}
-static void btrfs_end_bio(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio)
{
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
- if (err) {
+ if (bio->bi_error) {
atomic_inc(&bbio->error);
- if (err == -EIO || err == -EREMOTEIO) {
+ if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
@@ -5795,17 +5916,16 @@ static void btrfs_end_bio(struct bio *bio, int err)
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
- err = -EIO;
+ bio->bi_error = -EIO;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
- set_bit(BIO_UPTODATE, &bio->bi_flags);
- err = 0;
+ bio->bi_error = 0;
}
- btrfs_end_bbio(bbio, bio, err);
+ btrfs_end_bbio(bbio, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
@@ -5826,7 +5946,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
struct btrfs_pending_bios *pending_bios;
if (device->missing || !device->bdev) {
- bio_endio(bio, -EIO);
+ bio_io_error(bio);
return;
}
@@ -5871,34 +5991,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
&device->work);
}
-static int bio_size_ok(struct block_device *bdev, struct bio *bio,
- sector_t sector)
-{
- struct bio_vec *prev;
- struct request_queue *q = bdev_get_queue(bdev);
- unsigned int max_sectors = queue_max_sectors(q);
- struct bvec_merge_data bvm = {
- .bi_bdev = bdev,
- .bi_sector = sector,
- .bi_rw = bio->bi_rw,
- };
-
- if (WARN_ON(bio->bi_vcnt == 0))
- return 1;
-
- prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if (bio_sectors(bio) > max_sectors)
- return 0;
-
- if (!q->merge_bvec_fn)
- return 1;
-
- bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
- return 0;
- return 1;
-}
-
static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct bio *bio, u64 physical, int dev_nr,
int rw, int async)
@@ -5932,38 +6024,6 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
btrfsic_submit_bio(rw, bio);
}
-static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
- struct bio *first_bio, struct btrfs_device *dev,
- int dev_nr, int rw, int async)
-{
- struct bio_vec *bvec = first_bio->bi_io_vec;
- struct bio *bio;
- int nr_vecs = bio_get_nr_vecs(dev->bdev);
- u64 physical = bbio->stripes[dev_nr].physical;
-
-again:
- bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
- if (!bio)
- return -ENOMEM;
-
- while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
- if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len) {
- u64 len = bio->bi_iter.bi_size;
-
- atomic_inc(&bbio->stripes_pending);
- submit_stripe_bio(root, bbio, bio, physical, dev_nr,
- rw, async);
- physical += len;
- goto again;
- }
- bvec++;
- }
-
- submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
- return 0;
-}
-
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
@@ -5973,8 +6033,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
-
- btrfs_end_bbio(bbio, bio, -EIO);
+ bio->bi_error = -EIO;
+ btrfs_end_bbio(bbio, bio);
}
}
@@ -6036,18 +6096,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
continue;
}
- /*
- * Check and see if we're ok with this bio based on it's size
- * and offset with the given device.
- */
- if (!bio_size_ok(dev->bdev, first_bio,
- bbio->stripes[dev_nr].physical >> 9)) {
- ret = breakup_stripe_bio(root, bbio, first_bio, dev,
- dev_nr, rw, async_submit);
- BUG_ON(ret);
- continue;
- }
-
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
@@ -6429,11 +6477,11 @@ int btrfs_read_sys_array(struct btrfs_root *root)
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
- btrfs_set_buffer_uptodate(sb);
+ set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artifical and just used to read the system array.
- * btrfs_set_buffer_uptodate() call does not properly mark all it's
+ * set_extent_buffer_uptodate() call does not properly mark all it's
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
@@ -6476,6 +6524,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ printk(KERN_ERR
+ "BTRFS: invalid number of stripes %u in sys_array at offset %u\n",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6484,6 +6540,9 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (ret)
break;
} else {
+ printk(KERN_ERR
+ "BTRFS: unexpected item type %u in sys_array at offset %u\n",
+ (u32)key.type, cur_offset);
ret = -EIO;
break;
}
@@ -6671,8 +6730,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "error %d while searching for dev_stats item for device %s!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
}
@@ -6682,8 +6741,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "delete too small dev_stats item for device %s failed %d!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
@@ -6696,9 +6755,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "insert dev_stats item for device %s failed %d!\n",
- rcu_str_deref(device->name), ret);
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "insert dev_stats item for device %s failed %d",
+ rcu_str_deref(device->name), ret);
goto out;
}
}
@@ -6752,8 +6811,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6772,8 +6831,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- printk_in_rcu(KERN_INFO "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_info_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6817,22 +6876,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
return 0;
}
-int btrfs_scratch_superblock(struct btrfs_device *device)
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
{
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
+ int copy_num;
- bh = btrfs_read_dev_super(device->bdev);
- if (!bh)
- return -EINVAL;
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ if (!bdev)
+ return;
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
+ copy_num++) {
- return 0;
+ if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
}
/*
@@ -6900,3 +6971,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
+
+static void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 95842a9..1939ebd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,7 +26,7 @@
extern struct mutex uuid_mutex;
-#define BTRFS_STRIPE_LEN (64 * 1024)
+#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
struct btrfs_pending_bios {
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
- struct kobject super_kobj;
+ struct kobject fsid_kobj;
struct kobject *device_dir_kobj;
struct completion kobj_unregister;
};
@@ -334,10 +334,15 @@ struct btrfs_raid_attr {
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
+ int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
};
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
+
struct map_lookup {
u64 type;
int io_align;
@@ -375,6 +380,20 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
+
+#define BTRFS_BALANCE_ARGS_MASK \
+ (BTRFS_BALANCE_ARGS_PROFILES | \
+ BTRFS_BALANCE_ARGS_USAGE | \
+ BTRFS_BALANCE_ARGS_DEVID | \
+ BTRFS_BALANCE_ARGS_DRANGE | \
+ BTRFS_BALANCE_ARGS_VRANGE | \
+ BTRFS_BALANCE_ARGS_LIMIT | \
+ BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
+ BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
+ BTRFS_BALANCE_ARGS_USAGE_RANGE)
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
@@ -453,6 +472,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *max_avail);
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
@@ -471,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
-int btrfs_scratch_superblock(struct btrfs_device *device);
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6f518c9..fd953c3 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
+ if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+ goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
if (verify_dir_item(root, leaf, di))
@@ -349,137 +351,89 @@ err:
return ret;
}
-/*
- * List of handlers for synthetic system.* attributes. All real ondisk
- * attributes are handled directly.
- */
-const struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL,
-};
-
-/*
- * Check if the attribute is in a supported namespace.
- *
- * This is applied after the check for the synthetic attributes in the system
- * namespace.
- */
-static int btrfs_is_valid_xattr(const char *name)
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- int len = strlen(name);
- int prefixlen = 0;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN))
- prefixlen = XATTR_SECURITY_PREFIX_LEN;
- else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- prefixlen = XATTR_SYSTEM_PREFIX_LEN;
- else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- prefixlen = XATTR_TRUSTED_PREFIX_LEN;
- else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- prefixlen = XATTR_USER_PREFIX_LEN;
- else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- prefixlen = XATTR_BTRFS_PREFIX_LEN;
- else
- return -EOPNOTSUPP;
-
- /*
- * The name cannot consist of just prefix
- */
- if (len <= prefixlen)
- return -EINVAL;
+ struct inode *inode = d_inode(dentry);
- return 0;
+ name = xattr_full_name(handler, name);
+ return __btrfs_getxattr(inode, name, buffer, size);
}
-ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size,
+ int flags)
{
- int ret;
+ struct inode *inode = d_inode(dentry);
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
+ name = xattr_full_name(handler, name);
+ return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+}
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
- return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ name = xattr_full_name(handler, name);
+ return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
}
+static const struct xattr_handler btrfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+ .prefix = XATTR_BTRFS_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set_prop,
+};
+
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+ &btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &btrfs_trusted_xattr_handler,
+ &btrfs_user_xattr_handler,
+ &btrfs_btrfs_xattr_handler,
+ NULL,
+};
+
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- value, size, flags);
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
- flags);
+ return generic_setxattr(dentry, name, value, size, flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- NULL, 0, XATTR_REPLACE);
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
- XATTR_REPLACE);
+ return generic_removexattr(dentry, name);
}
static int btrfs_initxattrs(struct inode *inode,
@@ -492,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_NOFS);
+ strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
break;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5049608..96807b3 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size);
extern int btrfs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
extern int btrfs_removexattr(struct dentry *dentry, const char *name);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1cf7a53..e1632ab 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -134,13 +134,10 @@ __clear_page_buffers(struct page *page)
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
- char b[BDEVNAME_SIZE];
-
if (!test_bit(BH_Quiet, &bh->b_state))
printk_ratelimited(KERN_ERR
- "Buffer I/O error on dev %s, logical block %llu%s\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr, msg);
+ "Buffer I/O error on dev %pg, logical block %llu%s\n",
+ bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}
/*
@@ -237,15 +234,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
if (all_mapped) {
- char b[BDEVNAME_SIZE];
-
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
- printk("device %s blocksize: %d\n", bdevname(bdev, b),
+ printk("device %pg blocksize: %d\n", bdev,
1 << bd_inode->i_blkbits);
}
out_unlock:
@@ -531,10 +526,8 @@ repeat:
static void do_thaw_one(struct super_block *sb, void *unused)
{
- char b[BDEVNAME_SIZE];
while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
- printk(KERN_WARNING "Emergency Thaw on %s\n",
- bdevname(sb->s_bdev, b));
+ printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
static void do_thaw_all(struct work_struct *work)
@@ -999,7 +992,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+ gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
/*
* XXX: __getblk_slow() can not really deal with failure and
@@ -1074,12 +1067,10 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
* pagecache index. (this comparison is done using sector_t types).
*/
if (unlikely(index != block >> sizebits)) {
- char b[BDEVNAME_SIZE];
-
printk(KERN_ERR "%s: requested out-of-range block %llu for "
- "device %s\n",
+ "device %pg\n",
__func__, (unsigned long long)block,
- bdevname(bdev, b));
+ bdev);
return -EIO;
}
@@ -2420,9 +2411,9 @@ EXPORT_SYMBOL(block_commit_write);
* unlock the page.
*
* Direct callers of this function should protect against filesystem freezing
- * using sb_start_write() - sb_end_write() functions.
+ * using sb_start_pagefault() - sb_end_pagefault() functions.
*/
-int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct page *page = vmf->page;
@@ -2459,26 +2450,6 @@ out_unlock:
unlock_page(page);
return ret;
}
-EXPORT_SYMBOL(__block_page_mkwrite);
-
-int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
-{
- int ret;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
- sb_start_pagefault(sb);
-
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
- ret = __block_page_mkwrite(vma, vmf, get_block);
- sb_end_pagefault(sb);
- return block_page_mkwrite_return(ret);
-}
EXPORT_SYMBOL(block_page_mkwrite);
/*
@@ -2957,14 +2928,14 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
}
EXPORT_SYMBOL(generic_block_bmap);
-static void end_bio_bh_io_sync(struct bio *bio, int err)
+static void end_bio_bh_io_sync(struct bio *bio)
{
struct buffer_head *bh = bio->bi_private;
- if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+ if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
- bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
+ bh->b_end_io(bh, !bio->bi_error);
bio_put(bio);
}
@@ -3046,12 +3017,9 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
- bio->bi_io_vec[0].bv_page = bh->b_page;
- bio->bi_io_vec[0].bv_len = bh->b_size;
- bio->bi_io_vec[0].bv_offset = bh_offset(bh);
- bio->bi_vcnt = 1;
- bio->bi_iter.bi_size = bh->b_size;
+ bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ BUG_ON(bio->bi_iter.bi_size != bh->b_size);
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index f601def..452e98d 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -226,15 +226,9 @@ static ssize_t cachefiles_daemon_write(struct file *file,
return -EOPNOTSUPP;
/* drag the command string into the kernel so we can parse it */
- data = kmalloc(datalen + 1, GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(data, _data, datalen) != 0)
- goto error;
-
- data[datalen] = '\0';
+ data = memdup_user_nul(_data, datalen);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
ret = -EINVAL;
if (memchr(data, '\0', datalen))
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index aecd085..9c4b737 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -30,7 +30,7 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
/*
* node records
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index fc1056f..c4b8934 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -655,6 +655,8 @@ lookup_again:
aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
if (!aops->bmap)
goto check_error;
+ if (object->dentry->d_sb->s_blocksize > PAGE_SIZE)
+ goto check_error;
object->backer = object->dentry;
} else {
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3cbb0e8..c0f3da3 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -414,9 +414,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
@@ -711,9 +708,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto all_enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
pagevec_init(&pagevec, 0);
@@ -885,7 +879,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
loff_t pos, eof;
size_t len;
void *data;
- int ret;
+ int ret = -ENOBUFS;
ASSERT(op != NULL);
ASSERT(page != NULL);
@@ -905,6 +899,15 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
+ pos = (loff_t)page->index << PAGE_SHIFT;
+
+ /* We mustn't write more data than we have, so we have to beware of a
+ * partial page at EOF.
+ */
+ eof = object->fscache.store_limit_l;
+ if (pos >= eof)
+ goto error;
+
/* write the page to the backing filesystem and let it store it in its
* own time */
path.mnt = cache->mnt;
@@ -912,40 +915,38 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
- } else {
- pos = (loff_t) page->index << PAGE_SHIFT;
-
- /* we mustn't write more data than we have, so we have
- * to beware of a partial page at EOF */
- eof = object->fscache.store_limit_l;
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- ASSERTCMP(pos, <, eof);
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
- }
-
- data = kmap(page);
- ret = __kernel_write(file, data, len, &pos);
- kunmap(page);
- if (ret != len)
- ret = -EIO;
- fput(file);
+ goto error_2;
}
- if (ret < 0) {
- if (ret == -EIO)
- cachefiles_io_error_obj(
- object, "Write page to backing file failed");
- ret = -ENOBUFS;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
+ }
}
- _leave(" = %d", ret);
- return ret;
+ data = kmap(page);
+ ret = __kernel_write(file, data, len, &pos);
+ kunmap(page);
+ fput(file);
+ if (ret != len)
+ goto error_eio;
+
+ _leave(" = 0");
+ return 0;
+
+error_eio:
+ ret = -EIO;
+error_2:
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Write page to backing file failed");
+error:
+ _leave(" = -ENOBUFS [%d]", ret);
+ return -ENOBUFS;
}
/*
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 8f84646..f197084 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &new_mode);
if (ret < 0)
@@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
ret = acl ? -EINVAL : 0;
goto out;
}
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
ret = -EINVAL;
@@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
if (acl) {
- size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
if (err)
goto out_err;
- ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+ ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
len);
err = posix_acl_to_xattr(&init_user_ns, acl,
tmp_buf, val_size1);
@@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
ceph_pagelist_append(pagelist, tmp_buf, val_size1);
}
if (default_acl) {
- size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
err = ceph_pagelist_encode_string(pagelist,
- POSIX_ACL_XATTR_DEFAULT, len);
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 890c509..b7d218a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0)
+ if (rc < 0 && rc != ENOENT)
goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
@@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn("writepage_start %p on forced umount\n", inode);
+ truncate_pagecache(inode, 0);
+ mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1281,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret1;
struct address_space *mapping = inode->i_mapping;
struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out;
@@ -1426,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
if (i_size_read(inode) == 0)
return;
page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
return;
if (PageUptodate(page)) {
@@ -1593,7 +1596,7 @@ out:
return err;
}
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
.fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
};
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 834f9f3..a4766de 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
- /* use ceph virtual inode (id + snaphot) */
+ /* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ddd5e94..c69e125 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1655,9 +1655,8 @@ retry_locked:
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
- (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1970,46 @@ out:
}
/*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
*/
-static int unsafe_dirop_wait(struct inode *inode)
+static int unsafe_request_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- if (!S_ISDIR(inode->i_mode))
- return 0;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_last_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- ret = !wait_for_completion_timeout(&req->r_safe_completion,
- ceph_timeout_jiffies(req->r_timeout));
+ dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+ inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
if (ret)
- ret = -EIO; /* timed out */
-
- ceph_mdsc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- return ret;
+ err = -EIO;
+ ceph_mdsc_put_request(req1);
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req2);
+ }
+ return err;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- ret = unsafe_dirop_wait(inode);
+ ret = unsafe_request_wait(inode);
/*
* only wait on non-file metadata writeback (the mds
@@ -2413,6 +2409,14 @@ again:
goto out_unlock;
}
+ if (!__ceph_is_any_caps(ci) &&
+ ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ *err = -EIO;
+ ret = 1;
+ goto out_unlock;
+ }
+
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8b79d87..3c68e6a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -34,6 +34,74 @@
* need to wait for MDS acknowledgement.
*/
+/*
+ * Calculate the length sum of direct io vectors that can
+ * be combined into one page vector.
+ */
+static size_t dio_get_pagev_size(const struct iov_iter *it)
+{
+ const struct iovec *iov = it->iov;
+ const struct iovec *iovend = iov + it->nr_segs;
+ size_t size;
+
+ size = iov->iov_len - it->iov_offset;
+ /*
+ * An iov can be page vectored when both the current tail
+ * and the next base are page aligned.
+ */
+ while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
+ (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
+ size += iov->iov_len;
+ }
+ dout("dio_get_pagevlen len = %zu\n", size);
+ return size;
+}
+
+/*
+ * Allocate a page vector based on (@it, @nbytes).
+ * The return value is the tuple describing a page vector,
+ * that is (@pages, @page_align, @num_pages).
+ */
+static struct page **
+dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
+ size_t *page_align, int *num_pages)
+{
+ struct iov_iter tmp_it = *it;
+ size_t align;
+ struct page **pages;
+ int ret = 0, idx, npages;
+
+ align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+ (PAGE_SIZE - 1);
+ npages = calc_pages_for(align, nbytes);
+ pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages) {
+ pages = vmalloc(sizeof(*pages) * npages);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (idx = 0; idx < npages; ) {
+ size_t start;
+ ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
+ npages - idx, &start);
+ if (ret < 0)
+ goto fail;
+
+ iov_iter_advance(&tmp_it, ret);
+ nbytes -= ret;
+ idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ }
+
+ BUG_ON(nbytes != 0);
+ *num_pages = npages;
+ *page_align = align;
+ dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
+ return pages;
+fail:
+ ceph_put_page_vector(pages, idx, false);
+ return ERR_PTR(ret);
+}
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
@@ -136,7 +204,6 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *cf = file->private_data;
- struct inode *parent_inode = NULL;
int err;
int flags, fmode, wanted;
@@ -210,10 +277,7 @@ int ceph_open(struct inode *inode, struct file *file)
ihold(inode);
req->r_num_caps = 1;
- if (flags & O_CREAT)
- parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
@@ -279,7 +343,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (err)
goto out_req;
- if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
if (d_unhashed(dentry)) {
@@ -462,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
size_t start;
ssize_t n;
- n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
- if (n < 0)
- return n;
-
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ n = dio_get_pagev_size(i);
+ pages = dio_get_pages_alloc(i, n, &start, &num_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
@@ -596,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
CEPH_OSD_FLAG_WRITE;
while (iov_iter_count(from) > 0) {
- u64 len = iov_iter_single_seg_count(from);
+ u64 len = dio_get_pagev_size(from);
size_t start;
ssize_t n;
@@ -615,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
- n = iov_iter_get_pages_alloc(from, &pages, len, &start);
- if (unlikely(n < 0)) {
- ret = n;
+ n = len;
+ pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+ if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
break;
}
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
@@ -956,6 +1019,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
+ if (iocb->ki_flags & IOCB_APPEND) {
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (err < 0)
+ goto out;
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0)
goto out;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 96d2bd8..da55eb8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;
@@ -1755,7 +1756,7 @@ retry:
*/
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.setxattr = ceph_setxattr,
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6706bde..a2cb0c2 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -228,12 +228,12 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
file, lock_cmd, wait, fl);
if (!err) {
- err = flock_lock_file_wait(file, fl);
+ err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
file, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on flock_lock_file_wait, undid lock", err);
+ dout("got %d on locks_lock_file_wait, undid lock", err);
}
}
return err;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6aa07af..e7b130a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
- struct ceph_inode_info *ci = ceph_inode(dir);
-
ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
- list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
- if (req->r_unsafe_dir) {
+ if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode && req->r_got_unsafe) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
}
+ /* The inode has cached pages, but it's no longer used.
+ * we can safely drop it */
+ if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ !(oissued & CEPH_CAP_FILE_CACHE)) {
+ used = 0;
+ oissued = 0;
+ }
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
@@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
} else {
- /* try to drop referring dentries */
+ /* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
@@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
INIT_LIST_HEAD(&req->r_wait);
@@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
len = sizeof(*head) +
pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
- sizeof(struct timespec);
+ sizeof(struct ceph_timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -2107,7 +2117,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
msg = create_request_message(mdsc, req, mds, drop_cap_releases);
if (IS_ERR(msg)) {
req->r_err = PTR_ERR(msg);
- complete_request(mdsc, req);
return PTR_ERR(msg);
}
req->r_request = msg;
@@ -2135,7 +2144,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *session = NULL;
int mds = -1;
- int err = -EAGAIN;
+ int err = 0;
if (req->r_err || req->r_got_result) {
if (req->r_aborted)
@@ -2149,6 +2158,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
err = -EIO;
goto finish;
}
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("do_request forced umount\n");
+ err = -EIO;
+ goto finish;
+ }
put_request_session(req);
@@ -2196,13 +2210,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
out_session:
ceph_put_mds_session(session);
+finish:
+ if (err) {
+ dout("__do_request early error %d\n", err);
+ req->r_err = err;
+ complete_request(mdsc, req);
+ __unregister_request(mdsc, req);
+ }
out:
return err;
-
-finish:
- req->r_err = err;
- complete_request(mdsc, req);
- goto out;
}
/*
@@ -2289,8 +2305,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
if (req->r_err) {
err = req->r_err;
- __unregister_request(mdsc, req);
- dout("do_request early error %d\n", err);
goto out;
}
@@ -2411,7 +2425,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
}
- if (req->r_got_safe && !head->safe) {
+ if (req->r_got_safe) {
pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
@@ -2473,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_unsafe_dir);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item,
+ &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
}
dout("handle_reply tid %lld result %d\n", tid, result);
@@ -2514,14 +2536,20 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
up_read(&mdsc->snap_rwsem);
if (realm)
ceph_put_snap_realm(mdsc, realm);
+
+ if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
if (err) {
req->r_err = err;
} else {
- req->r_reply = msg;
- ceph_msg_get(msg);
+ req->r_reply = ceph_msg_get(msg);
req->r_got_result = true;
}
} else {
@@ -3555,7 +3583,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush, want_snap;
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
@@ -3584,7 +3612,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true;
return atomic_read(&mdsc->num_sessions) == 0;
}
@@ -3643,6 +3671,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
dout("stopped\n");
}
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int mds;
+
+ dout("force umount\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ }
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+}
+
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
@@ -3886,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_sign_message(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_sign_message(auth, msg);
}
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3909,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 762757e..ccf11ef 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -236,6 +236,9 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
+ /* unsafe requests that modify the target inode */
+ struct list_head r_unsafe_target_item;
+
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
@@ -366,6 +369,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 233d906a..4aa7122 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0;
}
- if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
- ceph_get_snap_context(ceph_empty_snapc);
- snapc = ceph_empty_snapc;
- goto done;
- }
-
/* alloc new snap context */
err = -ENOMEM;
if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
-done:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
return 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c..ca4d5e8 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+ seq_show_option(m, "snapdirname", fsopt->snapdir_name);
return 0;
}
@@ -639,8 +639,8 @@ static int __init init_caches(void)
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ceph_inode_init_once);
if (ceph_inode_cachep == NULL)
return -ENOMEM;
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+ ceph_mdsc_force_umount(fsc->mdsc);
return;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2f2460d..75b7d12 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -342,6 +342,7 @@ struct ceph_inode_info {
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
diff --git a/fs/char_dev.c b/fs/char_dev.c
index ea06a3d..24b1425 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -274,7 +274,7 @@ out2:
}
/**
- * unregister_chrdev_region() - return a range of device numbers
+ * unregister_chrdev_region() - unregister a range of device numbers
* @from: the first in the range of numbers to unregister
* @count: the number of device numbers to unregister
*
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
new file mode 100644
index 0000000..0065256
--- /dev/null
+++ b/fs/cifs/cifs_ioctl.h
@@ -0,0 +1,42 @@
+/*
+ * fs/cifs/cifs_ioctl.h
+ *
+ * Structure definitions for io control for cifs/smb3
+ *
+ * Copyright (c) 2015 Steve French <steve.french@primarydata.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ */
+
+struct smb_mnt_fs_info {
+ __u32 version; /* 0001 */
+ __u16 protocol_id;
+ __u16 tcon_flags;
+ __u32 vol_serial_number;
+ __u32 vol_create_time;
+ __u32 share_caps;
+ __u32 share_flags;
+ __u32 sector_flags;
+ __u32 optimal_sector_size;
+ __u32 max_bytes_chunk;
+ __u32 fs_attributes;
+ __u32 max_path_component;
+ __u32 device_type;
+ __u32 device_characteristics;
+ __u32 maximal_access;
+ __u64 cifs_posix_caps;
+} __packed;
+
+#define CIFS_IOCTL_MAGIC 0xCF
+#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
+#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index f4cf200..6908080 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -42,7 +42,7 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
goto error;
/* attach the data */
- key->payload.data = payload;
+ key->payload.data[0] = payload;
ret = 0;
error:
@@ -52,7 +52,7 @@ error:
static void
cifs_spnego_key_destroy(struct key *key)
{
- kfree(key->payload.data);
+ kfree(key->payload.data[0]);
}
@@ -167,7 +167,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
- struct cifs_spnego_msg *msg = spnego_key->payload.data;
+ struct cifs_spnego_msg *msg = spnego_key->payload.data[0];
cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
msg->secblob_len + msg->sesskey_len));
}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1ea780b..3f93125 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -58,16 +58,15 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
* dereference payload.data!
*/
if (prep->datalen <= sizeof(key->payload)) {
- key->payload.value = 0;
- memcpy(&key->payload.value, prep->data, prep->datalen);
- key->datalen = prep->datalen;
- return 0;
+ key->payload.data[0] = NULL;
+ memcpy(&key->payload, prep->data, prep->datalen);
+ } else {
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+ if (!payload)
+ return -ENOMEM;
+ key->payload.data[0] = payload;
}
- payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
- if (!payload)
- return -ENOMEM;
- key->payload.data = payload;
key->datalen = prep->datalen;
return 0;
}
@@ -76,7 +75,7 @@ static inline void
cifs_idmap_key_destroy(struct key *key)
{
if (key->datalen > sizeof(key->payload))
- kfree(key->payload.data);
+ kfree(key->payload.data[0]);
}
static struct key_type cifs_idmap_key_type = {
@@ -233,8 +232,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
* it could be.
*/
ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
- (struct cifs_sid *)&sidkey->payload.value :
- (struct cifs_sid *)sidkey->payload.data;
+ (struct cifs_sid *)&sidkey->payload :
+ (struct cifs_sid *)sidkey->payload.data[0];
ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
if (ksid_size > sidkey->datalen) {
@@ -307,14 +306,14 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
- memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+ memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t));
uid = make_kuid(&init_user_ns, id);
if (uid_valid(uid))
fuid = uid;
} else {
kgid_t gid;
gid_t id;
- memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+ memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t));
gid = make_kgid(&init_user_ns, id);
if (gid_valid(gid))
fgid = gid;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index aa0dc25..afa09fc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -444,6 +444,48 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
return 0;
}
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find the server given timestamp
+ * as part of ntlmv2 authentication (or local current time as
+ * default in case of failure)
+ */
+static __le64
+find_timestamp(struct cifs_ses *ses)
+{
+ unsigned int attrsize;
+ unsigned int type;
+ unsigned int onesize = sizeof(struct ntlmssp2_name);
+ unsigned char *blobptr;
+ unsigned char *blobend;
+ struct ntlmssp2_name *attrptr;
+
+ if (!ses->auth_key.len || !ses->auth_key.response)
+ return 0;
+
+ blobptr = ses->auth_key.response;
+ blobend = blobptr + ses->auth_key.len;
+
+ while (blobptr + onesize < blobend) {
+ attrptr = (struct ntlmssp2_name *) blobptr;
+ type = le16_to_cpu(attrptr->type);
+ if (type == NTLMSSP_AV_EOL)
+ break;
+ blobptr += 2; /* advance attr type */
+ attrsize = le16_to_cpu(attrptr->length);
+ blobptr += 2; /* advance attr size */
+ if (blobptr + attrsize > blobend)
+ break;
+ if (type == NTLMSSP_AV_TIMESTAMP) {
+ if (attrsize == sizeof(u64))
+ return *((__le64 *)blobptr);
+ }
+ blobptr += attrsize; /* advance attr value */
+ }
+
+ return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+}
+
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
const struct nls_table *nls_cp)
{
@@ -641,6 +683,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
unsigned char *tiblob = NULL; /* target info blob */
+ __le64 rsp_timestamp;
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
if (!ses->domainName) {
@@ -659,6 +702,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
}
+ /* Must be within 5 minutes of the server (or in range +/-2h
+ * in case of Mac OS X), so simply carry over server timestamp
+ * (as Windows 7 does)
+ */
+ rsp_timestamp = find_timestamp(ses);
+
baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
tilen = ses->auth_key.len;
tiblob = ses->auth_key.response;
@@ -675,8 +724,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
ntlmv2->blob_signature = cpu_to_le32(0x00000101);
ntlmv2->reserved = 0;
- /* Must be within 5 minutes of the server */
- ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ ntlmv2->time = rsp_timestamp;
+
get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
ntlmv2->reserved2 = 0;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b..c4c1169 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -325,8 +325,11 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
static void
cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
{
- if (ses->sectype == Unspecified)
+ if (ses->sectype == Unspecified) {
+ if (ses->user_name == NULL)
+ seq_puts(s, ",sec=none");
return;
+ }
seq_puts(s, ",sec=");
@@ -394,17 +397,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
- seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+ seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
cifs_show_security(s, tcon->ses);
cifs_show_cache_flavor(s, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
seq_puts(s, ",multiuser");
else if (tcon->ses->user_name)
- seq_printf(s, ",username=%s", tcon->ses->user_name);
+ seq_show_option(s, "username", tcon->ses->user_name);
if (tcon->ses->domainName)
- seq_printf(s, ",domain=%s", tcon->ses->domainName);
+ seq_show_option(s, "domain", tcon->ses->domainName);
if (srcaddr->sa_family != AF_UNSPEC) {
struct sockaddr_in *saddr4;
@@ -451,6 +454,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nocase");
if (tcon->retry)
seq_puts(s, ",hard");
+ if (tcon->use_persistent)
+ seq_puts(s, ",persistenthandles");
+ else if (tcon->use_resilient)
+ seq_puts(s, ",resilienthandles");
if (tcon->unix_ext)
seq_puts(s, ",unix");
else
@@ -893,8 +900,7 @@ const struct inode_operations cifs_file_inode_ops = {
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = cifs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = cifs_get_link,
.permission = cifs_permission,
/* BB add the following two eventually */
/* revalidate: cifs_revalidate,
@@ -907,6 +913,59 @@ const struct inode_operations cifs_symlink_inode_ops = {
#endif
};
+static int cifs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ struct inode *src_inode = file_inode(src_file);
+ struct inode *target_inode = file_inode(dst_file);
+ struct cifsFileInfo *smb_file_src = src_file->private_data;
+ struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink);
+ unsigned int xid;
+ int rc;
+
+ cifs_dbg(FYI, "clone range\n");
+
+ xid = get_xid();
+
+ if (!src_file->private_data || !dst_file->private_data) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+ goto out;
+ }
+
+ /*
+ * Note: cifs case is easier than btrfs since server responsible for
+ * checks for proper open modes and file type and if it wants
+ * server could even support copy of range where source = target
+ */
+ lock_two_nondirectories(target_inode, src_inode);
+
+ if (len == 0)
+ len = src_inode->i_size - off;
+
+ cifs_dbg(FYI, "about to flush pages\n");
+ /* should we flush first and last page first */
+ truncate_inode_pages_range(&target_inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len)-1);
+
+ if (target_tcon->ses->server->ops->duplicate_extents)
+ rc = target_tcon->ses->server->ops->duplicate_extents(xid,
+ smb_file_src, smb_file_target, off, len, destoff);
+ else
+ rc = -EOPNOTSUPP;
+
+ /* force revalidate of size and timestamps of target file now
+ that target is updated on the server */
+ CIFS_I(target_inode)->time = 0;
+ /* although unlocking in the reverse order from locking is not
+ strictly necessary here it is a little cleaner to be consistent */
+ unlock_two_nondirectories(src_inode, target_inode);
+out:
+ free_xid(xid);
+ return rc;
+}
+
const struct file_operations cifs_file_ops = {
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
@@ -918,9 +977,8 @@ const struct file_operations cifs_file_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -936,9 +994,9 @@ const struct file_operations cifs_file_strict_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -954,9 +1012,8 @@ const struct file_operations cifs_file_direct_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -972,9 +1029,8 @@ const struct file_operations cifs_file_nobrl_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -989,9 +1045,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1006,9 +1061,8 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1019,6 +1073,7 @@ const struct file_operations cifs_dir_ops = {
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
+ .clone_file_range = cifs_clone_file_range,
.llseek = generic_file_llseek,
};
@@ -1037,7 +1092,7 @@ cifs_init_inodecache(void)
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a782b22..68c4547 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,9 +120,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
#endif
/* Functions related to symlinks */
-extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
-extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
- int buflen);
+extern const char *cifs_get_link(struct dentry *, struct inode *,
+ struct delayed_call *);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
const char *symname);
extern int cifs_removexattr(struct dentry *, const char *);
@@ -131,10 +130,9 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *,
extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-
#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.06"
+#define CIFS_VERSION "2.08"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b406a32..2b510c5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -493,7 +493,10 @@ struct smb_vol {
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
bool rwpidforward:1; /* pid forward for read/write operations */
- bool nosharesock;
+ bool nosharesock:1;
+ bool persistent:1;
+ bool nopersistent:1;
+ bool resilient:1; /* noresilient not required since not fored for CA */
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -895,6 +898,8 @@ struct cifs_tcon {
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
bool need_reconnect:1; /* connection reset, tid now invalid */
+ bool use_resilient:1; /* use resilient instead of durable handles */
+ bool use_persistent:1; /* use persistent instead of durable handles */
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
@@ -1015,6 +1020,7 @@ struct cifs_fid {
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+ __u8 create_guid[16];
#endif
struct cifs_pending_open *pending_open;
unsigned int epoch;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 47b030d..f5b8730 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2245,6 +2245,20 @@ typedef struct {
#define FILE_DEVICE_VIRTUAL_DISK 0x00000024
#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028
+/* Device Characteristics */
+#define FILE_REMOVABLE_MEDIA 0x00000001
+#define FILE_READ_ONLY_DEVICE 0x00000002
+#define FILE_FLOPPY_DISKETTE 0x00000004
+#define FILE_WRITE_ONCE_MEDIA 0x00000008
+#define FILE_REMOTE_DEVICE 0x00000010
+#define FILE_DEVICE_IS_MOUNTED 0x00000020
+#define FILE_VIRTUAL_VOLUME 0x00000040
+#define FILE_DEVICE_SECURE_OPEN 0x00000100
+#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000
+#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000
+#define FILE_PORTABLE_DEVICE 0x00004000
+#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000
+
typedef struct {
__le32 DeviceType;
__le32 DeviceCharacteristics;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 672ef35..90b4f9f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -696,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, 1, CIFS_ECHO_OP);
}
@@ -1572,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &rdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, 1, 0);
}
@@ -2032,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
@@ -2068,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &wdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, 1, 0);
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 773f4dc..ecb0803 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,6 +87,8 @@ enum {
Opt_sign, Opt_seal, Opt_noac,
Opt_fsc, Opt_mfsymlinks,
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+ Opt_persistent, Opt_nopersistent,
+ Opt_resilient, Opt_noresilient,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -169,6 +171,10 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_multiuser, "multiuser" },
{ Opt_sloppy, "sloppy" },
{ Opt_nosharesock, "nosharesock" },
+ { Opt_persistent, "persistenthandles"},
+ { Opt_nopersistent, "nopersistenthandles"},
+ { Opt_resilient, "resilienthandles"},
+ { Opt_noresilient, "noresilienthandles"},
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -1497,6 +1503,33 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_nosharesock:
vol->nosharesock = true;
break;
+ case Opt_nopersistent:
+ vol->nopersistent = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_persistent:
+ vol->persistent = true;
+ if ((vol->nopersistent) || (vol->resilient)) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_resilient:
+ vol->resilient = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_noresilient:
+ vol->resilient = false; /* already the default */
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -2325,13 +2358,14 @@ static int
cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
{
int rc = 0;
- char *desc, *delim, *payload;
+ const char *delim, *payload;
+ char *desc;
ssize_t len;
struct key *key;
struct TCP_Server_Info *server = ses->server;
struct sockaddr_in *sa;
struct sockaddr_in6 *sa6;
- struct user_key_payload *upayload;
+ const struct user_key_payload *upayload;
desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
if (!desc)
@@ -2374,14 +2408,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
}
down_read(&key->sem);
- upayload = key->payload.data;
+ upayload = user_key_payload(key);
if (IS_ERR_OR_NULL(upayload)) {
rc = upayload ? PTR_ERR(upayload) : -EINVAL;
goto out_key_put;
}
/* find first : in payload */
- payload = (char *)upayload->data;
+ payload = upayload->data;
delim = strnchr(payload, upayload->datalen, ':');
cifs_dbg(FYI, "payload=%s\n", payload);
if (!delim) {
@@ -2654,6 +2688,42 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
}
tcon->seal = volume_info->seal;
+ tcon->use_persistent = false;
+ /* check if SMB2 or later, CIFS does not support persistent handles */
+ if (volume_info->persistent) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB3 or later required for persistent handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#ifdef CONFIG_CIFS_SMB2
+ } else if (ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ tcon->use_persistent = true;
+ else /* persistent handles requested but not supported */ {
+ cifs_dbg(VFS,
+ "Persistent handles not supported on share\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
+ }
+#ifdef CONFIG_CIFS_SMB2
+ } else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ && (volume_info->nopersistent == false)) {
+ cifs_dbg(FYI, "enabling persistent handles\n");
+ tcon->use_persistent = true;
+#endif /* CONFIG_CIFS_SMB2 */
+ } else if (volume_info->resilient) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB2.1 or later required for resilient handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ }
+ tcon->use_resilient = true;
+ }
+
/*
* We can have only one retry value for a connection to a share so for
* resources mounted more than once to the same server share the last
@@ -3502,6 +3572,15 @@ try_mount_again:
goto mount_fail_check;
}
+#ifdef CONFIG_CIFS_SMB2
+ if ((volume_info->persistent == true) && ((ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
+ cifs_dbg(VFS, "persistent handles not supported by server\n");
+ rc = -EOPNOTSUPP;
+ goto mount_fail_check;
+ }
+#endif /* CONFIG_CIFS_SMB2*/
+
/* search for existing tcon to this server share */
tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee..0a2752b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1553,7 +1553,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
out:
if (flock->fl_flags & FL_POSIX && !rc)
- rc = posix_lock_file_wait(file, flock);
+ rc = locks_lock_file_wait(file, flock);
return rc;
}
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_LOCKED;
}
-static struct vm_operations_struct cifs_file_vm_ops = {
+static const struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = cifs_page_mkwrite,
@@ -3380,6 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
struct page *page, *tpage;
unsigned int expected_index;
int rc;
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
INIT_LIST_HEAD(tmplist);
@@ -3390,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
- page->index, GFP_KERNEL);
+ page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3417,10 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
- if (add_to_page_cache_locked(page, mapping, page->index,
- GFP_KERNEL)) {
- __clear_page_locked(page);
+ __SetPageLocked(page);
+ if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f621b44..a329f5b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode)
* @word: long word containing the bit lock
*/
static int
-cifs_wait_bit_killable(struct wait_bit_key *key)
+cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
@@ -2034,7 +2034,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
struct tcon_link *tlink = NULL;
struct cifs_tcon *tcon = NULL;
struct TCP_Server_Info *server;
- struct cifs_io_parms io_parms;
/*
* To avoid spurious oplock breaks from server, in the case of
@@ -2056,18 +2055,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
rc = -ENOSYS;
cifsFileInfo_put(open_file);
cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- unsigned int bytes_written;
-
- io_parms.netfid = open_file->fid.netfid;
- io_parms.pid = open_file->pid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = attrs->ia_size;
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
- NULL, NULL, 1);
- cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
- }
} else
rc = -EINVAL;
@@ -2093,28 +2080,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
else
rc = -ENOSYS;
cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- __u16 netfid;
- int oplock = 0;
- rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
- GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
- &oplock, NULL, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc == 0) {
- unsigned int bytes_written;
-
- io_parms.netfid = netfid;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = attrs->ia_size;
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
- NULL, 1);
- cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
- CIFSSMBClose(xid, tcon, netfid);
- }
- }
if (tlink)
cifs_put_tlink(tlink);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 49b8b6e..7a3b84e 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -31,68 +31,39 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#include "cifs_ioctl.h"
#include <linux/btrfs.h>
-#define CIFS_IOCTL_MAGIC 0xCF
-#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
-#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
-
-static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
- unsigned long srcfd, u64 off, u64 len, u64 destoff,
- bool dup_extents)
+static int cifs_file_clone_range(unsigned int xid, struct file *src_file,
+ struct file *dst_file)
{
- int rc;
- struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
- struct cifs_tcon *target_tcon;
- struct fd src_file;
struct cifsFileInfo *smb_file_src;
- struct inode *src_inode;
+ struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
+ struct cifs_tcon *target_tcon;
+ int rc;
cifs_dbg(FYI, "ioctl clone range\n");
- /* the destination must be opened for writing */
- if (!(dst_file->f_mode & FMODE_WRITE)) {
- cifs_dbg(FYI, "file target not open for write\n");
- return -EINVAL;
- }
- /* check if target volume is readonly and take reference */
- rc = mnt_want_write_file(dst_file);
- if (rc) {
- cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
- return rc;
- }
-
- src_file = fdget(srcfd);
- if (!src_file.file) {
- rc = -EBADF;
- goto out_drop_write;
- }
-
- if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+ if (!src_file->private_data || !dst_file->private_data) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
- goto out_fput;
+ goto out;
}
rc = -EXDEV;
smb_file_target = dst_file->private_data;
- smb_file_src = src_file.file->private_data;
+ smb_file_src = src_file->private_data;
src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
- /* check if source and target are on same tree connection */
- if (src_tcon != target_tcon) {
- cifs_dbg(VFS, "file copy src and target on different volume\n");
- goto out_fput;
+ if (src_tcon->ses != target_tcon->ses) {
+ cifs_dbg(VFS, "source and target of copy not on same server\n");
+ goto out;
}
- src_inode = file_inode(src_file.file);
- rc = -EINVAL;
- if (S_ISDIR(src_inode->i_mode))
- goto out_fput;
-
/*
* Note: cifs case is easier than btrfs since server responsible for
* checks for proper open modes and file type and if it wants
@@ -100,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
*/
lock_two_nondirectories(target_inode, src_inode);
- /* determine range to clone */
- rc = -EINVAL;
- if (off + len > src_inode->i_size || off + len < off)
- goto out_unlock;
- if (len == 0)
- len = src_inode->i_size - off;
-
cifs_dbg(FYI, "about to flush pages\n");
/* should we flush first and last page first */
- truncate_inode_pages_range(&target_inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len)-1);
+ truncate_inode_pages(&target_inode->i_data, 0);
- if (dup_extents && target_tcon->ses->server->ops->duplicate_extents)
- rc = target_tcon->ses->server->ops->duplicate_extents(xid,
- smb_file_src, smb_file_target, off, len, destoff);
- else if (!dup_extents && target_tcon->ses->server->ops->clone_range)
+ if (target_tcon->ses->server->ops->clone_range)
rc = target_tcon->ses->server->ops->clone_range(xid,
- smb_file_src, smb_file_target, off, len, destoff);
+ smb_file_src, smb_file_target, 0, src_inode->i_size, 0);
else
rc = -EOPNOTSUPP;
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
CIFS_I(target_inode)->time = 0;
-out_unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
unlock_two_nondirectories(src_inode, target_inode);
+out:
+ return rc;
+}
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+ unsigned long srcfd)
+{
+ int rc;
+ struct fd src_file;
+ struct inode *src_inode;
+
+ cifs_dbg(FYI, "ioctl clone range\n");
+ /* the destination must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE)) {
+ cifs_dbg(FYI, "file target not open for write\n");
+ return -EINVAL;
+ }
+
+ /* check if target volume is readonly and take reference */
+ rc = mnt_want_write_file(dst_file);
+ if (rc) {
+ cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+ return rc;
+ }
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ rc = -EBADF;
+ goto out_drop_write;
+ }
+
+ if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+ goto out_fput;
+ }
+
+ src_inode = file_inode(src_file.file);
+ rc = -EINVAL;
+ if (S_ISDIR(src_inode->i_mode))
+ goto out_fput;
+
+ rc = cifs_file_clone_range(xid, src_file.file, dst_file);
+
out_fput:
fdput(src_file);
out_drop_write:
@@ -135,6 +138,43 @@ out_drop_write:
return rc;
}
+static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
+ void __user *arg)
+{
+ int rc = 0;
+ struct smb_mnt_fs_info *fsinf;
+
+ fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL);
+ if (fsinf == NULL)
+ return -ENOMEM;
+
+ fsinf->version = 1;
+ fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+ fsinf->device_characteristics =
+ le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
+ fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+ fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes);
+ fsinf->max_path_component =
+ le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength);
+#ifdef CONFIG_CIFS_SMB2
+ fsinf->vol_serial_number = tcon->vol_serial_number;
+ fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time);
+ fsinf->share_flags = tcon->share_flags;
+ fsinf->share_caps = le32_to_cpu(tcon->capabilities);
+ fsinf->sector_flags = tcon->ss_flags;
+ fsinf->optimal_sector_size = tcon->perf_sector_size;
+ fsinf->max_bytes_chunk = tcon->max_bytes_chunk;
+ fsinf->maximal_access = tcon->maximal_access;
+#endif /* SMB2 */
+ fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
+ if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info)))
+ rc = -EFAULT;
+
+ kfree(fsinf);
+ return rc;
+}
+
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
struct inode *inode = file_inode(filep);
@@ -148,8 +188,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
- cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg);
-
cifs_sb = CIFS_SB(inode->i_sb);
switch (command) {
@@ -213,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
}
break;
case CIFS_IOC_COPYCHUNK_FILE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false);
- break;
- case BTRFS_IOC_CLONE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true);
+ rc = cifs_ioctl_clone(xid, filep, arg);
break;
case CIFS_IOC_SET_INTEGRITY:
if (pSMBFile == NULL)
@@ -228,6 +263,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
else
rc = -EOPNOTSUPP;
break;
+ case CIFS_IOC_GET_MNT_INFO:
+ tcon = tlink_tcon(pSMBFile->tlink);
+ rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e3548f7..062c237 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -627,9 +627,9 @@ cifs_hl_exit:
}
const char *
-cifs_follow_link(struct dentry *direntry, void **cookie)
+cifs_get_link(struct dentry *direntry, struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(direntry);
int rc = -ENOMEM;
unsigned int xid;
char *full_path = NULL;
@@ -639,6 +639,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
+ if (!direntry)
+ return ERR_PTR(-ECHILD);
+
xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
@@ -678,7 +681,8 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
kfree(target_path);
return ERR_PTR(rc);
}
- return *cookie = target_path;
+ set_delayed_call(done, kfree_link, target_path);
+ return target_path;
}
int
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b1eede3..0557c45 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -84,7 +84,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
dentry = d_hash_and_lookup(parent, name);
- if (unlikely(IS_ERR(dentry)))
+ if (IS_ERR(dentry))
return;
if (dentry) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index bce6fdc..59727e3 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -988,7 +988,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
goto out;
}
- msg = spnego_key->payload.data;
+ msg = spnego_key->payload.data[0];
/*
* check version field to make sure that cifs.upcall is
* sending us a response in an expected form
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2ab297d..f9e766f 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,6 +43,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock[17];
struct cifs_fid *fid = oparms->fid;
+ struct network_resiliency_req nr_ioctl_req;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL) {
@@ -67,6 +68,24 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
if (rc)
goto out;
+
+ if (oparms->tcon->use_resilient) {
+ nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+ nr_ioctl_req.Reserved = 0;
+ rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
+ fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true,
+ (char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
+ NULL, NULL /* no return info */);
+ if (rc == -EOPNOTSUPP) {
+ cifs_dbg(VFS,
+ "resiliency not supported by server, disabling\n");
+ oparms->tcon->use_resilient = false;
+ } else if (rc)
+ cifs_dbg(FYI, "error %d setting resiliency\n", rc);
+
+ rc = 0;
+ }
+
if (buf) {
/* open response does not have IndexNumber field - get it */
rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index df91bcf..53ccdde 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -50,9 +50,13 @@ change_conf(struct TCP_Server_Info *server)
break;
default:
server->echoes = true;
- server->oplocks = true;
+ if (enable_oplocks) {
+ server->oplocks = true;
+ server->oplock_credits = 1;
+ } else
+ server->oplocks = false;
+
server->echo_credits = 1;
- server->oplock_credits = 1;
}
server->credits -= server->echo_credits + server->oplock_credits;
return 0;
@@ -806,7 +810,6 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, &eof, false);
}
-#ifdef CONFIG_CIFS_SMB311
static int
smb2_duplicate_extents(const unsigned int xid,
struct cifsFileInfo *srcfile,
@@ -850,8 +853,6 @@ smb2_duplicate_extents(const unsigned int xid,
duplicate_extents_out:
return rc;
}
-#endif /* CONFIG_CIFS_SMB311 */
-
static int
smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1699,6 +1700,7 @@ struct smb_version_operations smb30_operations = {
.create_lease_buf = smb3_create_lease_buf,
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .duplicate_extents = smb2_duplicate_extents,
.validate_negotiate = smb3_validate_negotiate,
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
@@ -1836,7 +1838,7 @@ struct smb_version_values smb21_values = {
struct smb_version_values smb30_values = {
.version_string = SMB30_VERSION_STRING,
.protocol_id = SMB30_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1856,7 +1858,7 @@ struct smb_version_values smb30_values = {
struct smb_version_values smb302_values = {
.version_string = SMB302_VERSION_STRING,
.protocol_id = SMB302_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1877,7 +1879,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b8b4f08..7675555 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -46,6 +46,7 @@
#include "smb2status.h"
#include "smb2glob.h"
#include "cifspdu.h"
+#include "cifs_spnego.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -486,19 +487,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
cifs_dbg(FYI, "missing security blob on negprot\n");
rc = cifs_enable_signing(server, ses->sign);
-#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
if (rc)
goto neg_exit;
- if (blob_length)
+ if (blob_length) {
rc = decode_negTokenInit(security_blob, blob_length, server);
- if (rc == 1)
- rc = 0;
- else if (rc == 0) {
- rc = -EIO;
- goto neg_exit;
+ if (rc == 1)
+ rc = 0;
+ else if (rc == 0)
+ rc = -EIO;
}
-#endif
-
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -592,7 +589,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
struct TCP_Server_Info *server = ses->server;
u16 blob_length = 0;
- char *security_blob;
+ struct key *spnego_key = NULL;
+ char *security_blob = NULL;
char *ntlmssp_blob = NULL;
bool use_spnego = false; /* else use raw ntlmssp */
@@ -620,7 +618,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
ses->ntlmssp->sesskey_per_smbsess = true;
/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
- ses->sectype = RawNTLMSSP;
+ if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP)
+ ses->sectype = RawNTLMSSP;
ssetup_ntlmssp_authenticate:
if (phase == NtLmChallenge)
@@ -649,7 +648,48 @@ ssetup_ntlmssp_authenticate:
iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field and 1 for pad */
iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
- if (phase == NtLmNegotiate) {
+
+ if (ses->sectype == Kerberos) {
+#ifdef CONFIG_CIFS_UPCALL
+ struct cifs_spnego_msg *msg;
+
+ spnego_key = cifs_get_spnego_key(ses);
+ if (IS_ERR(spnego_key)) {
+ rc = PTR_ERR(spnego_key);
+ spnego_key = NULL;
+ goto ssetup_exit;
+ }
+
+ msg = spnego_key->payload.data[0];
+ /*
+ * check version field to make sure that cifs.upcall is
+ * sending us a response in an expected form
+ */
+ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+ cifs_dbg(VFS,
+ "bad cifs.upcall version. Expected %d got %d",
+ CIFS_SPNEGO_UPCALL_VERSION, msg->version);
+ rc = -EKEYREJECTED;
+ goto ssetup_exit;
+ }
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS,
+ "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto ssetup_exit;
+ }
+ ses->auth_key.len = msg->sesskey_len;
+ blob_length = msg->secblob_len;
+ iov[1].iov_base = msg->data + msg->sesskey_len;
+ iov[1].iov_len = blob_length;
+#else
+ rc = -EOPNOTSUPP;
+ goto ssetup_exit;
+#endif /* CONFIG_CIFS_UPCALL */
+ } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */
ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
GFP_KERNEL);
if (ntlmssp_blob == NULL) {
@@ -672,6 +712,8 @@ ssetup_ntlmssp_authenticate:
/* with raw NTLMSSP we don't encapsulate in SPNEGO */
security_blob = ntlmssp_blob;
}
+ iov[1].iov_base = security_blob;
+ iov[1].iov_len = blob_length;
} else if (phase == NtLmAuthenticate) {
req->hdr.SessionId = ses->Suid;
ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
@@ -699,6 +741,8 @@ ssetup_ntlmssp_authenticate:
} else {
security_blob = ntlmssp_blob;
}
+ iov[1].iov_base = security_blob;
+ iov[1].iov_len = blob_length;
} else {
cifs_dbg(VFS, "illegal ntlmssp phase\n");
rc = -EIO;
@@ -710,8 +754,6 @@ ssetup_ntlmssp_authenticate:
cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
1 /* pad */ - 4 /* rfc1001 len */);
req->SecurityBufferLength = cpu_to_le16(blob_length);
- iov[1].iov_base = security_blob;
- iov[1].iov_len = blob_length;
inc_rfc1001_len(req, blob_length - 1 /* pad */);
@@ -722,6 +764,7 @@ ssetup_ntlmssp_authenticate:
kfree(security_blob);
rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
+ ses->Suid = rsp->hdr.SessionId;
if (resp_buftype != CIFS_NO_BUFFER &&
rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
if (phase != NtLmNegotiate) {
@@ -739,7 +782,6 @@ ssetup_ntlmssp_authenticate:
/* NTLMSSP Negotiate sent now processing challenge (response) */
phase = NtLmChallenge; /* process ntlmssp challenge */
rc = 0; /* MORE_PROCESSING is not an error here but expected */
- ses->Suid = rsp->hdr.SessionId;
rc = decode_ntlmssp_challenge(rsp->Buffer,
le16_to_cpu(rsp->SecurityBufferLength), ses);
}
@@ -796,6 +838,10 @@ keygen_exit:
kfree(ses->auth_key.response);
ses->auth_key.response = NULL;
}
+ if (spnego_key) {
+ key_invalidate(spnego_key);
+ key_put(spnego_key);
+ }
kfree(ses->ntlmssp);
return rc;
@@ -876,6 +922,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
if (tcon && tcon->bad_network_name)
return -ENOENT;
+ if ((tcon && tcon->seal) &&
+ ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) {
+ cifs_dbg(VFS, "encryption requested but no server support");
+ return -EOPNOTSUPP;
+ }
+
unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
if (unc_path == NULL)
return -ENOMEM;
@@ -955,6 +1007,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
init_copy_chunk_defaults(tcon);
+ if (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)
+ cifs_dbg(VFS, "Encrypted shares not supported");
if (tcon->ses->server->ops->validate_negotiate)
rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
tcon_exit:
@@ -1097,13 +1151,130 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
return 0;
}
+static struct create_durable_v2 *
+create_durable_v2_buf(struct cifs_fid *pfid)
+{
+ struct create_durable_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, dcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct durable_context_v2));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ get_random_bytes(buf->dcontext.CreateGuid, 16);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable_handle_reconnect_v2 *
+create_reconnect_durable_v2_buf(struct cifs_fid *fid)
+{
+ struct create_durable_handle_reconnect_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_handle_reconnect_v2),
+ GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ dcontext));
+ buf->ccontext.DataLength =
+ cpu_to_le32(sizeof(struct durable_reconnect_context_v2));
+ buf->ccontext.NameOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Fid.PersistentFileId = fid->persistent_fid;
+ buf->dcontext.Fid.VolatileFileId = fid->volatile_fid;
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ memcpy(buf->dcontext.CreateGuid, fid->create_guid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 is "DH2C" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
static int
-add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms)
{
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+
+ iov[num].iov_base = create_reconnect_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ inc_rfc1001_len(&req->hdr,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms, bool use_persistent)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (use_persistent) {
+ if (oparms->reconnect)
+ return add_durable_reconnect_v2_context(iov, num_iovec,
+ oparms);
+ else
+ return add_durable_v2_context(iov, num_iovec, oparms);
+ }
+
if (oparms->reconnect) {
iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
/* indicate that we don't need to relock the file */
@@ -1221,7 +1392,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
ccontext->Next =
cpu_to_le32(server->vals->create_lease_size);
}
- rc = add_durable_context(iov, &num_iovecs, oparms);
+
+ rc = add_durable_context(iov, &num_iovecs, oparms,
+ tcon->use_persistent);
if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
@@ -1626,7 +1799,9 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED)
credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, CIFS_ECHO_OP);
}
@@ -1810,7 +1985,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
queue_work(cifsiod_wq, &rdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, 0);
}
@@ -1938,6 +2115,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
unsigned int credits_received = 1;
@@ -1977,7 +2155,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
queue_work(cifsiod_wq, &wdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, credits_received, 0);
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4511082..4af5278 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -590,6 +590,44 @@ struct create_durable {
} Data;
} __packed;
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
+struct durable_context_v2 {
+ __le32 Timeout;
+ __le32 Flags;
+ __u64 Reserved;
+ __u8 CreateGuid[16];
+} __packed;
+
+struct create_durable_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_context_v2 dcontext;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.12 */
+struct durable_reconnect_context_v2 {
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ __u8 CreateGuid[16];
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.12 */
+struct durable_reconnect_context_v2_rsp {
+ __le32 Timeout;
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+struct create_durable_handle_reconnect_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_reconnect_context_v2 dcontext;
+} __packed;
+
#define COPY_CHUNK_RES_KEY_SIZE 24
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
@@ -643,6 +681,13 @@ struct fsctl_get_integrity_information_rsp {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+/* See MS-SMB2 2.2.31.3 */
+struct network_resiliency_req {
+ __le32 Timeout;
+ __le32 Reserved;
+} __packed;
+/* There is no buffer for the response ie no struct network_resiliency_rsp */
+
struct validate_negotiate_info_req {
__le32 Capabilities;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a639d0d..f996dae 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
/* Retrieve an opaque file reference for server-side data movement ie copy */
#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
-#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 126f46b..2a24c52 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
}
spin_unlock(&GlobalMid_Lock);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
return rc;
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index ff9e1f8..f5dc2f0 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#endif /* CONFIG_CIFS_ACL */
} else {
int temp;
- temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
+ temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
if (temp == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
@@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#else
cifs_dbg(FYI, "set POSIX ACL not supported\n");
#endif
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
@@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, ea_name, ea_value, buf_size,
cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
@@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
#else
cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 7740b1c..1bfb7ba 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,6 +8,7 @@
#include <linux/coda.h>
#include <linux/coda_psdev.h>
+#include <linux/pagemap.h>
#include "coda_linux.h"
static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
@@ -17,8 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
static const struct inode_operations coda_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = coda_setattr,
};
@@ -35,6 +35,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
inode->i_fop = &coda_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &coda_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &coda_symlink_aops;
inode->i_mapping = &inode->i_data;
} else
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390..57e81cb 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
- sizeof(struct coda_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct coda_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (coda_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index ab94ef6..03736e2 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page)
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
- char *p = kmap(page);
+ char *p = page_address(page);
cii = ITOC(inode);
@@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page)
if (error)
goto fail;
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return error;
}
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9b1ffaa..f6c6c8a 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
char *result;
insize = max_t(unsigned int,
- INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+ INSIZE(readlink), OUTSIZE(readlink)+ *length);
UPARG(CODA_READLINK);
inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
if (!error) {
retlen = outp->coda_readlink.count;
- if ( retlen > *length )
- retlen = *length;
+ if (retlen >= *length)
+ retlen = *length - 1;
*length = retlen;
result = (char *)outp + (long)outp->coda_readlink.data;
memcpy(buffer, result, retlen);
diff --git a/fs/compat.c b/fs/compat.c
index 6fd272d..a71936a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -792,7 +792,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
const void __user *, data)
{
char *kernel_type;
- unsigned long data_page;
+ void *options;
char *kernel_dev;
int retval;
@@ -806,26 +806,25 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
if (IS_ERR(kernel_dev))
goto out1;
- retval = copy_mount_options(data, &data_page);
- if (retval < 0)
+ options = copy_mount_options(data);
+ retval = PTR_ERR(options);
+ if (IS_ERR(options))
goto out2;
- retval = -EINVAL;
-
- if (kernel_type && data_page) {
+ if (kernel_type && options) {
if (!strcmp(kernel_type, NCPFS_NAME)) {
- do_ncp_super_data_conv((void *)data_page);
+ do_ncp_super_data_conv(options);
} else if (!strcmp(kernel_type, NFS4_NAME)) {
- if (do_nfs4_super_data_conv((void *) data_page))
+ retval = -EINVAL;
+ if (do_nfs4_super_data_conv(options))
goto out3;
}
}
- retval = do_mount(kernel_dev, dir_name, kernel_type,
- flags, (void*)data_page);
+ retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
out3:
- free_page(data_page);
+ kfree(options);
out2:
kfree(kernel_dev);
out1:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 48851f6..a5b8eb6 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,6 +58,8 @@
#include <linux/atalk.h>
#include <linux/gfp.h>
+#include "internal.h"
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_sock.h>
#include <net/bluetooth/rfcomm.h>
@@ -115,19 +117,38 @@
#include <asm/fbio.h>
#endif
-static int w_long(unsigned int fd, unsigned int cmd,
- compat_ulong_t __user *argp)
+#define convert_in_user(srcptr, dstptr) \
+({ \
+ typeof(*srcptr) val; \
+ \
+ get_user(val, srcptr) || put_user(val, dstptr); \
+})
+
+static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- mm_segment_t old_fs = get_fs();
int err;
- unsigned long val;
- set_fs (KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long)&val);
- set_fs (old_fs);
- if (!err && put_user(val, argp))
+ err = security_file_ioctl(file, cmd, arg);
+ if (err)
+ return err;
+
+ return vfs_ioctl(file, cmd, arg);
+}
+
+static int w_long(struct file *file,
+ unsigned int cmd, compat_ulong_t __user *argp)
+{
+ int err;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
+
+ if (valp == NULL)
return -EFAULT;
- return err;
+ err = do_ioctl(file, cmd, (unsigned long)valp);
+ if (err)
+ return err;
+ if (convert_in_user(valp, argp))
+ return -EFAULT;
+ return 0;
}
struct compat_video_event {
@@ -139,23 +160,23 @@ struct compat_video_event {
} u;
};
-static int do_video_get_event(unsigned int fd, unsigned int cmd,
- struct compat_video_event __user *up)
+static int do_video_get_event(struct file *file,
+ unsigned int cmd, struct compat_video_event __user *up)
{
- struct video_event kevent;
- mm_segment_t old_fs = get_fs();
+ struct video_event __user *kevent =
+ compat_alloc_user_space(sizeof(*kevent));
int err;
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
- set_fs(old_fs);
+ if (kevent == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, cmd, (unsigned long)kevent);
if (!err) {
- err = put_user(kevent.type, &up->type);
- err |= put_user(kevent.timestamp, &up->timestamp);
- err |= put_user(kevent.u.size.w, &up->u.size.w);
- err |= put_user(kevent.u.size.h, &up->u.size.h);
- err |= put_user(kevent.u.size.aspect_ratio,
+ err = convert_in_user(&kevent->type, &up->type);
+ err |= convert_in_user(&kevent->timestamp, &up->timestamp);
+ err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
+ err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
+ err |= convert_in_user(&kevent->u.size.aspect_ratio,
&up->u.size.aspect_ratio);
if (err)
err = -EFAULT;
@@ -169,8 +190,8 @@ struct compat_video_still_picture {
int32_t size;
};
-static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
- struct compat_video_still_picture __user *up)
+static int do_video_stillpicture(struct file *file,
+ unsigned int cmd, struct compat_video_still_picture __user *up)
{
struct video_still_picture __user *up_native;
compat_uptr_t fp;
@@ -190,7 +211,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
@@ -200,8 +221,8 @@ struct compat_video_spu_palette {
compat_uptr_t palette;
};
-static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
- struct compat_video_spu_palette __user *up)
+static int do_video_set_spu_palette(struct file *file,
+ unsigned int cmd, struct compat_video_spu_palette __user *up)
{
struct video_spu_palette __user *up_native;
compat_uptr_t palp;
@@ -218,7 +239,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
@@ -276,7 +297,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
return 0;
}
-static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+static int sg_ioctl_trans(struct file *file, unsigned int cmd,
sg_io_hdr32_t __user *sgio32)
{
sg_io_hdr_t __user *sgio;
@@ -289,7 +310,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
if (get_user(interface_id, &sgio32->interface_id))
return -EFAULT;
if (interface_id != 'S')
- return sys_ioctl(fd, cmd, (unsigned long)sgio32);
+ return do_ioctl(file, cmd, (unsigned long)sgio32);
if (get_user(iovec_count, &sgio32->iovec_count))
return -EFAULT;
@@ -349,7 +370,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
if (put_user(compat_ptr(data), &sgio->usr_ptr))
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) sgio);
+ err = do_ioctl(file, cmd, (unsigned long) sgio);
if (err >= 0) {
void __user *datap;
@@ -380,13 +401,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
int unused;
};
-static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
- compat_sg_req_info __user *o)
+static int sg_grt_trans(struct file *file,
+ unsigned int cmd, struct compat_sg_req_info __user *o)
{
int err, i;
sg_req_info_t __user *r;
r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
- err = sys_ioctl(fd,cmd,(unsigned long)r);
+ err = do_ioctl(file, cmd, (unsigned long)r);
if (err < 0)
return err;
for (i = 0; i < SG_MAX_QUEUE; i++) {
@@ -412,8 +433,8 @@ struct sock_fprog32 {
#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
-static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
- struct sock_fprog32 __user *u_fprog32)
+static int ppp_sock_fprog_ioctl_trans(struct file *file,
+ unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
{
struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
void __user *fptr64;
@@ -435,7 +456,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
else
cmd = PPPIOCSACTIVE;
- return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
+ return do_ioctl(file, cmd, (unsigned long) u_fprog64);
}
struct ppp_option_data32 {
@@ -451,7 +472,7 @@ struct ppp_idle32 {
};
#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
-static int ppp_gidle(unsigned int fd, unsigned int cmd,
+static int ppp_gidle(struct file *file, unsigned int cmd,
struct ppp_idle32 __user *idle32)
{
struct ppp_idle __user *idle;
@@ -460,7 +481,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
idle = compat_alloc_user_space(sizeof(*idle));
- err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
+ err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
if (!err) {
if (get_user(xmit, &idle->xmit_idle) ||
@@ -472,7 +493,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
return err;
}
-static int ppp_scompress(unsigned int fd, unsigned int cmd,
+static int ppp_scompress(struct file *file, unsigned int cmd,
struct ppp_option_data32 __user *odata32)
{
struct ppp_option_data __user *odata;
@@ -492,7 +513,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd,
sizeof(__u32) + sizeof(int)))
return -EFAULT;
- return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
+ return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
}
#ifdef CONFIG_BLOCK
@@ -512,12 +533,13 @@ struct mtpos32 {
};
#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
-static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
+static int mt_ioctl_trans(struct file *file,
+ unsigned int cmd, void __user *argp)
{
- mm_segment_t old_fs = get_fs();
- struct mtget get;
+ /* NULL initialization to make gcc shut up */
+ struct mtget __user *get = NULL;
struct mtget32 __user *umget32;
- struct mtpos pos;
+ struct mtpos __user *pos = NULL;
struct mtpos32 __user *upos32;
unsigned long kcmd;
void *karg;
@@ -526,32 +548,34 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
switch(cmd) {
case MTIOCPOS32:
kcmd = MTIOCPOS;
- karg = &pos;
+ pos = compat_alloc_user_space(sizeof(*pos));
+ karg = pos;
break;
default: /* MTIOCGET32 */
kcmd = MTIOCGET;
- karg = &get;
+ get = compat_alloc_user_space(sizeof(*get));
+ karg = get;
break;
}
- set_fs (KERNEL_DS);
- err = sys_ioctl (fd, kcmd, (unsigned long)karg);
- set_fs (old_fs);
+ if (karg == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, kcmd, (unsigned long)karg);
if (err)
return err;
switch (cmd) {
case MTIOCPOS32:
upos32 = argp;
- err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
+ err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
break;
case MTIOCGET32:
umget32 = argp;
- err = __put_user(get.mt_type, &umget32->mt_type);
- err |= __put_user(get.mt_resid, &umget32->mt_resid);
- err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
- err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
- err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
- err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
- err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
+ err = convert_in_user(&get->mt_type, &umget32->mt_type);
+ err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
+ err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
+ err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
+ err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
+ err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
+ err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
break;
}
return err ? -EFAULT: 0;
@@ -605,42 +629,41 @@ struct serial_struct32 {
compat_int_t reserved[1];
};
-static int serial_struct_ioctl(unsigned fd, unsigned cmd,
- struct serial_struct32 __user *ss32)
+static int serial_struct_ioctl(struct file *file,
+ unsigned cmd, struct serial_struct32 __user *ss32)
{
typedef struct serial_struct32 SS32;
int err;
- struct serial_struct ss;
- mm_segment_t oldseg = get_fs();
+ struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
__u32 udata;
unsigned int base;
+ unsigned char *iomem_base;
+ if (ss == NULL)
+ return -EFAULT;
if (cmd == TIOCSSERIAL) {
- if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
- return -EFAULT;
- if (__get_user(udata, &ss32->iomem_base))
+ if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
+ get_user(udata, &ss32->iomem_base))
return -EFAULT;
- ss.iomem_base = compat_ptr(udata);
- if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __get_user(ss.port_high, &ss32->port_high))
+ iomem_base = compat_ptr(udata);
+ if (put_user(iomem_base, &ss->iomem_base) ||
+ convert_in_user(&ss32->iomem_reg_shift,
+ &ss->iomem_reg_shift) ||
+ convert_in_user(&ss32->port_high, &ss->port_high) ||
+ put_user(0UL, &ss->iomap_base))
return -EFAULT;
- ss.iomap_base = 0UL;
}
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd,cmd,(unsigned long)(&ss));
- set_fs(oldseg);
+ err = do_ioctl(file, cmd, (unsigned long)ss);
if (cmd == TIOCGSERIAL && err >= 0) {
- if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
+ if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
+ get_user(iomem_base, &ss->iomem_base))
return -EFAULT;
- base = (unsigned long)ss.iomem_base >> 32 ?
- 0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
- if (__put_user(base, &ss32->iomem_base) ||
- __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __put_user(ss.port_high, &ss32->port_high))
+ base = (unsigned long)iomem_base >> 32 ?
+ 0xffffffff : (unsigned)(unsigned long)iomem_base;
+ if (put_user(base, &ss32->iomem_base) ||
+ convert_in_user(&ss->iomem_reg_shift,
+ &ss32->iomem_reg_shift) ||
+ convert_in_user(&ss->port_high, &ss32->port_high))
return -EFAULT;
}
return err;
@@ -674,8 +697,8 @@ struct i2c_rdwr_aligned {
struct i2c_msg msgs[0];
};
-static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_rdwr_ioctl_data32 __user *udata)
+static int do_i2c_rdwr_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
{
struct i2c_rdwr_aligned __user *tdata;
struct i2c_msg __user *tmsgs;
@@ -686,7 +709,7 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
- if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS)
+ if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
return -EINVAL;
if (get_user(datap, &udata->msgs))
@@ -708,11 +731,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
put_user(compat_ptr(datap), &tmsgs[i].buf))
return -EFAULT;
}
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
-static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_smbus_ioctl_data32 __user *udata)
+static int do_i2c_smbus_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
{
struct i2c_smbus_ioctl_data __user *tdata;
compat_caddr_t datap;
@@ -734,7 +757,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
__put_user(compat_ptr(datap), &tdata->data))
return -EFAULT;
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
@@ -742,29 +765,27 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
-static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
+static int rtc_ioctl(struct file *file,
+ unsigned cmd, void __user *argp)
{
- mm_segment_t oldfs = get_fs();
- compat_ulong_t val32;
- unsigned long kval;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
int ret;
+ if (valp == NULL)
+ return -EFAULT;
switch (cmd) {
case RTC_IRQP_READ32:
case RTC_EPOCH_READ32:
- set_fs(KERNEL_DS);
- ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+ ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
RTC_IRQP_READ : RTC_EPOCH_READ,
- (unsigned long)&kval);
- set_fs(oldfs);
+ (unsigned long)valp);
if (ret)
return ret;
- val32 = kval;
- return put_user(val32, (unsigned int __user *)argp);
+ return convert_in_user(valp, (unsigned int __user *)argp);
case RTC_IRQP_SET32:
- return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
case RTC_EPOCH_SET32:
- return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
}
return -ENOIOCTLCMD;
@@ -1284,12 +1305,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* NBD */
-COMPATIBLE_IOCTL(NBD_DO_IT)
-COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
-COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
-COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
-COMPATIBLE_IOCTL(NBD_DISCONNECT)
/* i2c */
COMPATIBLE_IOCTL(I2C_SLAVE)
COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
@@ -1436,53 +1451,53 @@ IGNORE_IOCTL(FBIOGCURSOR32)
* a compat_ioctl operation in the place that handleѕ the
* ioctl for the native case.
*/
-static long do_ioctl_trans(int fd, unsigned int cmd,
+static long do_ioctl_trans(unsigned int cmd,
unsigned long arg, struct file *file)
{
void __user *argp = compat_ptr(arg);
switch (cmd) {
case PPPIOCGIDLE32:
- return ppp_gidle(fd, cmd, argp);
+ return ppp_gidle(file, cmd, argp);
case PPPIOCSCOMPRESS32:
- return ppp_scompress(fd, cmd, argp);
+ return ppp_scompress(file, cmd, argp);
case PPPIOCSPASS32:
case PPPIOCSACTIVE32:
- return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+ return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
#ifdef CONFIG_BLOCK
case SG_IO:
- return sg_ioctl_trans(fd, cmd, argp);
+ return sg_ioctl_trans(file, cmd, argp);
case SG_GET_REQUEST_TABLE:
- return sg_grt_trans(fd, cmd, argp);
+ return sg_grt_trans(file, cmd, argp);
case MTIOCGET32:
case MTIOCPOS32:
- return mt_ioctl_trans(fd, cmd, argp);
+ return mt_ioctl_trans(file, cmd, argp);
#endif
/* Serial */
case TIOCGSERIAL:
case TIOCSSERIAL:
- return serial_struct_ioctl(fd, cmd, argp);
+ return serial_struct_ioctl(file, cmd, argp);
/* i2c */
case I2C_FUNCS:
- return w_long(fd, cmd, argp);
+ return w_long(file, cmd, argp);
case I2C_RDWR:
- return do_i2c_rdwr_ioctl(fd, cmd, argp);
+ return do_i2c_rdwr_ioctl(file, cmd, argp);
case I2C_SMBUS:
- return do_i2c_smbus_ioctl(fd, cmd, argp);
+ return do_i2c_smbus_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
- return rtc_ioctl(fd, cmd, argp);
+ return rtc_ioctl(file, cmd, argp);
/* dvb */
case VIDEO_GET_EVENT:
- return do_video_get_event(fd, cmd, argp);
+ return do_video_get_event(file, cmd, argp);
case VIDEO_STILLPICTURE:
- return do_video_stillpicture(fd, cmd, argp);
+ return do_video_stillpicture(file, cmd, argp);
case VIDEO_SET_SPU_PALETTE:
- return do_video_set_spu_palette(fd, cmd, argp);
+ return do_video_set_spu_palette(file, cmd, argp);
}
/*
@@ -1508,12 +1523,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
case KDSKBMETA:
case KDSKBLED:
case KDSETLED:
- /* NBD */
- case NBD_SET_SOCK:
- case NBD_SET_BLKSIZE:
- case NBD_SET_SIZE:
- case NBD_SET_SIZE_BLOCKS:
- return do_vfs_ioctl(file, fd, cmd, arg);
+ return vfs_ioctl(file, cmd, arg);
}
return -ENOIOCTLCMD;
@@ -1580,6 +1590,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
goto out_fput;
#endif
+ case FICLONE:
+ case FICLONERANGE:
+ case FIDEDUPERANGE:
+ goto do_ioctl;
+
case FIBMAP:
case FIGETBSZ:
case FIONREAD:
@@ -1602,7 +1617,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (compat_ioctl_check_table(XFORM(cmd)))
goto found_handler;
- error = do_ioctl_trans(fd, cmd, arg, f.file);
+ error = do_ioctl_trans(cmd, arg, f.file);
if (error == -ENOIOCTLCMD)
error = -ENOTTY;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index b65d1ef..ccc31fa 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -53,13 +53,14 @@ struct configfs_dirent {
#define CONFIGFS_ROOT 0x0001
#define CONFIGFS_DIR 0x0002
#define CONFIGFS_ITEM_ATTR 0x0004
+#define CONFIGFS_ITEM_BIN_ATTR 0x0008
#define CONFIGFS_ITEM_LINK 0x0020
#define CONFIGFS_USET_DIR 0x0040
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
#define CONFIGFS_USET_IN_MKDIR 0x0200
#define CONFIGFS_USET_CREATING 0x0400
-#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
+#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
@@ -72,6 +73,8 @@ extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *,
extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_create_bin_file(struct config_item *,
+ const struct configfs_bin_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
struct dentry *, void *, umode_t, int);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
@@ -88,7 +91,7 @@ extern void configfs_release_fs(void);
extern struct rw_semaphore configfs_rename_sem;
extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
-extern const struct file_operations bin_fops;
+extern const struct file_operations configfs_bin_file_operations;
extern const struct inode_operations configfs_dir_inode_operations;
extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
@@ -119,6 +122,13 @@ static inline struct configfs_attribute * to_attr(struct dentry * dentry)
return ((struct configfs_attribute *) sd->s_element);
}
+static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry)
+{
+ struct configfs_attribute *attr = to_attr(dentry);
+
+ return container_of(attr, struct configfs_bin_attribute, cb_attr);
+}
+
static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
{
struct config_item * item = NULL;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index c81ce7f..cab612b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -255,6 +255,12 @@ static void configfs_init_file(struct inode * inode)
inode->i_fop = &configfs_file_operations;
}
+static void configfs_init_bin_file(struct inode *inode)
+{
+ inode->i_size = 0;
+ inode->i_fop = &configfs_bin_file_operations;
+}
+
static void init_symlink(struct inode * inode)
{
inode->i_op = &configfs_symlink_inode_operations;
@@ -423,7 +429,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
spin_unlock(&configfs_dirent_lock);
error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
- configfs_init_file);
+ (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
+ configfs_init_bin_file :
+ configfs_init_file);
if (error) {
configfs_put(sd);
return error;
@@ -583,6 +591,7 @@ static int populate_attrs(struct config_item *item)
{
struct config_item_type *t = item->ci_type;
struct configfs_attribute *attr;
+ struct configfs_bin_attribute *bin_attr;
int error = 0;
int i;
@@ -594,6 +603,13 @@ static int populate_attrs(struct config_item *item)
break;
}
}
+ if (t->ct_bin_attrs) {
+ for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
+ error = configfs_create_bin_file(item, bin_attr);
+ if (error)
+ break;
+ }
+ }
if (error)
detach_attrs(item);
@@ -1054,11 +1070,55 @@ out:
return ret;
}
+static int configfs_do_depend_item(struct dentry *subsys_dentry,
+ struct config_item *target)
+{
+ struct configfs_dirent *p;
+ int ret;
+
+ spin_lock(&configfs_dirent_lock);
+ /* Scan the tree, return 0 if found */
+ ret = configfs_depend_prep(subsys_dentry, target);
+ if (ret)
+ goto out_unlock_dirent_lock;
+
+ /*
+ * We are sure that the item is not about to be removed by rmdir(), and
+ * not in the middle of attachment by mkdir().
+ */
+ p = target->ci_dentry->d_fsdata;
+ p->s_dependent_count += 1;
+
+out_unlock_dirent_lock:
+ spin_unlock(&configfs_dirent_lock);
+
+ return ret;
+}
+
+static inline struct configfs_dirent *
+configfs_find_subsys_dentry(struct configfs_dirent *root_sd,
+ struct config_item *subsys_item)
+{
+ struct configfs_dirent *p;
+ struct configfs_dirent *ret = NULL;
+
+ list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+ if (p->s_type & CONFIGFS_DIR &&
+ p->s_element == subsys_item) {
+ ret = p;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+
int configfs_depend_item(struct configfs_subsystem *subsys,
struct config_item *target)
{
int ret;
- struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+ struct configfs_dirent *subsys_sd;
struct config_item *s_item = &subsys->su_group.cg_item;
struct dentry *root;
@@ -1077,39 +1137,15 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
*/
mutex_lock(&d_inode(root)->i_mutex);
- root_sd = root->d_fsdata;
-
- list_for_each_entry(p, &root_sd->s_children, s_sibling) {
- if (p->s_type & CONFIGFS_DIR) {
- if (p->s_element == s_item) {
- subsys_sd = p;
- break;
- }
- }
- }
-
+ subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item);
if (!subsys_sd) {
ret = -ENOENT;
goto out_unlock_fs;
}
/* Ok, now we can trust subsys/s_item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
- spin_lock(&configfs_dirent_lock);
- /* Scan the tree, return 0 if found */
- ret = configfs_depend_prep(subsys_sd->s_dentry, target);
- if (ret)
- goto out_unlock_dirent_lock;
-
- /*
- * We are sure that the item is not about to be removed by rmdir(), and
- * not in the middle of attachment by mkdir().
- */
- p = target->ci_dentry->d_fsdata;
- p->s_dependent_count += 1;
-
-out_unlock_dirent_lock:
- spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
mutex_unlock(&d_inode(root)->i_mutex);
@@ -1128,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item);
* configfs_depend_item() because we know that that the client driver is
* pinned, thus the subsystem is pinned, and therefore configfs is pinned.
*/
-void configfs_undepend_item(struct configfs_subsystem *subsys,
- struct config_item *target)
+void configfs_undepend_item(struct config_item *target)
{
struct configfs_dirent *sd;
@@ -1152,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
}
EXPORT_SYMBOL(configfs_undepend_item);
+/*
+ * caller_subsys is a caller's subsystem not target's. This is used to
+ * determine if we should lock root and check subsys or not. When we are
+ * in the same subsystem as our target there is no need to do locking as
+ * we know that subsys is valid and is not unregistered during this function
+ * as we are called from callback of one of his children and VFS holds a lock
+ * on some inode. Otherwise we have to lock our root to ensure that target's
+ * subsystem it is not unregistered during this function.
+ */
+int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys,
+ struct config_item *target)
+{
+ struct configfs_subsystem *target_subsys;
+ struct config_group *root, *parent;
+ struct configfs_dirent *subsys_sd;
+ int ret = -ENOENT;
+
+ /* Disallow this function for configfs root */
+ if (configfs_is_root(target))
+ return -EINVAL;
+
+ parent = target->ci_group;
+ /*
+ * This may happen when someone is trying to depend root
+ * directory of some subsystem
+ */
+ if (configfs_is_root(&parent->cg_item)) {
+ target_subsys = to_configfs_subsystem(to_config_group(target));
+ root = parent;
+ } else {
+ target_subsys = parent->cg_subsys;
+ /* Find a cofnigfs root as we may need it for locking */
+ for (root = parent; !configfs_is_root(&root->cg_item);
+ root = root->cg_item.ci_group)
+ ;
+ }
+
+ if (target_subsys != caller_subsys) {
+ /*
+ * We are in other configfs subsystem, so we have to do
+ * additional locking to prevent other subsystem from being
+ * unregistered
+ */
+ mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex);
+
+ /*
+ * As we are trying to depend item from other subsystem
+ * we have to check if this subsystem is still registered
+ */
+ subsys_sd = configfs_find_subsys_dentry(
+ root->cg_item.ci_dentry->d_fsdata,
+ &target_subsys->su_group.cg_item);
+ if (!subsys_sd)
+ goto out_root_unlock;
+ } else {
+ subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata;
+ }
+
+ /* Now we can execute core of depend item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
+
+ if (target_subsys != caller_subsys)
+out_root_unlock:
+ /*
+ * We were called from subsystem other than our target so we
+ * took some locks so now it's time to release them
+ */
+ mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item_unlocked);
+
static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret = 0;
@@ -1636,6 +1744,116 @@ const struct file_operations configfs_dir_operations = {
.iterate = configfs_readdir,
};
+/**
+ * configfs_register_group - creates a parent-child relation between two groups
+ * @parent_group: parent group
+ * @group: child group
+ *
+ * link groups, creates dentry for the child and attaches it to the
+ * parent dentry.
+ *
+ * Return: 0 on success, negative errno code on error
+ */
+int configfs_register_group(struct config_group *parent_group,
+ struct config_group *group)
+{
+ struct configfs_subsystem *subsys = parent_group->cg_subsys;
+ struct dentry *parent;
+ int ret;
+
+ mutex_lock(&subsys->su_mutex);
+ link_group(parent_group, group);
+ mutex_unlock(&subsys->su_mutex);
+
+ parent = parent_group->cg_item.ci_dentry;
+
+ mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ ret = create_default_group(parent_group, group);
+ if (!ret) {
+ spin_lock(&configfs_dirent_lock);
+ configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
+ spin_unlock(&configfs_dirent_lock);
+ }
+ mutex_unlock(&d_inode(parent)->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(configfs_register_group);
+
+/**
+ * configfs_unregister_group() - unregisters a child group from its parent
+ * @group: parent group to be unregistered
+ *
+ * Undoes configfs_register_group()
+ */
+void configfs_unregister_group(struct config_group *group)
+{
+ struct configfs_subsystem *subsys = group->cg_subsys;
+ struct dentry *dentry = group->cg_item.ci_dentry;
+ struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+
+ mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+ spin_lock(&configfs_dirent_lock);
+ configfs_detach_prep(dentry, NULL);
+ spin_unlock(&configfs_dirent_lock);
+
+ configfs_detach_group(&group->cg_item);
+ d_inode(dentry)->i_flags |= S_DEAD;
+ dont_mount(dentry);
+ d_delete(dentry);
+ mutex_unlock(&d_inode(parent)->i_mutex);
+
+ dput(dentry);
+
+ mutex_lock(&subsys->su_mutex);
+ unlink_group(group);
+ mutex_unlock(&subsys->su_mutex);
+}
+EXPORT_SYMBOL(configfs_unregister_group);
+
+/**
+ * configfs_register_default_group() - allocates and registers a child group
+ * @parent_group: parent group
+ * @name: child group name
+ * @item_type: child item type description
+ *
+ * boilerplate to allocate and register a child group with its parent. We need
+ * kzalloc'ed memory because child's default_group is initially empty.
+ *
+ * Return: allocated config group or ERR_PTR() on error
+ */
+struct config_group *
+configfs_register_default_group(struct config_group *parent_group,
+ const char *name,
+ struct config_item_type *item_type)
+{
+ int ret;
+ struct config_group *group;
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+ config_group_init_type_name(group, name, item_type);
+
+ ret = configfs_register_group(parent_group, group);
+ if (ret) {
+ kfree(group);
+ return ERR_PTR(ret);
+ }
+ return group;
+}
+EXPORT_SYMBOL(configfs_register_default_group);
+
+/**
+ * configfs_unregister_default_group() - unregisters and frees a child group
+ * @group: the group to act on
+ */
+void configfs_unregister_default_group(struct config_group *group)
+{
+ configfs_unregister_group(group);
+ kfree(group);
+}
+EXPORT_SYMBOL(configfs_unregister_default_group);
+
int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 403269f..3687187 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -28,6 +28,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <linux/configfs.h>
@@ -48,6 +49,10 @@ struct configfs_buffer {
struct configfs_item_operations * ops;
struct mutex mutex;
int needs_read_fill;
+ bool read_in_progress;
+ bool write_in_progress;
+ char *bin_buffer;
+ int bin_buffer_size;
};
@@ -65,7 +70,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
{
struct configfs_attribute * attr = to_attr(dentry);
struct config_item * item = to_item(dentry->d_parent);
- struct configfs_item_operations * ops = buffer->ops;
int ret = 0;
ssize_t count;
@@ -74,7 +78,8 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
if (!buffer->page)
return -ENOMEM;
- count = ops->show_attribute(item,attr,buffer->page);
+ count = attr->show(item, buffer->page);
+
buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
if (count >= 0)
@@ -123,6 +128,87 @@ out:
return retval;
}
+/**
+ * configfs_read_bin_file - read a binary attribute.
+ * @file: file pointer.
+ * @buf: buffer to fill.
+ * @count: number of bytes to read.
+ * @ppos: starting offset in file.
+ *
+ * Userspace wants to read a binary attribute file. The attribute
+ * descriptor is in the file's ->d_fsdata. The target item is in the
+ * directory's ->d_fsdata.
+ *
+ * We check whether we need to refill the buffer. If so we will
+ * call the attributes' attr->read() twice. The first time we
+ * will pass a NULL as a buffer pointer, which the attributes' method
+ * will use to return the size of the buffer required. If no error
+ * occurs we will allocate the buffer using vmalloc and call
+ * attr->read() again passing that buffer as an argument.
+ * Then we just copy to user-space using simple_read_from_buffer.
+ */
+
+static ssize_t
+configfs_read_bin_file(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct configfs_buffer *buffer = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct config_item *item = to_item(dentry->d_parent);
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ ssize_t retval = 0;
+ ssize_t len = min_t(size_t, count, PAGE_SIZE);
+
+ mutex_lock(&buffer->mutex);
+
+ /* we don't support switching read/write modes */
+ if (buffer->write_in_progress) {
+ retval = -ETXTBSY;
+ goto out;
+ }
+ buffer->read_in_progress = 1;
+
+ if (buffer->needs_read_fill) {
+ /* perform first read with buf == NULL to get extent */
+ len = bin_attr->read(item, NULL, 0);
+ if (len <= 0) {
+ retval = len;
+ goto out;
+ }
+
+ /* do not exceed the maximum value */
+ if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+ retval = -EFBIG;
+ goto out;
+ }
+
+ buffer->bin_buffer = vmalloc(len);
+ if (buffer->bin_buffer == NULL) {
+ retval = -ENOMEM;
+ goto out;
+ }
+ buffer->bin_buffer_size = len;
+
+ /* perform second read to fill buffer */
+ len = bin_attr->read(item, buffer->bin_buffer, len);
+ if (len < 0) {
+ retval = len;
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer_size = 0;
+ buffer->bin_buffer = NULL;
+ goto out;
+ }
+
+ buffer->needs_read_fill = 0;
+ }
+
+ retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+out:
+ mutex_unlock(&buffer->mutex);
+ return retval;
+}
+
/**
* fill_write_buffer - copy buffer from userspace.
@@ -171,9 +257,8 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
{
struct configfs_attribute * attr = to_attr(dentry);
struct config_item * item = to_item(dentry->d_parent);
- struct configfs_item_operations * ops = buffer->ops;
- return ops->store_attribute(item,attr,buffer->page,count);
+ return attr->store(item, buffer->page, count);
}
@@ -210,10 +295,80 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
return len;
}
-static int check_perm(struct inode * inode, struct file * file)
+/**
+ * configfs_write_bin_file - write a binary attribute.
+ * @file: file pointer
+ * @buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Writing to a binary attribute file is similar to a normal read.
+ * We buffer the consecutive writes (binary attribute files do not
+ * support lseek) in a continuously growing buffer, but we don't
+ * commit until the close of the file.
+ */
+
+static ssize_t
+configfs_write_bin_file(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct configfs_buffer *buffer = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ void *tbuf = NULL;
+ ssize_t len;
+
+ mutex_lock(&buffer->mutex);
+
+ /* we don't support switching read/write modes */
+ if (buffer->read_in_progress) {
+ len = -ETXTBSY;
+ goto out;
+ }
+ buffer->write_in_progress = 1;
+
+ /* buffer grows? */
+ if (*ppos + count > buffer->bin_buffer_size) {
+
+ if (bin_attr->cb_max_size &&
+ *ppos + count > bin_attr->cb_max_size) {
+ len = -EFBIG;
+ }
+
+ tbuf = vmalloc(*ppos + count);
+ if (tbuf == NULL) {
+ len = -ENOMEM;
+ goto out;
+ }
+
+ /* copy old contents */
+ if (buffer->bin_buffer) {
+ memcpy(tbuf, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+ vfree(buffer->bin_buffer);
+ }
+
+ /* clear the new area */
+ memset(tbuf + buffer->bin_buffer_size, 0,
+ *ppos + count - buffer->bin_buffer_size);
+ buffer->bin_buffer = tbuf;
+ buffer->bin_buffer_size = *ppos + count;
+ }
+
+ len = simple_write_to_buffer(buffer->bin_buffer,
+ buffer->bin_buffer_size, ppos, buf, count);
+ if (len > 0)
+ *ppos += len;
+out:
+ mutex_unlock(&buffer->mutex);
+ return len;
+}
+
+static int check_perm(struct inode * inode, struct file * file, int type)
{
struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
struct configfs_attribute * attr = to_attr(file->f_path.dentry);
+ struct configfs_bin_attribute *bin_attr = NULL;
struct configfs_buffer * buffer;
struct configfs_item_operations * ops = NULL;
int error = 0;
@@ -221,6 +376,9 @@ static int check_perm(struct inode * inode, struct file * file)
if (!item || !attr)
goto Einval;
+ if (type & CONFIGFS_ITEM_BIN_ATTR)
+ bin_attr = to_bin_attr(file->f_path.dentry);
+
/* Grab the module reference for this attribute if we have one */
if (!try_module_get(attr->ca_owner)) {
error = -ENODEV;
@@ -237,10 +395,14 @@ static int check_perm(struct inode * inode, struct file * file)
* and we must have a store method.
*/
if (file->f_mode & FMODE_WRITE) {
+ if (!(inode->i_mode & S_IWUGO))
+ goto Eaccess;
- if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+ if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
goto Eaccess;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
+ goto Eaccess;
}
/* File needs read support.
@@ -248,7 +410,13 @@ static int check_perm(struct inode * inode, struct file * file)
* must be a show method for it.
*/
if (file->f_mode & FMODE_READ) {
- if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+ if (!(inode->i_mode & S_IRUGO))
+ goto Eaccess;
+
+ if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
+ goto Eaccess;
+
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
goto Eaccess;
}
@@ -262,6 +430,8 @@ static int check_perm(struct inode * inode, struct file * file)
}
mutex_init(&buffer->mutex);
buffer->needs_read_fill = 1;
+ buffer->read_in_progress = 0;
+ buffer->write_in_progress = 0;
buffer->ops = ops;
file->private_data = buffer;
goto Done;
@@ -279,12 +449,7 @@ static int check_perm(struct inode * inode, struct file * file)
return error;
}
-static int configfs_open_file(struct inode * inode, struct file * filp)
-{
- return check_perm(inode,filp);
-}
-
-static int configfs_release(struct inode * inode, struct file * filp)
+static int configfs_release(struct inode *inode, struct file *filp)
{
struct config_item * item = to_item(filp->f_path.dentry->d_parent);
struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
@@ -305,6 +470,47 @@ static int configfs_release(struct inode * inode, struct file * filp)
return 0;
}
+static int configfs_open_file(struct inode *inode, struct file *filp)
+{
+ return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+}
+
+static int configfs_open_bin_file(struct inode *inode, struct file *filp)
+{
+ return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+}
+
+static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+{
+ struct configfs_buffer *buffer = filp->private_data;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct config_item *item = to_item(dentry->d_parent);
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ ssize_t len = 0;
+ int ret;
+
+ buffer->read_in_progress = 0;
+
+ if (buffer->write_in_progress) {
+ buffer->write_in_progress = 0;
+
+ len = bin_attr->write(item, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+
+ /* vfree on NULL is safe */
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer = NULL;
+ buffer->bin_buffer_size = 0;
+ buffer->needs_read_fill = 1;
+ }
+
+ ret = configfs_release(inode, filp);
+ if (len < 0)
+ return len;
+ return ret;
+}
+
+
const struct file_operations configfs_file_operations = {
.read = configfs_read_file,
.write = configfs_write_file,
@@ -313,6 +519,14 @@ const struct file_operations configfs_file_operations = {
.release = configfs_release,
};
+const struct file_operations configfs_bin_file_operations = {
+ .read = configfs_read_bin_file,
+ .write = configfs_write_bin_file,
+ .llseek = NULL, /* bin file is not seekable */
+ .open = configfs_open_bin_file,
+ .release = configfs_release_bin_file,
+};
+
/**
* configfs_create_file - create an attribute file for an item.
* @item: item we're creating for.
@@ -334,3 +548,24 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
return error;
}
+/**
+ * configfs_create_bin_file - create a binary attribute file for an item.
+ * @item: item we're creating for.
+ * @attr: atrribute descriptor.
+ */
+
+int configfs_create_bin_file(struct config_item *item,
+ const struct configfs_bin_attribute *bin_attr)
+{
+ struct dentry *dir = item->ci_dentry;
+ struct configfs_dirent *parent_sd = dir->d_fsdata;
+ umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG;
+ int error = 0;
+
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+ error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
+ CONFIGFS_ITEM_BIN_ATTR);
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ return error;
+}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eae8757..0cc810e 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -218,7 +218,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
return sd->s_dentry->d_name.name;
- if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+ if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) {
attr = sd->s_element;
return attr->ca_name;
}
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index ec5c832..db6d692 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,27 +279,33 @@ static int configfs_getlink(struct dentry *dentry, char * path)
}
-static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *configfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unsigned long page = get_zeroed_page(GFP_KERNEL);
+ char *body;
int error;
- if (!page)
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = configfs_getlink(dentry, (char *)page);
+ error = configfs_getlink(dentry, body);
if (!error) {
- return *cookie = (void *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
- free_page(page);
+ kfree(body);
return ERR_PTR(error);
}
const struct inode_operations configfs_symlink_inode_operations = {
- .follow_link = configfs_follow_link,
+ .get_link = configfs_get_link,
.readlink = generic_readlink,
- .put_link = free_page_put_link,
.setattr = configfs_setattr,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index c5ecde6..b3c153c 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/timekeeping.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -232,9 +233,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* UNIX time of coredump */
case 't': {
- struct timeval tv;
- do_gettimeofday(&tv);
- err = cn_printf(cn, "%lu", tv.tv_sec);
+ time64_t time;
+
+ time = ktime_get_real_seconds();
+ err = cn_printf(cn, "%lld", time);
break;
}
/* hostname */
@@ -280,23 +282,24 @@ out:
return ispipe;
}
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
{
struct task_struct *t;
int nr = 0;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
- t = start;
- do {
+ for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
}
- } while_each_thread(start, t);
+ }
return nr;
}
@@ -311,10 +314,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- nr = zap_process(tsk, exit_code);
tsk->signal->group_exit_task = tsk;
- /* ignore all signals except SIGKILL, see prepare_signal() */
- tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -360,18 +361,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
continue;
if (g->flags & PF_KTHREAD)
continue;
- p = g;
- do {
- if (p->mm) {
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code);
- p->signal->flags = SIGNAL_GROUP_EXIT;
- unlock_task_sighand(p, &flags);
- }
- break;
+
+ for_each_thread(g, p) {
+ if (unlikely(!p->mm))
+ continue;
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code,
+ SIGNAL_GROUP_EXIT);
+ unlock_task_sighand(p, &flags);
}
- } while_each_thread(g, p);
+ break;
+ }
}
rcu_read_unlock();
done:
@@ -513,10 +514,10 @@ void do_coredump(const siginfo_t *siginfo)
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
- int flag = 0;
int ispipe;
struct files_struct *displaced;
- bool need_nonrelative = false;
+ /* require nonrelative corefile path and be extra careful */
+ bool need_suid_safe = false;
bool core_dumped = false;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
@@ -550,9 +551,8 @@ void do_coredump(const siginfo_t *siginfo)
*/
if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
/* Setuid core dump mode */
- flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_nonrelative = true;
+ need_suid_safe = true;
}
retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +633,7 @@ void do_coredump(const siginfo_t *siginfo)
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
- if (need_nonrelative && cn.corename[0] != '/') {
+ if (need_suid_safe && cn.corename[0] != '/') {
printk(KERN_WARNING "Pid %d(%s) can only dump core "\
"to fully qualified path!\n",
task_tgid_vnr(current), current->comm);
@@ -641,8 +641,35 @@ void do_coredump(const siginfo_t *siginfo)
goto fail_unlock;
}
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!need_suid_safe) {
+ mm_segment_t old_fs;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ /*
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
+ */
+ (void) sys_unlink((const char __user *)cn.corename);
+ set_fs(old_fs);
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ O_CREAT | 2 | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL,
0600);
if (IS_ERR(cprm.file))
goto fail_unlock;
@@ -659,11 +686,15 @@ void do_coredump(const siginfo_t *siginfo)
if (!S_ISREG(inode->i_mode))
goto close_fail;
/*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files.
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
*/
if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
+ if ((inode->i_mode & 0677) != 0600)
+ goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 355c522..b862bc2 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
default:
diff --git a/fs/dax.c b/fs/dax.c
index a7f77e1..7af8797 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -17,68 +17,89 @@
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
-int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
+{
+ struct request_queue *q = bdev->bd_queue;
+ long rc = -EIO;
+
+ dax->addr = (void __pmem *) ERR_PTR(-EIO);
+ if (blk_queue_enter(q, true) != 0)
+ return rc;
+
+ rc = bdev_direct_access(bdev, dax);
+ if (rc < 0) {
+ dax->addr = (void __pmem *) ERR_PTR(rc);
+ blk_queue_exit(q);
+ return rc;
+ }
+ return rc;
+}
+
+static void dax_unmap_atomic(struct block_device *bdev,
+ const struct blk_dax_ctl *dax)
+{
+ if (IS_ERR(dax->addr))
+ return;
+ blk_queue_exit(bdev->bd_queue);
+}
+
+/*
+ * dax_clear_blocks() is called from within transaction context from XFS,
+ * and hence this means the stack from this point must follow GFP_NOFS
+ * semantics for all operations.
+ */
+int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- sector_t sector = block << (inode->i_blkbits - 9);
+ struct blk_dax_ctl dax = {
+ .sector = block << (inode->i_blkbits - 9),
+ .size = _size,
+ };
might_sleep();
do {
- void *addr;
- unsigned long pfn;
- long count;
+ long count, sz;
- count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+ count = dax_map_atomic(bdev, &dax);
if (count < 0)
return count;
- BUG_ON(size < count);
- while (count > 0) {
- unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
- if (pgsz > count)
- pgsz = count;
- if (pgsz < PAGE_SIZE)
- memset(addr, 0, pgsz);
- else
- clear_page(addr);
- addr += pgsz;
- size -= pgsz;
- count -= pgsz;
- BUG_ON(pgsz & 511);
- sector += pgsz / 512;
- cond_resched();
- }
- } while (size);
-
+ sz = min_t(long, count, SZ_128K);
+ clear_pmem(dax.addr, sz);
+ dax.size -= sz;
+ dax.sector += sz / 512;
+ dax_unmap_atomic(bdev, &dax);
+ cond_resched();
+ } while (dax.size);
+
+ wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
-static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
-{
- unsigned long pfn;
- sector_t sector = bh->b_blocknr << (blkbits - 9);
- return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
-}
-
-static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
- loff_t end)
+/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
+static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
+ loff_t pos, loff_t end)
{
loff_t final = end - pos + first; /* The final byte of the buffer */
if (first > 0)
- memset(addr, 0, first);
+ clear_pmem(addr, first);
if (final < size)
- memset(addr + final, 0, size - final);
+ clear_pmem(addr + final, size - final);
}
static bool buffer_written(struct buffer_head *bh)
@@ -98,38 +119,50 @@ static bool buffer_size_valid(struct buffer_head *bh)
return bh->b_state != 0;
}
+
+static sector_t to_sector(const struct buffer_head *bh,
+ const struct inode *inode)
+{
+ sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+ return sector;
+}
+
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t start, loff_t end, get_block_t get_block,
struct buffer_head *bh)
{
- ssize_t retval = 0;
- loff_t pos = start;
- loff_t max = start;
- loff_t bh_max = start;
- void *addr;
- bool hole = false;
-
- if (iov_iter_rw(iter) != WRITE)
+ loff_t pos = start, max = start, bh_max = start;
+ bool hole = false, need_wmb = false;
+ struct block_device *bdev = NULL;
+ int rw = iov_iter_rw(iter), rc;
+ long map_len = 0;
+ struct blk_dax_ctl dax = {
+ .addr = (void __pmem *) ERR_PTR(-EIO),
+ };
+
+ if (rw == READ)
end = min(end, i_size_read(inode));
while (pos < end) {
- unsigned len;
+ size_t len;
if (pos == max) {
unsigned blkbits = inode->i_blkbits;
- sector_t block = pos >> blkbits;
+ long page = pos >> PAGE_SHIFT;
+ sector_t block = page << (PAGE_SHIFT - blkbits);
unsigned first = pos - (block << blkbits);
long size;
if (pos == bh_max) {
bh->b_size = PAGE_ALIGN(end - pos);
bh->b_state = 0;
- retval = get_block(inode, block, bh,
- iov_iter_rw(iter) == WRITE);
- if (retval)
+ rc = get_block(inode, block, bh, rw == WRITE);
+ if (rc)
break;
if (!buffer_size_valid(bh))
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
+ bdev = bh->b_bdev;
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -137,38 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size -= done;
}
- hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+ hole = rw == READ && !buffer_written(bh);
if (hole) {
- addr = NULL;
size = bh->b_size - first;
} else {
- retval = dax_get_addr(bh, &addr, blkbits);
- if (retval < 0)
+ dax_unmap_atomic(bdev, &dax);
+ dax.sector = to_sector(bh, inode);
+ dax.size = bh->b_size;
+ map_len = dax_map_atomic(bdev, &dax);
+ if (map_len < 0) {
+ rc = map_len;
break;
- if (buffer_unwritten(bh) || buffer_new(bh))
- dax_new_buf(addr, retval, first, pos,
- end);
- addr += first;
- size = retval - first;
+ }
+ if (buffer_unwritten(bh) || buffer_new(bh)) {
+ dax_new_buf(dax.addr, map_len, first,
+ pos, end);
+ need_wmb = true;
+ }
+ dax.addr += first;
+ size = map_len - first;
}
max = min(pos + size, end);
}
- if (iov_iter_rw(iter) == WRITE)
- len = copy_from_iter_nocache(addr, max - pos, iter);
- else if (!hole)
- len = copy_to_iter(addr, max - pos, iter);
+ if (iov_iter_rw(iter) == WRITE) {
+ len = copy_from_iter_pmem(dax.addr, max - pos, iter);
+ need_wmb = true;
+ } else if (!hole)
+ len = copy_to_iter((void __force *) dax.addr, max - pos,
+ iter);
else
len = iov_iter_zero(max - pos, iter);
- if (!len)
+ if (!len) {
+ rc = -EFAULT;
break;
+ }
pos += len;
- addr += len;
+ if (!IS_ERR(dax.addr))
+ dax.addr += len;
}
- return (pos == start) ? retval : pos - start;
+ if (need_wmb)
+ wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
+
+ return (pos == start) ? rc : pos - start;
}
/**
@@ -257,26 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
return VM_FAULT_LOCKED;
}
-static int copy_user_bh(struct page *to, struct buffer_head *bh,
- unsigned blkbits, unsigned long vaddr)
+static int copy_user_bh(struct page *to, struct inode *inode,
+ struct buffer_head *bh, unsigned long vaddr)
{
- void *vfrom, *vto;
- if (dax_get_addr(bh, &vfrom, blkbits) < 0)
- return -EIO;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
+ struct block_device *bdev = bh->b_bdev;
+ void *vto;
+
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
vto = kmap_atomic(to);
- copy_user_page(vto, vfrom, vaddr, to);
+ copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
kunmap_atomic(vto);
+ dax_unmap_atomic(bdev, &dax);
return 0;
}
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- void *addr;
- unsigned long pfn;
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev = bh->b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
pgoff_t size;
int error;
@@ -295,18 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
goto out;
}
- error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
- if (error < 0)
- goto out;
- if (error < PAGE_SIZE) {
- error = -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0) {
+ error = PTR_ERR(dax.addr);
goto out;
}
- if (buffer_unwritten(bh) || buffer_new(bh))
- clear_page(addr);
+ if (buffer_unwritten(bh) || buffer_new(bh)) {
+ clear_pmem(dax.addr, PAGE_SIZE);
+ wmb_pmem();
+ }
+ dax_unmap_atomic(bdev, &dax);
- error = vm_insert_mixed(vma, vaddr, pfn);
+ error = vm_insert_mixed(vma, vaddr, dax.pfn);
out:
i_mmap_unlock_read(mapping);
@@ -400,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
if (buffer_written(&bh))
- error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+ error = copy_user_bh(new_page, inode, &bh, vaddr);
else
clear_user_highpage(new_page, vaddr);
if (error)
@@ -494,6 +551,241 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+static void __dax_dbg(struct buffer_head *bh, unsigned long address,
+ const char *reason, const char *fn)
+{
+ if (bh) {
+ char bname[BDEVNAME_SIZE];
+ bdevname(bh->b_bdev, bname);
+ pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
+ "length %zd fallback: %s\n", fn, current->comm,
+ address, bname, bh->b_state, (u64)bh->b_blocknr,
+ bh->b_size, reason);
+ } else {
+ pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
+ current->comm, address, reason);
+ }
+}
+
+#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ struct block_device *bdev;
+ pgoff_t size, pgoff;
+ sector_t block;
+ int result = 0;
+
+ /* dax pmd mappings require pfn_t_devmap() */
+ if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
+ return VM_FAULT_FALLBACK;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED)) {
+ split_huge_pmd(vma, pmd, address);
+ dax_pmd_dbg(NULL, address, "cow write");
+ return VM_FAULT_FALLBACK;
+ }
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start) {
+ dax_pmd_dbg(NULL, address, "vma start unaligned");
+ return VM_FAULT_FALLBACK;
+ }
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
+ dax_pmd_dbg(NULL, address, "vma end unaligned");
+ return VM_FAULT_FALLBACK;
+ }
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(NULL, address,
+ "offset + huge page size > file size");
+ return VM_FAULT_FALLBACK;
+ }
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ if (get_block(inode, block, &bh, write) != 0)
+ return VM_FAULT_SIGBUS;
+ bdev = bh.b_bdev;
+ i_mmap_lock_read(mapping);
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "allocated block too small");
+ goto fallback;
+ }
+
+ /*
+ * If we allocated new storage, make sure no process has any
+ * zero pages covering this hole
+ */
+ if (buffer_new(&bh)) {
+ i_mmap_unlock_read(mapping);
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+ i_mmap_lock_read(mapping);
+ }
+
+ /*
+ * If a truncate happened while we were allocating blocks, we may
+ * leave blocks allocated to the file that are beyond EOF. We can't
+ * take i_mutex here, so just leave them hanging; they'll be freed
+ * when the file is deleted.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(&bh, address, "pgoff unaligned");
+ goto fallback;
+ }
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ spinlock_t *ptl;
+ pmd_t entry;
+ struct page *zero_page = get_huge_zero_page();
+
+ if (unlikely(!zero_page)) {
+ dax_pmd_dbg(&bh, address, "no zero page");
+ goto fallback;
+ }
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ dax_pmd_dbg(&bh, address, "pmd already present");
+ goto fallback;
+ }
+
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
+ __func__, current->comm, address,
+ (unsigned long long) to_sector(&bh, inode));
+
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+ result = VM_FAULT_NOPAGE;
+ spin_unlock(ptl);
+ } else {
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PMD_SIZE,
+ };
+ long length = dax_map_atomic(bdev, &dax);
+
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if (length < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "dax-length too small");
+ dax_unmap_atomic(bdev, &dax);
+ goto fallback;
+ }
+ if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
+ dax_pmd_dbg(&bh, address, "pfn unaligned");
+ dax_unmap_atomic(bdev, &dax);
+ goto fallback;
+ }
+
+ if (!pfn_t_devmap(dax.pfn)) {
+ dax_unmap_atomic(bdev, &dax);
+ dax_pmd_dbg(&bh, address, "pfn not in memmap");
+ goto fallback;
+ }
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ clear_pmem(dax.addr, PMD_SIZE);
+ wmb_pmem();
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+ dax_unmap_atomic(bdev, &dax);
+
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: %lx sect: %llx\n",
+ __func__, current->comm, address,
+ pfn_t_to_pfn(dax.pfn),
+ (unsigned long long) dax.sector);
+ result |= vmf_insert_pfn_pmd(vma, address, pmd,
+ dax.pfn, write);
+ }
+
+ out:
+ i_mmap_unlock_read(mapping);
+
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
@@ -548,11 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
if (err < 0)
return err;
if (buffer_written(&bh)) {
- void *addr;
- err = dax_get_addr(&bh, &addr, inode->i_blkbits);
- if (err < 0)
- return err;
- memset(addr + offset, 0, length);
+ struct block_device *bdev = bh.b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PAGE_CACHE_SIZE,
+ };
+
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
+ wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
}
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 9b5fe50..b4539e8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
- struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT);
if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
@@ -1734,7 +1735,7 @@ static unsigned d_flags_for_inode(struct inode *inode)
}
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
- if (unlikely(inode->i_op->follow_link)) {
+ if (unlikely(inode->i_op->get_link)) {
add_flags = DCACHE_SYMLINK_TYPE;
goto type_determined;
}
@@ -2718,7 +2719,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* This helper attempts to cope with remotely renamed directories
*
* It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock
+ * dentry->d_parent->d_inode->i_mutex, and rename_lock
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
@@ -2744,7 +2745,6 @@ out_unalias:
__d_move(alias, dentry, false);
ret = 0;
out_err:
- spin_unlock(&inode->i_lock);
if (m2)
mutex_unlock(m2);
if (m1)
@@ -2790,10 +2790,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
if (S_ISDIR(inode->i_mode)) {
struct dentry *new = __d_find_any_alias(inode);
if (unlikely(new)) {
+ /* The reference to new ensures it remains an alias */
+ spin_unlock(&inode->i_lock);
write_seqlock(&rename_lock);
if (unlikely(d_ancestor(new, dentry))) {
write_sequnlock(&rename_lock);
- spin_unlock(&inode->i_lock);
dput(new);
new = ERR_PTR(-ELOOP);
pr_warn_ratelimited(
@@ -2812,7 +2813,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
} else {
__d_move(new, dentry, false);
write_sequnlock(&rename_lock);
- spin_unlock(&inode->i_lock);
security_d_instantiate(new, inode);
}
iput(inode);
@@ -2926,6 +2926,13 @@ restart:
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
+ /* Escaped? */
+ if (dentry != vfsmnt->mnt_root) {
+ bptr = *buffer;
+ blen = *buflen;
+ error = 3;
+ break;
+ }
/* Global root? */
if (mnt != parent) {
dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
@@ -3297,18 +3304,18 @@ out:
* @new_dentry: new dentry
* @old_dentry: old dentry
*
- * Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
- * Returns 0 otherwise.
+ * Returns true if new_dentry is a subdirectory of the parent (at any depth).
+ * Returns false otherwise.
* Caller must ensure that "new_dentry" is pinned before calling is_subdir()
*/
-int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
- int result;
+ bool result;
unsigned seq;
if (new_dentry == old_dentry)
- return 1;
+ return true;
do {
/* for restarting inner loop in case of seq retry */
@@ -3319,9 +3326,9 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
*/
rcu_read_lock();
if (d_ancestor(old_dentry, new_dentry))
- result = 1;
+ result = true;
else
- result = 0;
+ result = false;
rcu_read_unlock();
} while (read_seqretry(&rename_lock, seq));
@@ -3409,7 +3416,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 284f9aa..d2ba12e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -42,6 +42,22 @@ const struct file_operations debugfs_file_operations = {
.llseek = noop_llseek,
};
+static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_wo);
+
+ return debugfs_create_file(name, mode, parent, value, fops);
+}
+
static int debugfs_u8_set(void *data, u64 val)
{
*(u8 *)data = val;
@@ -83,14 +99,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ &fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -135,14 +145,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ &fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -187,14 +191,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ &fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -240,17 +238,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ &fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
+static int debugfs_ulong_set(void *data, u64 val)
+{
+ *(unsigned long *)data = val;
+ return 0;
+}
+
+static int debugfs_ulong_get(void *data, u64 *val)
+{
+ *val = *(unsigned long *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+
+/**
+ * debugfs_create_ulong - create a debugfs file that is used to read and write
+ * an unsigned long value.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned. It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+ struct dentry *parent, unsigned long *value)
+{
+ return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
+ &fops_ulong_ro, &fops_ulong_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_ulong);
+
DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
@@ -264,6 +304,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -286,14 +328,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ &fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -310,14 +346,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ &fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -334,14 +364,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ &fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -358,7 +382,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_x64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ &fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -375,6 +400,8 @@ static int debugfs_size_t_get(void *data, u64 *val)
}
DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
"%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -389,7 +416,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -422,24 +450,16 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[3];
- u32 *val = file->private_data;
+ bool *val = file->private_data;
if (*val)
buf[0] = 'Y';
@@ -449,14 +469,15 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[32];
size_t buf_size;
bool bv;
- u32 *val = file->private_data;
+ bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
@@ -468,10 +489,23 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
return count;
}
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
static const struct file_operations fops_bool = {
- .read = read_file_bool,
- .write = write_file_bool,
+ .read = debugfs_read_file_bool,
+ .write = debugfs_write_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_ro = {
+ .read = debugfs_read_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_wo = {
+ .write = debugfs_write_file_bool,
.open = simple_open,
.llseek = default_llseek,
};
@@ -501,9 +535,10 @@ static const struct file_operations fops_bool = {
* code.
*/
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+ struct dentry *parent, bool *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_bool);
+ return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ &fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c711be8..b7fcc0d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -271,8 +271,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
+
+ if (IS_ERR(dentry)) {
mutex_unlock(&d_inode(parent)->i_mutex);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ }
+
return dentry;
}
@@ -533,7 +537,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
/**
* debugfs_remove - removes a file or directory from the debugfs filesystem
* @dentry: a pointer to a the dentry of the file or directory to be
- * removed.
+ * removed. If this parameter is NULL or an error value, nothing
+ * will be done.
*
* This function removes a file or directory in debugfs that was previously
* created with a call to another debugfs function (like
@@ -565,7 +570,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
/**
* debugfs_remove_recursive - recursively removes a directory
- * @dentry: a pointer to a the dentry of the directory to be removed.
+ * @dentry: a pointer to a the dentry of the directory to be removed. If this
+ * parameter is NULL or an error value, nothing will be done.
*
* This function recursively removes a directory tree in debugfs that
* was previously created with a call to another debugfs function
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 745d234..602e844 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -109,6 +109,8 @@ struct dio_submit {
struct dio {
int flags; /* doesn't change */
int rw;
+ blk_qc_t bio_cookie;
+ struct block_device *bio_bdev;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
dio_iodone_t *end_io; /* IO completion function */
@@ -120,6 +122,7 @@ struct dio {
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
bool defer_completion; /* defer AIO completion to workqueue? */
+ bool should_dirty; /* if pages should be dirtied */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -285,7 +288,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
* Asynchronous IO callback.
*/
-static void dio_bio_end_aio(struct bio *bio, int error)
+static void dio_bio_end_aio(struct bio *bio)
{
struct dio *dio = bio->bi_private;
unsigned long remaining;
@@ -318,7 +321,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
* During I/O bi_private points at the dio. After I/O, bi_private is used to
* implement a singly-linked list of completed BIOs, at dio->bio_list.
*/
-static void dio_bio_end_io(struct bio *bio, int error)
+static void dio_bio_end_io(struct bio *bio)
{
struct dio *dio = bio->bi_private;
unsigned long flags;
@@ -345,9 +348,9 @@ void dio_end_io(struct bio *bio, int error)
struct dio *dio = bio->bi_private;
if (dio->is_async)
- dio_bio_end_aio(bio, error);
+ dio_bio_end_aio(bio);
else
- dio_bio_end_io(bio, error);
+ dio_bio_end_io(bio);
}
EXPORT_SYMBOL_GPL(dio_end_io);
@@ -360,7 +363,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
/*
* bio_alloc() is guaranteed to return a bio when called with
- * __GFP_WAIT and we request a valid number of vectors.
+ * __GFP_RECLAIM and we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
@@ -393,14 +396,17 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->rw == READ)
+ if (dio->is_async && dio->rw == READ && dio->should_dirty)
bio_set_pages_dirty(bio);
- if (sdio->submit_io)
+ dio->bio_bdev = bio->bi_bdev;
+
+ if (sdio->submit_io) {
sdio->submit_io(dio->rw, bio, dio->inode,
sdio->logical_offset_in_bio);
- else
- submit_bio(dio->rw, bio);
+ dio->bio_cookie = BLK_QC_T_NONE;
+ } else
+ dio->bio_cookie = submit_bio(dio->rw, bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -439,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- io_schedule();
+ if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -457,26 +464,29 @@ static struct bio *dio_await_one(struct dio *dio)
*/
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec;
unsigned i;
+ int err;
- if (!uptodate)
+ if (bio->bi_error)
dio->io_error = -EIO;
- if (dio->is_async && dio->rw == READ) {
+ if (dio->is_async && dio->rw == READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
+ err = bio->bi_error;
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- if (dio->rw == READ && !PageCompound(page))
+ if (dio->rw == READ && !PageCompound(page) &&
+ dio->should_dirty)
set_page_dirty_lock(page);
page_cache_release(page);
}
+ err = bio->bi_error;
bio_put(bio);
}
- return uptodate ? 0 : -EIO;
+ return err;
}
/*
@@ -653,7 +663,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
if (ret)
goto out;
sector = start_sector << (sdio->blkbits - 9);
- nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
+ nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
BUG_ON(nr_pages <= 0);
dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
sdio->boundary = 0;
@@ -1159,6 +1169,16 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
}
}
+ /* Once we sampled i_size check for reads beyond EOF */
+ dio->i_size = i_size_read(inode);
+ if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
+ if (dio->flags & DIO_LOCKING)
+ mutex_unlock(&inode->i_mutex);
+ kmem_cache_free(dio_cache, dio);
+ retval = 0;
+ goto out;
+ }
+
/*
* For file extending writes updating i_size before data writeouts
* complete can expose uninitialized blocks in dumb filesystems.
@@ -1212,11 +1232,11 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
sdio.next_block_for_io = -1;
dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
+ dio->should_dirty = (iter->type == ITER_IOVEC);
sdio.iter = iter;
sdio.final_block_in_request =
(offset + iov_iter_count(iter)) >> blkbits;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index d521bdd..8e294fb 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -61,35 +61,8 @@ static struct config_item *make_node(struct config_group *, const char *);
static void drop_node(struct config_group *, struct config_item *);
static void release_node(struct config_item *);
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_cluster(struct config_item *i,
- struct configfs_attribute *a,
- const char *buf, size_t len);
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len);
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len);
-
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf);
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
- size_t len);
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
- size_t len);
+static struct configfs_attribute *comm_attrs[];
+static struct configfs_attribute *node_attrs[];
struct dlm_cluster {
struct config_group group;
@@ -108,6 +81,12 @@ struct dlm_cluster {
char cl_cluster_name[DLM_LOCKSPACE_LEN];
};
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
+{
+ return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+ NULL;
+}
+
enum {
CLUSTER_ATTR_TCP_PORT = 0,
CLUSTER_ATTR_BUFFER_SIZE,
@@ -124,33 +103,24 @@ enum {
CLUSTER_ATTR_CLUSTER_NAME,
};
-struct cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_cluster *, char *);
- ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
-};
-
-static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf)
{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
return sprintf(buf, "%s\n", cl->cl_cluster_name);
}
-static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+static ssize_t cluster_cluster_name_store(struct config_item *item,
const char *buf, size_t len)
{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
+
strlcpy(dlm_config.ci_cluster_name, buf,
sizeof(dlm_config.ci_cluster_name));
strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
-static struct cluster_attribute cluster_attr_cluster_name = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "cluster_name",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = cluster_cluster_name_read,
- .store = cluster_cluster_name_write,
-};
+CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
int *info_field, int check_zero,
@@ -175,17 +145,19 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
}
#define CLUSTER_ATTR(name, check_zero) \
-static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
+static ssize_t cluster_##name##_store(struct config_item *item, \
+ const char *buf, size_t len) \
{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
check_zero, buf, len); \
} \
-static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \
+static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
} \
-static struct cluster_attribute cluster_attr_##name = \
-__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+CONFIGFS_ATTR(cluster_, name);
CLUSTER_ATTR(tcp_port, 1);
CLUSTER_ATTR(buffer_size, 1);
@@ -201,19 +173,19 @@ CLUSTER_ATTR(new_rsb_count, 0);
CLUSTER_ATTR(recover_callbacks, 0);
static struct configfs_attribute *cluster_attrs[] = {
- [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
- [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
- [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
- [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
- [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
- [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
- [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
- [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
- [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
- [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
- [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
- [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
- [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
+ [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
+ [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size,
+ [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size,
+ [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer,
+ [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
+ [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
+ [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+ [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
+ [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
+ [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+ [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
+ [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
+ [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
NULL,
};
@@ -224,83 +196,11 @@ enum {
COMM_ATTR_ADDR_LIST,
};
-struct comm_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_comm *, char *);
- ssize_t (*store)(struct dlm_comm *, const char *, size_t);
-};
-
-static struct comm_attribute comm_attr_nodeid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "nodeid",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = comm_nodeid_read,
- .store = comm_nodeid_write,
-};
-
-static struct comm_attribute comm_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = comm_local_read,
- .store = comm_local_write,
-};
-
-static struct comm_attribute comm_attr_addr = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "addr",
- .ca_mode = S_IWUSR },
- .store = comm_addr_write,
-};
-
-static struct comm_attribute comm_attr_addr_list = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "addr_list",
- .ca_mode = S_IRUGO },
- .show = comm_addr_list_read,
-};
-
-static struct configfs_attribute *comm_attrs[] = {
- [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
- [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
- [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
- [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr,
- NULL,
-};
-
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
};
-struct node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_node *, char *);
- ssize_t (*store)(struct dlm_node *, const char *, size_t);
-};
-
-static struct node_attribute node_attr_nodeid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "nodeid",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = node_nodeid_read,
- .store = node_nodeid_write,
-};
-
-static struct node_attribute node_attr_weight = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "weight",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = node_weight_read,
- .store = node_weight_write,
-};
-
-static struct configfs_attribute *node_attrs[] = {
- [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
- [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
- NULL,
-};
-
struct dlm_clusters {
struct configfs_subsystem subsys;
};
@@ -349,8 +249,6 @@ static struct configfs_group_operations clusters_ops = {
static struct configfs_item_operations cluster_ops = {
.release = release_cluster,
- .show_attribute = show_cluster,
- .store_attribute = store_cluster,
};
static struct configfs_group_operations spaces_ops = {
@@ -369,8 +267,6 @@ static struct configfs_group_operations comms_ops = {
static struct configfs_item_operations comm_ops = {
.release = release_comm,
- .show_attribute = show_comm,
- .store_attribute = store_comm,
};
static struct configfs_group_operations nodes_ops = {
@@ -380,8 +276,6 @@ static struct configfs_group_operations nodes_ops = {
static struct configfs_item_operations node_ops = {
.release = release_node,
- .show_attribute = show_node,
- .store_attribute = store_node,
};
static struct config_item_type clusters_type = {
@@ -427,12 +321,6 @@ static struct config_item_type node_type = {
.ct_owner = THIS_MODULE,
};
-static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
-{
- return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
- NULL;
-}
-
static struct dlm_space *config_item_to_space(struct config_item *i)
{
return i ? container_of(to_config_group(i), struct dlm_space, group) :
@@ -687,66 +575,30 @@ void dlm_config_exit(void)
* Functions for user space to read/write attributes
*/
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_cluster *cl = config_item_to_cluster(i);
- struct cluster_attribute *cla =
- container_of(a, struct cluster_attribute, attr);
- return cla->show ? cla->show(cl, buf) : 0;
-}
-
-static ssize_t store_cluster(struct config_item *i,
- struct configfs_attribute *a,
- const char *buf, size_t len)
+static ssize_t comm_nodeid_show(struct config_item *item, char *buf)
{
- struct dlm_cluster *cl = config_item_to_cluster(i);
- struct cluster_attribute *cla =
- container_of(a, struct cluster_attribute, attr);
- return cla->store ? cla->store(cl, buf, len) : -EINVAL;
-}
-
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_comm *cm = config_item_to_comm(i);
- struct comm_attribute *cma =
- container_of(a, struct comm_attribute, attr);
- return cma->show ? cma->show(cm, buf) : 0;
-}
-
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_comm *cm = config_item_to_comm(i);
- struct comm_attribute *cma =
- container_of(a, struct comm_attribute, attr);
- return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+ return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid);
}
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
-{
- return sprintf(buf, "%d\n", cm->nodeid);
-}
-
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &cm->nodeid);
+ int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid);
if (rc)
return rc;
return len;
}
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_local_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", cm->local);
+ return sprintf(buf, "%d\n", config_item_to_comm(item)->local);
}
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_local_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
int rc = kstrtoint(buf, 0, &cm->local);
if (rc)
@@ -756,8 +608,10 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
return len;
}
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_store(struct config_item *item, const char *buf,
+ size_t len)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
struct sockaddr_storage *addr;
int rv;
@@ -783,8 +637,9 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
return len;
}
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_addr_list_show(struct config_item *item, char *buf)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
ssize_t s;
ssize_t allowance;
int i;
@@ -827,32 +682,28 @@ static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
return 4096 - allowance;
}
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_node *nd = config_item_to_node(i);
- struct node_attribute *nda =
- container_of(a, struct node_attribute, attr);
- return nda->show ? nda->show(nd, buf) : 0;
-}
+CONFIGFS_ATTR(comm_, nodeid);
+CONFIGFS_ATTR(comm_, local);
+CONFIGFS_ATTR_WO(comm_, addr);
+CONFIGFS_ATTR_RO(comm_, addr_list);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_node *nd = config_item_to_node(i);
- struct node_attribute *nda =
- container_of(a, struct node_attribute, attr);
- return nda->store ? nda->store(nd, buf, len) : -EINVAL;
-}
+static struct configfs_attribute *comm_attrs[] = {
+ [COMM_ATTR_NODEID] = &comm_attr_nodeid,
+ [COMM_ATTR_LOCAL] = &comm_attr_local,
+ [COMM_ATTR_ADDR] = &comm_attr_addr,
+ [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list,
+ NULL,
+};
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
+static ssize_t node_nodeid_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", nd->nodeid);
+ return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid);
}
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_node *nd = config_item_to_node(item);
uint32_t seq = 0;
int rc = kstrtoint(buf, 0, &nd->nodeid);
@@ -863,21 +714,30 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
return len;
}
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
+static ssize_t node_weight_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", nd->weight);
+ return sprintf(buf, "%d\n", config_item_to_node(item)->weight);
}
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_weight_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &nd->weight);
+ int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight);
if (rc)
return rc;
return len;
}
+CONFIGFS_ATTR(node_, nodeid);
+CONFIGFS_ATTR(node_, weight);
+
+static struct configfs_attribute *node_attrs[] = {
+ [NODE_ATTR_NODEID] = &node_attr_nodeid,
+ [NODE_ATTR_WEIGHT] = &node_attr_weight,
+ NULL,
+};
+
/*
* Functions for the dlm to get the info that's been configured
*/
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 754fd6c..3a37bd3 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -120,12 +120,11 @@ struct connection {
struct cbuf cb;
int retries;
#define MAX_CONNECT_RETRIES 3
- int sctp_assoc;
struct hlist_node list;
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
- bool try_new_addr;
+ void (*orig_error_report)(struct sock *sk);
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -252,26 +251,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
return con;
}
-/* This is a bit drastic, but only called when things go wrong */
-static struct connection *assoc2con(int assoc_id)
-{
- int i;
- struct connection *con;
-
- mutex_lock(&connections_lock);
-
- for (i = 0 ; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry(con, &connection_hash[i], list) {
- if (con->sctp_assoc == assoc_id) {
- mutex_unlock(&connections_lock);
- return con;
- }
- }
- }
- mutex_unlock(&connections_lock);
- return NULL;
-}
-
static struct dlm_node_addr *find_node_addr(int nodeid)
{
struct dlm_node_addr *na;
@@ -322,14 +301,14 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
if (na && na->addr_count) {
+ memcpy(&sas, na->addr[na->curr_addr_index],
+ sizeof(struct sockaddr_storage));
+
if (try_new_addr) {
na->curr_addr_index++;
if (na->curr_addr_index == na->addr_count)
na->curr_addr_index = 0;
}
-
- memcpy(&sas, na->addr[na->curr_addr_index ],
- sizeof(struct sockaddr_storage));
}
spin_unlock(&dlm_node_addrs_spin);
@@ -442,7 +421,7 @@ static void lowcomms_write_space(struct sock *sk)
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
con->sock->sk->sk_write_pending--;
- clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+ clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
}
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
@@ -459,18 +438,23 @@ static inline void lowcomms_connect_sock(struct connection *con)
static void lowcomms_state_change(struct sock *sk)
{
- if (sk->sk_state == TCP_ESTABLISHED)
+ /* SCTP layer is not calling sk_data_ready when the connection
+ * is done, so we catch the signal through here. Also, it
+ * doesn't switch socket state when entering shutdown, so we
+ * skip the write in that case.
+ */
+ if (sk->sk_shutdown) {
+ if (sk->sk_shutdown == RCV_SHUTDOWN)
+ lowcomms_data_ready(sk);
+ } else if (sk->sk_state == TCP_ESTABLISHED) {
lowcomms_write_space(sk);
+ }
}
int dlm_lowcomms_connect_node(int nodeid)
{
struct connection *con;
- /* with sctp there's no connecting without sending */
- if (dlm_config.ci_protocol != 0)
- return 0;
-
if (nodeid == dlm_our_nodeid())
return 0;
@@ -481,6 +465,43 @@ int dlm_lowcomms_connect_node(int nodeid)
return 0;
}
+static void lowcomms_error_report(struct sock *sk)
+{
+ struct connection *con = sock2con(sk);
+ struct sockaddr_storage saddr;
+
+ if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) {
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "sending to node %d, port %d, "
+ "sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, dlm_config.ci_tcp_port,
+ sk->sk_err, sk->sk_err_soft);
+ return;
+ } else if (saddr.ss_family == AF_INET) {
+ struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
+
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "sending to node %d at %pI4, port %d, "
+ "sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, &sin4->sin_addr.s_addr,
+ dlm_config.ci_tcp_port, sk->sk_err,
+ sk->sk_err_soft);
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&saddr;
+
+ printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
+ "sending to node %d at %u.%u.%u.%u, "
+ "port %d, sk_err=%d/%d\n", dlm_our_nodeid(),
+ con->nodeid, sin6->sin6_addr.s6_addr32[0],
+ sin6->sin6_addr.s6_addr32[1],
+ sin6->sin6_addr.s6_addr32[2],
+ sin6->sin6_addr.s6_addr32[3],
+ dlm_config.ci_tcp_port, sk->sk_err,
+ sk->sk_err_soft);
+ }
+ con->orig_error_report(sk);
+}
+
/* Make a socket active */
static void add_sock(struct socket *sock, struct connection *con)
{
@@ -492,6 +513,8 @@ static void add_sock(struct socket *sock, struct connection *con)
con->sock->sk->sk_state_change = lowcomms_state_change;
con->sock->sk->sk_user_data = con;
con->sock->sk->sk_allocation = GFP_NOFS;
+ con->orig_error_report = con->sock->sk->sk_error_report;
+ con->sock->sk->sk_error_report = lowcomms_error_report;
}
/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -514,17 +537,24 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
}
/* Close a remote connection and tidy up */
-static void close_connection(struct connection *con, bool and_other)
+static void close_connection(struct connection *con, bool and_other,
+ bool tx, bool rx)
{
- mutex_lock(&con->sock_mutex);
+ clear_bit(CF_CONNECT_PENDING, &con->flags);
+ clear_bit(CF_WRITE_PENDING, &con->flags);
+ if (tx && cancel_work_sync(&con->swork))
+ log_print("canceled swork for node %d", con->nodeid);
+ if (rx && cancel_work_sync(&con->rwork))
+ log_print("canceled rwork for node %d", con->nodeid);
+ mutex_lock(&con->sock_mutex);
if (con->sock) {
sock_release(con->sock);
con->sock = NULL;
}
if (con->othercon && and_other) {
/* Will only re-enter once. */
- close_connection(con->othercon, false);
+ close_connection(con->othercon, false, true, true);
}
if (con->rx_page) {
__free_page(con->rx_page);
@@ -535,254 +565,6 @@ static void close_connection(struct connection *con, bool and_other)
mutex_unlock(&con->sock_mutex);
}
-/* We only send shutdown messages to nodes that are not part of the cluster */
-static void sctp_send_shutdown(sctp_assoc_t associd)
-{
- static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
- struct msghdr outmessage;
- struct cmsghdr *cmsg;
- struct sctp_sndrcvinfo *sinfo;
- int ret;
- struct connection *con;
-
- con = nodeid2con(0,0);
- BUG_ON(con == NULL);
-
- outmessage.msg_name = NULL;
- outmessage.msg_namelen = 0;
- outmessage.msg_control = outcmsg;
- outmessage.msg_controllen = sizeof(outcmsg);
- outmessage.msg_flags = MSG_EOR;
-
- cmsg = CMSG_FIRSTHDR(&outmessage);
- cmsg->cmsg_level = IPPROTO_SCTP;
- cmsg->cmsg_type = SCTP_SNDRCV;
- cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
- outmessage.msg_controllen = cmsg->cmsg_len;
- sinfo = CMSG_DATA(cmsg);
- memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-
- sinfo->sinfo_flags |= MSG_EOF;
- sinfo->sinfo_assoc_id = associd;
-
- ret = kernel_sendmsg(con->sock, &outmessage, NULL, 0, 0);
-
- if (ret != 0)
- log_print("send EOF to node failed: %d", ret);
-}
-
-static void sctp_init_failed_foreach(struct connection *con)
-{
-
- /*
- * Don't try to recover base con and handle race where the
- * other node's assoc init creates a assoc and we get that
- * notification, then we get a notification that our attempt
- * failed due. This happens when we are still trying the primary
- * address, but the other node has already tried secondary addrs
- * and found one that worked.
- */
- if (!con->nodeid || con->sctp_assoc)
- return;
-
- log_print("Retrying SCTP association init for node %d\n", con->nodeid);
-
- con->try_new_addr = true;
- con->sctp_assoc = 0;
- if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
- if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
- queue_work(send_workqueue, &con->swork);
- }
-}
-
-/* INIT failed but we don't know which node...
- restart INIT on all pending nodes */
-static void sctp_init_failed(void)
-{
- mutex_lock(&connections_lock);
-
- foreach_conn(sctp_init_failed_foreach);
-
- mutex_unlock(&connections_lock);
-}
-
-static void retry_failed_sctp_send(struct connection *recv_con,
- struct sctp_send_failed *sn_send_failed,
- char *buf)
-{
- int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
- struct dlm_mhandle *mh;
- struct connection *con;
- char *retry_buf;
- int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
-
- log_print("Retry sending %d bytes to node id %d", len, nodeid);
-
- if (!nodeid) {
- log_print("Shouldn't resend data via listening connection.");
- return;
- }
-
- con = nodeid2con(nodeid, 0);
- if (!con) {
- log_print("Could not look up con for nodeid %d\n",
- nodeid);
- return;
- }
-
- mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
- if (!mh) {
- log_print("Could not allocate buf for retry.");
- return;
- }
- memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
- dlm_lowcomms_commit_buffer(mh);
-
- /*
- * If we got a assoc changed event before the send failed event then
- * we only need to retry the send.
- */
- if (con->sctp_assoc) {
- if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
- queue_work(send_workqueue, &con->swork);
- } else
- sctp_init_failed_foreach(con);
-}
-
-/* Something happened to an association */
-static void process_sctp_notification(struct connection *con,
- struct msghdr *msg, char *buf)
-{
- union sctp_notification *sn = (union sctp_notification *)buf;
- struct linger linger;
-
- switch (sn->sn_header.sn_type) {
- case SCTP_SEND_FAILED:
- retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
- break;
- case SCTP_ASSOC_CHANGE:
- switch (sn->sn_assoc_change.sac_state) {
- case SCTP_COMM_UP:
- case SCTP_RESTART:
- {
- /* Check that the new node is in the lockspace */
- struct sctp_prim prim;
- int nodeid;
- int prim_len, ret;
- int addr_len;
- struct connection *new_con;
-
- /*
- * We get this before any data for an association.
- * We verify that the node is in the cluster and
- * then peel off a socket for it.
- */
- if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
- log_print("COMM_UP for invalid assoc ID %d",
- (int)sn->sn_assoc_change.sac_assoc_id);
- sctp_init_failed();
- return;
- }
- memset(&prim, 0, sizeof(struct sctp_prim));
- prim_len = sizeof(struct sctp_prim);
- prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
-
- ret = kernel_getsockopt(con->sock,
- IPPROTO_SCTP,
- SCTP_PRIMARY_ADDR,
- (char*)&prim,
- &prim_len);
- if (ret < 0) {
- log_print("getsockopt/sctp_primary_addr on "
- "new assoc %d failed : %d",
- (int)sn->sn_assoc_change.sac_assoc_id,
- ret);
-
- /* Retry INIT later */
- new_con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
- if (new_con)
- clear_bit(CF_CONNECT_PENDING, &con->flags);
- return;
- }
- make_sockaddr(&prim.ssp_addr, 0, &addr_len);
- if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
- unsigned char *b=(unsigned char *)&prim.ssp_addr;
- log_print("reject connect from unknown addr");
- print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
- b, sizeof(struct sockaddr_storage));
- sctp_send_shutdown(prim.ssp_assoc_id);
- return;
- }
-
- new_con = nodeid2con(nodeid, GFP_NOFS);
- if (!new_con)
- return;
-
- /* Peel off a new sock */
- lock_sock(con->sock->sk);
- ret = sctp_do_peeloff(con->sock->sk,
- sn->sn_assoc_change.sac_assoc_id,
- &new_con->sock);
- release_sock(con->sock->sk);
- if (ret < 0) {
- log_print("Can't peel off a socket for "
- "connection %d to node %d: err=%d",
- (int)sn->sn_assoc_change.sac_assoc_id,
- nodeid, ret);
- return;
- }
- add_sock(new_con->sock, new_con);
-
- linger.l_onoff = 1;
- linger.l_linger = 0;
- ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
- (char *)&linger, sizeof(linger));
- if (ret < 0)
- log_print("set socket option SO_LINGER failed");
-
- log_print("connecting to %d sctp association %d",
- nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
-
- new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
- new_con->try_new_addr = false;
- /* Send any pending writes */
- clear_bit(CF_CONNECT_PENDING, &new_con->flags);
- clear_bit(CF_INIT_PENDING, &new_con->flags);
- if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
- queue_work(send_workqueue, &new_con->swork);
- }
- if (!test_and_set_bit(CF_READ_PENDING, &new_con->flags))
- queue_work(recv_workqueue, &new_con->rwork);
- }
- break;
-
- case SCTP_COMM_LOST:
- case SCTP_SHUTDOWN_COMP:
- {
- con = assoc2con(sn->sn_assoc_change.sac_assoc_id);
- if (con) {
- con->sctp_assoc = 0;
- }
- }
- break;
-
- case SCTP_CANT_STR_ASSOC:
- {
- /* Will retry init when we get the send failed notification */
- log_print("Can't start SCTP association - retrying");
- }
- break;
-
- default:
- log_print("unexpected SCTP assoc change id=%d state=%d",
- (int)sn->sn_assoc_change.sac_assoc_id,
- sn->sn_assoc_change.sac_state);
- }
- default:
- ; /* fall through */
- }
-}
-
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
@@ -793,7 +575,6 @@ static int receive_from_sock(struct connection *con)
int r;
int call_again_soon = 0;
int nvec;
- char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
mutex_lock(&con->sock_mutex);
@@ -801,6 +582,10 @@ static int receive_from_sock(struct connection *con)
ret = -EAGAIN;
goto out_close;
}
+ if (con->nodeid == 0) {
+ ret = -EINVAL;
+ goto out_close;
+ }
if (con->rx_page == NULL) {
/*
@@ -813,11 +598,6 @@ static int receive_from_sock(struct connection *con)
cbuf_init(&con->cb, PAGE_CACHE_SIZE);
}
- /* Only SCTP needs these really */
- memset(&incmsg, 0, sizeof(incmsg));
- msg.msg_control = incmsg;
- msg.msg_controllen = sizeof(incmsg);
-
/*
* iov[0] is the bit of the circular buffer between the current end
* point (cb.base + cb.len) and the end of the buffer.
@@ -843,31 +623,18 @@ static int receive_from_sock(struct connection *con)
MSG_DONTWAIT | MSG_NOSIGNAL);
if (ret <= 0)
goto out_close;
-
- /* Process SCTP notifications */
- if (msg.msg_flags & MSG_NOTIFICATION) {
- msg.msg_control = incmsg;
- msg.msg_controllen = sizeof(incmsg);
-
- process_sctp_notification(con, &msg,
- page_address(con->rx_page) + con->cb.base);
- mutex_unlock(&con->sock_mutex);
- return 0;
- }
- BUG_ON(con->nodeid == 0);
-
- if (ret == len)
+ else if (ret == len)
call_again_soon = 1;
+
cbuf_add(&con->cb, ret);
ret = dlm_process_incoming_buffer(con->nodeid,
page_address(con->rx_page),
con->cb.base, con->cb.len,
PAGE_CACHE_SIZE);
if (ret == -EBADMSG) {
- log_print("lowcomms: addr=%p, base=%u, len=%u, "
- "iov_len=%u, iov_base[0]=%p, read=%d",
- page_address(con->rx_page), con->cb.base, con->cb.len,
- len, iov[0].iov_base, r);
+ log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d",
+ page_address(con->rx_page), con->cb.base,
+ con->cb.len, r);
}
if (ret < 0)
goto out_close;
@@ -892,7 +659,7 @@ out_resched:
out_close:
mutex_unlock(&con->sock_mutex);
if (ret != -EAGAIN) {
- close_connection(con, false);
+ close_connection(con, false, true, false);
/* Reconnect when there is something to send */
}
/* Don't return success if we really got EOF */
@@ -1033,6 +800,120 @@ accept_err:
return result;
}
+static int sctp_accept_from_sock(struct connection *con)
+{
+ /* Check that the new node is in the lockspace */
+ struct sctp_prim prim;
+ int nodeid;
+ int prim_len, ret;
+ int addr_len;
+ struct connection *newcon;
+ struct connection *addcon;
+ struct socket *newsock;
+
+ mutex_lock(&connections_lock);
+ if (!dlm_allow_conn) {
+ mutex_unlock(&connections_lock);
+ return -1;
+ }
+ mutex_unlock(&connections_lock);
+
+ mutex_lock_nested(&con->sock_mutex, 0);
+
+ ret = kernel_accept(con->sock, &newsock, O_NONBLOCK);
+ if (ret < 0)
+ goto accept_err;
+
+ memset(&prim, 0, sizeof(struct sctp_prim));
+ prim_len = sizeof(struct sctp_prim);
+
+ ret = kernel_getsockopt(newsock, IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+ (char *)&prim, &prim_len);
+ if (ret < 0) {
+ log_print("getsockopt/sctp_primary_addr failed: %d", ret);
+ goto accept_err;
+ }
+
+ make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+ if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+ unsigned char *b = (unsigned char *)&prim.ssp_addr;
+
+ log_print("reject connect from unknown addr");
+ print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
+ b, sizeof(struct sockaddr_storage));
+ goto accept_err;
+ }
+
+ newcon = nodeid2con(nodeid, GFP_NOFS);
+ if (!newcon) {
+ ret = -ENOMEM;
+ goto accept_err;
+ }
+
+ mutex_lock_nested(&newcon->sock_mutex, 1);
+
+ if (newcon->sock) {
+ struct connection *othercon = newcon->othercon;
+
+ if (!othercon) {
+ othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
+ if (!othercon) {
+ log_print("failed to allocate incoming socket");
+ mutex_unlock(&newcon->sock_mutex);
+ ret = -ENOMEM;
+ goto accept_err;
+ }
+ othercon->nodeid = nodeid;
+ othercon->rx_action = receive_from_sock;
+ mutex_init(&othercon->sock_mutex);
+ INIT_WORK(&othercon->swork, process_send_sockets);
+ INIT_WORK(&othercon->rwork, process_recv_sockets);
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
+ }
+ if (!othercon->sock) {
+ newcon->othercon = othercon;
+ othercon->sock = newsock;
+ newsock->sk->sk_user_data = othercon;
+ add_sock(newsock, othercon);
+ addcon = othercon;
+ } else {
+ printk("Extra connection from node %d attempted\n", nodeid);
+ ret = -EAGAIN;
+ mutex_unlock(&newcon->sock_mutex);
+ goto accept_err;
+ }
+ } else {
+ newsock->sk->sk_user_data = newcon;
+ newcon->rx_action = receive_from_sock;
+ add_sock(newsock, newcon);
+ addcon = newcon;
+ }
+
+ log_print("connected to %d", nodeid);
+
+ mutex_unlock(&newcon->sock_mutex);
+
+ /*
+ * Add it to the active queue in case we got data
+ * between processing the accept adding the socket
+ * to the read_sockets list
+ */
+ if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
+ queue_work(recv_workqueue, &addcon->rwork);
+ mutex_unlock(&con->sock_mutex);
+
+ return 0;
+
+accept_err:
+ mutex_unlock(&con->sock_mutex);
+ if (newsock)
+ sock_release(newsock);
+ if (ret != -EAGAIN)
+ log_print("error accepting connection from node: %d", ret);
+
+ return ret;
+}
+
static void free_entry(struct writequeue_entry *e)
{
__free_page(e->page);
@@ -1057,97 +938,129 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
}
}
+/*
+ * sctp_bind_addrs - bind a SCTP socket to all our addresses
+ */
+static int sctp_bind_addrs(struct connection *con, uint16_t port)
+{
+ struct sockaddr_storage localaddr;
+ int i, addr_len, result = 0;
+
+ for (i = 0; i < dlm_local_count; i++) {
+ memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+ make_sockaddr(&localaddr, port, &addr_len);
+
+ if (!i)
+ result = kernel_bind(con->sock,
+ (struct sockaddr *)&localaddr,
+ addr_len);
+ else
+ result = kernel_setsockopt(con->sock, SOL_SCTP,
+ SCTP_SOCKOPT_BINDX_ADD,
+ (char *)&localaddr, addr_len);
+
+ if (result < 0) {
+ log_print("Can't bind to %d addr number %d, %d.\n",
+ port, i + 1, result);
+ break;
+ }
+ }
+ return result;
+}
+
/* Initiate an SCTP association.
This is a special case of send_to_sock() in that we don't yet have a
peeled-off socket for this association, so we use the listening socket
and add the primary IP address of the remote node.
*/
-static void sctp_init_assoc(struct connection *con)
+static void sctp_connect_to_sock(struct connection *con)
{
- struct sockaddr_storage rem_addr;
- char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
- struct msghdr outmessage;
- struct cmsghdr *cmsg;
- struct sctp_sndrcvinfo *sinfo;
- struct connection *base_con;
- struct writequeue_entry *e;
- int len, offset;
- int ret;
- int addrlen;
- struct kvec iov[1];
+ struct sockaddr_storage daddr;
+ int one = 1;
+ int result;
+ int addr_len;
+ struct socket *sock;
+
+ if (con->nodeid == 0) {
+ log_print("attempt to connect sock 0 foiled");
+ return;
+ }
mutex_lock(&con->sock_mutex);
- if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
- goto unlock;
- if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
- con->try_new_addr)) {
+ /* Some odd races can cause double-connects, ignore them */
+ if (con->retries++ > MAX_CONNECT_RETRIES)
+ goto out;
+
+ if (con->sock) {
+ log_print("node %d already connected.", con->nodeid);
+ goto out;
+ }
+
+ memset(&daddr, 0, sizeof(daddr));
+ result = nodeid_to_addr(con->nodeid, &daddr, NULL, true);
+ if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
- goto unlock;
+ goto out;
}
- base_con = nodeid2con(0, 0);
- BUG_ON(base_con == NULL);
- make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
+ /* Create a socket to communicate with */
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_STREAM, IPPROTO_SCTP, &sock);
+ if (result < 0)
+ goto socket_err;
- outmessage.msg_name = &rem_addr;
- outmessage.msg_namelen = addrlen;
- outmessage.msg_control = outcmsg;
- outmessage.msg_controllen = sizeof(outcmsg);
- outmessage.msg_flags = MSG_EOR;
+ sock->sk->sk_user_data = con;
+ con->rx_action = receive_from_sock;
+ con->connect_action = sctp_connect_to_sock;
+ add_sock(sock, con);
- spin_lock(&con->writequeue_lock);
+ /* Bind to all addresses. */
+ if (sctp_bind_addrs(con, 0))
+ goto bind_err;
- if (list_empty(&con->writequeue)) {
- spin_unlock(&con->writequeue_lock);
- log_print("writequeue empty for nodeid %d", con->nodeid);
- goto unlock;
- }
+ make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len);
- e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
- len = e->len;
- offset = e->offset;
+ log_print("connecting to %d", con->nodeid);
- /* Send the first block off the write queue */
- iov[0].iov_base = page_address(e->page)+offset;
- iov[0].iov_len = len;
- spin_unlock(&con->writequeue_lock);
+ /* Turn off Nagle's algorithm */
+ kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one,
+ sizeof(one));
- if (rem_addr.ss_family == AF_INET) {
- struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
- log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
- } else {
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
- log_print("Trying to connect to %pI6", &sin6->sin6_addr);
- }
+ result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
+ O_NONBLOCK);
+ if (result == -EINPROGRESS)
+ result = 0;
+ if (result == 0)
+ goto out;
- cmsg = CMSG_FIRSTHDR(&outmessage);
- cmsg->cmsg_level = IPPROTO_SCTP;
- cmsg->cmsg_type = SCTP_SNDRCV;
- cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
- sinfo = CMSG_DATA(cmsg);
- memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
- sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
- outmessage.msg_controllen = cmsg->cmsg_len;
- sinfo->sinfo_flags |= SCTP_ADDR_OVER;
- ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
- if (ret < 0) {
- log_print("Send first packet to node %d failed: %d",
- con->nodeid, ret);
+bind_err:
+ con->sock = NULL;
+ sock_release(sock);
- /* Try again later */
+socket_err:
+ /*
+ * Some errors are fatal and this list might need adjusting. For other
+ * errors we try again until the max number of retries is reached.
+ */
+ if (result != -EHOSTUNREACH &&
+ result != -ENETUNREACH &&
+ result != -ENETDOWN &&
+ result != -EINVAL &&
+ result != -EPROTONOSUPPORT) {
+ log_print("connect %d try %d error %d", con->nodeid,
+ con->retries, result);
+ mutex_unlock(&con->sock_mutex);
+ msleep(1000);
clear_bit(CF_CONNECT_PENDING, &con->flags);
- clear_bit(CF_INIT_PENDING, &con->flags);
- }
- else {
- spin_lock(&con->writequeue_lock);
- writequeue_entry_complete(e, ret);
- spin_unlock(&con->writequeue_lock);
+ lowcomms_connect_sock(con);
+ return;
}
-unlock:
+out:
mutex_unlock(&con->sock_mutex);
+ set_bit(CF_WRITE_PENDING, &con->flags);
}
/* Connect a new socket to its peer */
@@ -1236,11 +1149,13 @@ out_err:
con->retries, result);
mutex_unlock(&con->sock_mutex);
msleep(1000);
+ clear_bit(CF_CONNECT_PENDING, &con->flags);
lowcomms_connect_sock(con);
return;
}
out:
mutex_unlock(&con->sock_mutex);
+ set_bit(CF_WRITE_PENDING, &con->flags);
return;
}
@@ -1325,37 +1240,11 @@ static void init_local(void)
}
}
-/* Bind to an IP address. SCTP allows multiple address so it can do
- multi-homing */
-static int add_sctp_bind_addr(struct connection *sctp_con,
- struct sockaddr_storage *addr,
- int addr_len, int num)
-{
- int result = 0;
-
- if (num == 1)
- result = kernel_bind(sctp_con->sock,
- (struct sockaddr *) addr,
- addr_len);
- else
- result = kernel_setsockopt(sctp_con->sock, SOL_SCTP,
- SCTP_SOCKOPT_BINDX_ADD,
- (char *)addr, addr_len);
-
- if (result < 0)
- log_print("Can't bind to port %d addr number %d",
- dlm_config.ci_tcp_port, num);
-
- return result;
-}
-
/* Initialise SCTP socket and bind to all interfaces */
static int sctp_listen_for_all(void)
{
struct socket *sock = NULL;
- struct sockaddr_storage localaddr;
- struct sctp_event_subscribe subscribe;
- int result = -EINVAL, num = 1, i, addr_len;
+ int result = -EINVAL;
struct connection *con = nodeid2con(0, GFP_NOFS);
int bufsize = NEEDED_RMEM;
int one = 1;
@@ -1366,33 +1255,17 @@ static int sctp_listen_for_all(void)
log_print("Using SCTP for communications");
result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
- SOCK_SEQPACKET, IPPROTO_SCTP, &sock);
+ SOCK_STREAM, IPPROTO_SCTP, &sock);
if (result < 0) {
log_print("Can't create comms socket, check SCTP is loaded");
goto out;
}
- /* Listen for events */
- memset(&subscribe, 0, sizeof(subscribe));
- subscribe.sctp_data_io_event = 1;
- subscribe.sctp_association_event = 1;
- subscribe.sctp_send_failure_event = 1;
- subscribe.sctp_shutdown_event = 1;
- subscribe.sctp_partial_delivery_event = 1;
-
result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE,
(char *)&bufsize, sizeof(bufsize));
if (result)
log_print("Error increasing buffer space on socket %d", result);
- result = kernel_setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
- (char *)&subscribe, sizeof(subscribe));
- if (result < 0) {
- log_print("Failed to set SCTP_EVENTS on socket: result=%d",
- result);
- goto create_delsock;
- }
-
result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
sizeof(one));
if (result < 0)
@@ -1402,19 +1275,12 @@ static int sctp_listen_for_all(void)
sock->sk->sk_user_data = con;
con->sock = sock;
con->sock->sk->sk_data_ready = lowcomms_data_ready;
- con->rx_action = receive_from_sock;
- con->connect_action = sctp_init_assoc;
+ con->rx_action = sctp_accept_from_sock;
+ con->connect_action = sctp_connect_to_sock;
- /* Bind to all interfaces. */
- for (i = 0; i < dlm_local_count; i++) {
- memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
- make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
-
- result = add_sctp_bind_addr(con, &localaddr, addr_len, num);
- if (result)
- goto create_delsock;
- ++num;
- }
+ /* Bind to all addresses. */
+ if (sctp_bind_addrs(con, dlm_config.ci_tcp_port))
+ goto create_delsock;
result = sock->ops->listen(sock, 5);
if (result < 0) {
@@ -1582,7 +1448,7 @@ static void send_to_sock(struct connection *con)
msg_flags);
if (ret == -EAGAIN || ret == 0) {
if (ret == -EAGAIN &&
- test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+ test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
!test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
/* Notify TCP that we're limited by the
* application window size.
@@ -1612,14 +1478,13 @@ out:
send_error:
mutex_unlock(&con->sock_mutex);
- close_connection(con, false);
+ close_connection(con, false, false, true);
lowcomms_connect_sock(con);
return;
out_connect:
mutex_unlock(&con->sock_mutex);
- if (!test_bit(CF_INIT_PENDING, &con->flags))
- lowcomms_connect_sock(con);
+ lowcomms_connect_sock(con);
}
static void clean_one_writequeue(struct connection *con)
@@ -1644,15 +1509,9 @@ int dlm_lowcomms_close(int nodeid)
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
if (con) {
- clear_bit(CF_CONNECT_PENDING, &con->flags);
- clear_bit(CF_WRITE_PENDING, &con->flags);
set_bit(CF_CLOSE, &con->flags);
- if (cancel_work_sync(&con->swork))
- log_print("canceled swork for node %d", nodeid);
- if (cancel_work_sync(&con->rwork))
- log_print("canceled rwork for node %d", nodeid);
+ close_connection(con, true, true, true);
clean_one_writequeue(con);
- close_connection(con, true);
}
spin_lock(&dlm_node_addrs_spin);
@@ -1685,10 +1544,8 @@ static void process_send_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, swork);
- if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+ if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags))
con->connect_action(con);
- set_bit(CF_WRITE_PENDING, &con->flags);
- }
if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
send_to_sock(con);
}
@@ -1735,7 +1592,7 @@ static void stop_conn(struct connection *con)
static void free_conn(struct connection *con)
{
- close_connection(con, true);
+ close_connection(con, true, true, true);
if (con->othercon)
kmem_cache_free(con_cache, con->othercon);
hlist_del(&con->list);
@@ -1806,7 +1663,7 @@ fail_unlisten:
dlm_allow_conn = 0;
con = nodeid2con(0,0);
if (con) {
- close_connection(con, false);
+ close_connection(con, false, true, true);
kmem_cache_free(con_cache, con);
}
fail_destroy:
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index e0ab3a9..d401425 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -145,7 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
if (xop->callback == NULL) {
- rv = wait_event_killable(recv_wq, (op->done != 0));
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
log_debug(ls, "dlm_posix_lock: wait killed %llx",
(unsigned long long)number);
@@ -172,7 +172,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = op->info.rv;
if (!rv) {
- if (posix_lock_file_wait(file, fl) < 0)
+ if (locks_lock_file_wait(file, fl) < 0)
log_error(ls, "dlm_posix_lock: vfs lock error %llx",
(unsigned long long)number);
}
@@ -262,7 +262,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
/* cause the vfs unlock to return ENOENT if lock is not found */
fl->fl_flags |= FL_EXISTS;
- rv = posix_lock_file_wait(file, fl);
+ rv = locks_lock_file_wait(file, fl);
if (rv == -ENOENT) {
rv = 0;
goto out_free;
@@ -509,7 +509,6 @@ int dlm_plock_init(void)
void dlm_plock_exit(void)
{
- if (misc_deregister(&plock_dev_misc) < 0)
- log_print("dlm_plock_exit: misc_deregister failed");
+ misc_deregister(&plock_dev_misc);
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index fb85f32..1925d6d 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -362,18 +362,15 @@ fail:
int dlm_device_deregister(struct dlm_ls *ls)
{
- int error;
-
/* The device is not registered. This happens when the lockspace
was never used from userspace, or when device_create_lockspace()
calls dlm_release_lockspace() after the register fails. */
if (!ls->ls_device.name)
return 0;
- error = misc_deregister(&ls->ls_device);
- if (!error)
- kfree(ls->ls_device.name);
- return error;
+ misc_deregister(&ls->ls_device);
+ kfree(ls->ls_device.name);
+ return 0;
}
static int device_user_purge(struct dlm_user_proc *proc,
@@ -518,14 +515,9 @@ static ssize_t device_write(struct file *file, const char __user *buf,
if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
return -EINVAL;
- kbuf = kzalloc(count + 1, GFP_NOFS);
- if (!kbuf)
- return -ENOMEM;
-
- if (copy_from_user(kbuf, buf, count)) {
- error = -EFAULT;
- goto out_free;
- }
+ kbuf = memdup_user_nul(buf, count);
+ if (!IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
if (check_version(kbuf)) {
error = -EBADE;
@@ -785,6 +777,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
DECLARE_WAITQUEUE(wait, current);
struct dlm_callback cb;
int rv, resid, copy_lvb = 0;
+ int old_mode, new_mode;
if (count == sizeof(struct dlm_device_version)) {
rv = copy_version_to_user(buf, count);
@@ -841,6 +834,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list);
+ /* rem_lkb_callback sets a new lkb_last_cast */
+ old_mode = lkb->lkb_last_cast.mode;
+
rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
if (rv < 0) {
/* this shouldn't happen; lkb should have been removed from
@@ -864,9 +860,6 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
}
if (cb.flags & DLM_CB_CAST) {
- int old_mode, new_mode;
-
- old_mode = lkb->lkb_last_cast.mode;
new_mode = cb.mode;
if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 5718cb9..d72d52b 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
struct inode *inode, *toput_inode = NULL;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
- spin_lock(&inode_sb_list_lock);
+
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(toput_inode);
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 97315f2..80d6901 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat(
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
list_del(&auth_tok->mount_crypt_stat_list);
- if (auth_tok->global_auth_tok_key
- && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
+ if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
key_put(auth_tok->global_auth_tok_key);
kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok);
}
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8db0b46..63cd2c1 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -45,20 +45,20 @@
static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- int rc;
-
- if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
- return 1;
+ int rc = 1;
if (flags & LOOKUP_RCU)
return -ECHILD;
- rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE)
+ rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+
if (d_really_is_positive(dentry)) {
- struct inode *lower_inode =
- ecryptfs_inode_to_lower(d_inode(dentry));
+ struct inode *inode = d_inode(dentry);
- fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+ fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode));
+ if (!inode->i_nlink)
+ return 0;
}
return rc;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5ba029e..7b39260 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -86,7 +86,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key)
{
if (key->type == &key_type_encrypted)
return (struct ecryptfs_auth_tok *)
- (&((struct encrypted_key_payload *)key->payload.data)->payload_data);
+ (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data);
else
return NULL;
}
@@ -117,8 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key)
auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
if (!auth_tok)
- return (struct ecryptfs_auth_tok *)
- (((struct user_key_payload *)key->payload.data)->data);
+ return (struct ecryptfs_auth_tok *)user_key_payload(key)->data;
else
return auth_tok;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3c4db11..040aa87 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -270,7 +270,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
mode);
- if (unlikely(IS_ERR(ecryptfs_inode))) {
+ if (IS_ERR(ecryptfs_inode)) {
ecryptfs_printk(KERN_WARNING, "Failed to create file in"
"lower filesystem\n");
rc = PTR_ERR(ecryptfs_inode);
@@ -282,9 +282,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
if (rc) {
ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
ecryptfs_inode);
- make_bad_inode(ecryptfs_inode);
- unlock_new_inode(ecryptfs_inode);
- iput(ecryptfs_inode);
+ iget_failed(ecryptfs_inode);
goto out;
}
unlock_new_inode(ecryptfs_inode);
@@ -674,16 +672,24 @@ out:
return rc ? ERR_PTR(rc) : buf;
}
-static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *ecryptfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
size_t len;
- char *buf = ecryptfs_readlink_lower(dentry, &len);
+ char *buf;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ buf = ecryptfs_readlink_lower(dentry, &len);
if (IS_ERR(buf))
return buf;
fsstack_copy_attr_atime(d_inode(dentry),
d_inode(ecryptfs_dentry_to_lower(dentry)));
buf[len] = '\0';
- return *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
+ return buf;
}
/**
@@ -1095,8 +1101,7 @@ out:
const struct inode_operations ecryptfs_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = ecryptfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f4d047..e25b6b0 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -663,6 +663,7 @@ static struct ecryptfs_cache_info {
struct kmem_cache **cache;
const char *name;
size_t size;
+ unsigned long flags;
void (*ctor)(void *obj);
} ecryptfs_cache_infos[] = {
{
@@ -684,6 +685,7 @@ static struct ecryptfs_cache_info {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
+ .flags = SLAB_ACCOUNT,
.ctor = inode_info_init_once,
},
{
@@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void)
struct ecryptfs_cache_info *info;
info = &ecryptfs_cache_infos[i];
- *(info->cache) = kmem_cache_create(info->name, info->size,
- 0, SLAB_HWCACHE_ALIGN, info->ctor);
+ *(info->cache) = kmem_cache_create(info->name, info->size, 0,
+ SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
if (!*(info->cache)) {
ecryptfs_free_kmem_caches();
ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index cf20852..caba848 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -299,7 +299,7 @@ static int ecryptfs_write_begin(struct file *file,
rc = ecryptfs_read_lower_page_segment(
page, index, 0, PAGE_CACHE_SIZE, mapping->host);
if (rc) {
- printk(KERN_ERR "%s: Error attemping to read "
+ printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
__func__, rc);
ClearPageUptodate(page);
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 079d203..cdf0872 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &efs_symlink_aops;
break;
case S_IFCHR:
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a3..cb68dac 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
- sizeof(struct efs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct efs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 75117d0..4870cc8 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -13,7 +13,7 @@
static int efs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct buffer_head * bh;
struct inode * inode = page->mapping->host;
efs_block_t size = inode->i_size;
@@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
}
link[size] = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8d0c0df..ed70cf9 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -45,10 +45,10 @@ struct eventfd_ctx {
*
* This function is supposed to be called by the kernel in paths that do not
* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returining a POLLERR
+ * value, and we signal this as overflow condition by returning a POLLERR
* to poll(2).
*
- * Returns the amount by which the counter was incrememnted. This will be less
+ * Returns the amount by which the counter was incremented. This will be less
* than @n if the counter has overflowed.
*/
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a..828ec5f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
module_put(fmt->module);
}
+bool path_noexec(const struct path *path)
+{
+ return (path->mnt->mnt_flags & MNT_NOEXEC) ||
+ (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
+}
+
#ifdef CONFIG_USELIB
/*
* Note that a shared library must be both readable and executable due to
@@ -113,7 +119,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
- .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+ .acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
@@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
goto exit;
error = -EACCES;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ if (path_noexec(&file->f_path))
goto exit;
fsnotify_open(file);
@@ -757,7 +763,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
int err;
struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
- .acc_mode = MAY_EXEC | MAY_OPEN,
+ .acc_mode = MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
@@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (!S_ISREG(file_inode(file)->i_mode))
goto exit;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ if (path_noexec(&file->f_path))
goto exit;
err = deny_write_access(file);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 73c64da..9eaf595 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
@@ -1227,6 +1224,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
inode->i_link = (char *)oi->i_data;
} else {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
}
} else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 09a6bb1..c20d77d 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -80,9 +80,6 @@ static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
inode = exofs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -114,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
if (l > sizeof(oi->i_data)) {
/* slow symlink */
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
memset(oi->i_data, 0, sizeof(oi->i_data));
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c56..6658a50 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
{
exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- exofs_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, exofs_init_once);
if (exofs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8d15feb..4c69c94 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -684,6 +684,9 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
+#ifdef CONFIG_FS_DAX
+ struct rw_semaphore dax_sem;
+#endif
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -699,6 +702,14 @@ struct ext2_inode_info {
#endif
};
+#ifdef CONFIG_FS_DAX
+#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
+#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
+#else
+#define dax_sem_down_write(ext2_inode)
+#define dax_sem_up_write(ext2_inode)
+#endif
+
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f..11a42c5 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,26 +20,110 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
#ifdef CONFIG_FS_DAX
+/*
+ * The lock ordering for ext2 DAX fault paths is:
+ *
+ * mmap_sem (MM)
+ * sb_start_pagefault (vfs, freeze)
+ * ext2_inode_info->dax_sem
+ * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
+ * ext2_inode_info->truncate_mutex
+ *
+ * The default page_lock and i_size verification done by non-DAX fault paths
+ * is sufficient because ext2 doesn't support hole punching.
+ */
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ /* check that the faulting page hasn't raced with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -49,7 +133,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5c04a0d..efe5fb2 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -577,7 +577,10 @@ got:
goto fail;
}
- dquot_initialize(inode);
+ err = dquot_initialize(inode);
+ if (err)
+ goto fail_drop;
+
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 5c09776..338eefd 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
@@ -1084,6 +1085,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
+/* dax_sem must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1099,6 +1101,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
blocksize = inode->i_sb->s_blocksize;
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
+#ifdef CONFIG_FS_DAX
+ WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+#endif
+
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0)
return;
@@ -1184,7 +1190,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
return;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
+
+ dax_sem_down_write(EXT2_I(inode));
__ext2_truncate_blocks(inode, offset);
+ dax_sem_up_write(EXT2_I(inode));
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1212,8 +1221,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
+ dax_sem_down_write(EXT2_I(inode));
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
+ dax_sem_up_write(EXT2_I(inode));
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
@@ -1409,6 +1420,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
@@ -1552,8 +1564,11 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
if (error)
return error;
- if (is_quota_modification(inode, iattr))
- dquot_initialize(inode);
+ if (is_quota_modification(inode, iattr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
(iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
error = dquot_transfer(inode, iattr);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 13ec54a..7a2be8f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -96,8 +96,11 @@ struct dentry *ext2_get_parent(struct dentry *child)
static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl)
{
struct inode *inode;
+ int err;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
inode = ext2_new_inode(dir, mode, &dentry->d_name);
if (IS_ERR(inode))
@@ -140,10 +143,9 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
struct inode * inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
inode = ext2_new_inode (dir, mode, &dentry->d_name);
err = PTR_ERR(inode);
@@ -169,7 +171,9 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ goto out;
inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
err = PTR_ERR(inode);
@@ -179,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sizeof (EXT2_I(inode)->i_data)) {
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
@@ -212,7 +217,9 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
struct inode *inode = d_inode(old_dentry);
int err;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
@@ -233,7 +240,9 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
struct inode * inode;
int err;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
inode_inc_link_count(dir);
@@ -279,13 +288,17 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
struct inode * inode = d_inode(dentry);
struct ext2_dir_entry_2 * de;
struct page * page;
- int err = -ENOENT;
+ int err;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ goto out;
de = ext2_find_entry (dir, &dentry->d_name, &page);
- if (!de)
+ if (!de) {
+ err = -ENOENT;
goto out;
+ }
err = ext2_delete_entry (de, page);
if (err)
@@ -323,14 +336,21 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct ext2_dir_entry_2 * dir_de = NULL;
struct page * old_page;
struct ext2_dir_entry_2 * old_de;
- int err = -ENOENT;
+ int err;
+
+ err = dquot_initialize(old_dir);
+ if (err)
+ goto out;
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
+ err = dquot_initialize(new_dir);
+ if (err)
+ goto out;
old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
- if (!old_de)
+ if (!old_de) {
+ err = -ENOENT;
goto out;
+ }
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19c..2a18841 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,6 +192,9 @@ static void init_once(void *foo)
init_rwsem(&ei->xattr_sem);
#endif
mutex_init(&ei->truncate_mutex);
+#ifdef CONFIG_FS_DAX
+ init_rwsem(&ei->dax_sem);
+#endif
inode_init_once(&ei->vfs_inode);
}
@@ -200,7 +203,7 @@ static int __init init_inodecache(void)
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
@@ -566,6 +569,8 @@ static int parse_options(char *options, struct super_block *sb)
/* Fall through */
case Opt_dax:
#ifdef CONFIG_FS_DAX
+ ext2_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
set_opt(sbi->s_mount_opt, DAX);
#else
ext2_msg(sb, KERN_INFO, "dax option not supported");
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index ae17179..3495d8a 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -22,8 +22,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
@@ -35,7 +34,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0b6bfd3..f57a7ab 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -77,10 +77,8 @@
printk("\n"); \
} while (0)
# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
+ printk(KERN_DEBUG "block %pg:%lu: ", \
+ bh->b_bdev, (unsigned long) bh->b_blocknr); \
printk(f); \
printk("\n"); \
} while (0)
@@ -292,17 +290,21 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
const struct xattr_handler *handler =
ext2_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest) {
error = -ERANGE;
goto cleanup;
}
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 702fc68..ba97f24 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -7,37 +7,20 @@
#include <linux/security.h>
#include "xattr.h"
-static size_t
-ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
-ext2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
buffer, size);
}
static int
-ext2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
@@ -68,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir,
const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext2_xattr_security_list,
.get = ext2_xattr_security_get,
.set = ext2_xattr_security_set,
};
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 42b6e98..2c94d19 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -8,40 +8,26 @@
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext2_xattr_trusted_list(struct dentry *dentry)
{
- const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
-ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
buffer, size);
}
static int
-ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index ecdc460..72a2a96 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -10,30 +10,17 @@
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext2_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
-ext2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
@@ -41,11 +28,10 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
deleted file mode 100644
index e8c6ba0..0000000
--- a/fs/ext3/Kconfig
+++ /dev/null
@@ -1,89 +0,0 @@
-config EXT3_FS
- tristate "Ext3 journalling file system support"
- select JBD
- help
- This is the journalling version of the Second extended file system
- (often called ext3), the de facto standard Linux file system
- (method to organize files on a storage device) for hard disks.
-
- The journalling code included in this driver means you do not have
- to run e2fsck (file system checker) on your file systems after a
- crash. The journal keeps track of any changes that were being made
- at the time the system crashed, and can ensure that your file system
- is consistent without the need for a lengthy check.
-
- Other than adding the journal to the file system, the on-disk format
- of ext3 is identical to ext2. It is possible to freely switch
- between using the ext3 driver and the ext2 driver, as long as the
- file system has been cleanly unmounted, or e2fsck is run on the file
- system.
-
- To add a journal on an existing ext2 file system or change the
- behavior of ext3 file systems, you can use the tune2fs utility ("man
- tune2fs"). To modify attributes of files and directories on ext3
- file systems, use chattr ("man chattr"). You need to be using
- e2fsprogs version 1.20 or later in order to create ext3 journals
- (available at <http://sourceforge.net/projects/e2fsprogs/>).
-
- To compile this file system support as a module, choose M here: the
- module will be called ext3.
-
-config EXT3_DEFAULTS_TO_ORDERED
- bool "Default to 'data=ordered' in ext3"
- depends on EXT3_FS
- default y
- help
- The journal mode options for ext3 have different tradeoffs
- between when data is guaranteed to be on disk and
- performance. The use of "data=writeback" can cause
- unwritten data to appear in files after an system crash or
- power failure, which can be a security issue. However,
- "data=ordered" mode can also result in major performance
- problems, including seconds-long delays before an fsync()
- call returns. For details, see:
-
- http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
-
- If you have been historically happy with ext3's performance,
- data=ordered mode will be a safe choice and you should
- answer 'y' here. If you understand the reliability and data
- privacy issues of data=writeback and are willing to make
- that trade off, answer 'n'.
-
-config EXT3_FS_XATTR
- bool "Ext3 extended attributes"
- depends on EXT3_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext3.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS_XATTR
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the Posix ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext3 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
deleted file mode 100644
index e77766a..0000000
--- a/fs/ext3/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-#
-# Makefile for the linux ext3-filesystem routines.
-#
-
-obj-$(CONFIG_EXT3_FS) += ext3.o
-
-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
-
-ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
-ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
deleted file mode 100644
index 8bbaf5b..0000000
--- a/fs/ext3/acl.c
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * linux/fs/ext3/acl.c
- *
- * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- */
-
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *
-ext3_acl_from_disk(const void *value, size_t size)
-{
- const char *end = (char *)value + size;
- int n, count;
- struct posix_acl *acl;
-
- if (!value)
- return NULL;
- if (size < sizeof(ext3_acl_header))
- return ERR_PTR(-EINVAL);
- if (((ext3_acl_header *)value)->a_version !=
- cpu_to_le32(EXT3_ACL_VERSION))
- return ERR_PTR(-EINVAL);
- value = (char *)value + sizeof(ext3_acl_header);
- count = ext3_acl_count(size);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
- acl = posix_acl_alloc(count, GFP_NOFS);
- if (!acl)
- return ERR_PTR(-ENOMEM);
- for (n=0; n < count; n++) {
- ext3_acl_entry *entry =
- (ext3_acl_entry *)value;
- if ((char *)value + sizeof(ext3_acl_entry_short) > end)
- goto fail;
- acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
- acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- switch(acl->a_entries[n].e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- value = (char *)value +
- sizeof(ext3_acl_entry_short);
- break;
-
- case ACL_USER:
- value = (char *)value + sizeof(ext3_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_uid =
- make_kuid(&init_user_ns,
- le32_to_cpu(entry->e_id));
- break;
- case ACL_GROUP:
- value = (char *)value + sizeof(ext3_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_gid =
- make_kgid(&init_user_ns,
- le32_to_cpu(entry->e_id));
- break;
-
- default:
- goto fail;
- }
- }
- if (value != end)
- goto fail;
- return acl;
-
-fail:
- posix_acl_release(acl);
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static void *
-ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
-{
- ext3_acl_header *ext_acl;
- char *e;
- size_t n;
-
- *size = ext3_acl_size(acl->a_count);
- ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
- sizeof(ext3_acl_entry), GFP_NOFS);
- if (!ext_acl)
- return ERR_PTR(-ENOMEM);
- ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
- e = (char *)ext_acl + sizeof(ext3_acl_header);
- for (n=0; n < acl->a_count; n++) {
- const struct posix_acl_entry *acl_e = &acl->a_entries[n];
- ext3_acl_entry *entry = (ext3_acl_entry *)e;
- entry->e_tag = cpu_to_le16(acl_e->e_tag);
- entry->e_perm = cpu_to_le16(acl_e->e_perm);
- switch(acl_e->e_tag) {
- case ACL_USER:
- entry->e_id = cpu_to_le32(
- from_kuid(&init_user_ns, acl_e->e_uid));
- e += sizeof(ext3_acl_entry);
- break;
- case ACL_GROUP:
- entry->e_id = cpu_to_le32(
- from_kgid(&init_user_ns, acl_e->e_gid));
- e += sizeof(ext3_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- e += sizeof(ext3_acl_entry_short);
- break;
-
- default:
- goto fail;
- }
- }
- return (char *)ext_acl;
-
-fail:
- kfree(ext_acl);
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Inode operation get_posix_acl().
- *
- * inode->i_mutex: don't care
- */
-struct posix_acl *
-ext3_get_acl(struct inode *inode, int type)
-{
- int name_index;
- char *value = NULL;
- struct posix_acl *acl;
- int retval;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
- break;
- default:
- BUG();
- }
-
- retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
- if (retval > 0) {
- value = kmalloc(retval, GFP_NOFS);
- if (!value)
- return ERR_PTR(-ENOMEM);
- retval = ext3_xattr_get(inode, name_index, "", value, retval);
- }
- if (retval > 0)
- acl = ext3_acl_from_disk(value, retval);
- else if (retval == -ENODATA || retval == -ENOSYS)
- acl = NULL;
- else
- acl = ERR_PTR(retval);
- kfree(value);
-
- if (!IS_ERR(acl))
- set_cached_acl(inode, type, acl);
-
- return acl;
-}
-
-/*
- * Set the access or default ACL of an inode.
- *
- * inode->i_mutex: down unless called from ext3_new_inode
- */
-static int
-__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
- struct posix_acl *acl)
-{
- int name_index;
- void *value = NULL;
- size_t size = 0;
- int error;
-
- switch(type) {
- case ACL_TYPE_ACCESS:
- name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
- return error;
- else {
- inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
- if (error == 0)
- acl = NULL;
- }
- }
- break;
-
- case ACL_TYPE_DEFAULT:
- name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
-
- default:
- return -EINVAL;
- }
- if (acl) {
- value = ext3_acl_to_disk(acl, &size);
- if (IS_ERR(value))
- return (int)PTR_ERR(value);
- }
-
- error = ext3_xattr_set_handle(handle, inode, name_index, "",
- value, size, 0);
-
- kfree(value);
-
- if (!error)
- set_cached_acl(inode, type, acl);
-
- return error;
-}
-
-int
-ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
- handle_t *handle;
- int error, retries = 0;
-
-retry:
- handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- error = __ext3_set_acl(handle, inode, type, acl);
- ext3_journal_stop(handle);
- if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return error;
-}
-
-/*
- * Initialize the ACLs of a new inode. Called from ext3_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
-int
-ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int error;
-
- error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (error)
- return error;
-
- if (default_acl) {
- error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
- default_acl);
- posix_acl_release(default_acl);
- }
- if (acl) {
- if (!error)
- error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
- acl);
- posix_acl_release(acl);
- }
- return error;
-}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
deleted file mode 100644
index ea1c69e..0000000
--- a/fs/ext3/acl.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- File: fs/ext3/acl.h
-
- (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-*/
-
-#include <linux/posix_acl_xattr.h>
-
-#define EXT3_ACL_VERSION 0x0001
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
- __le32 e_id;
-} ext3_acl_entry;
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
-} ext3_acl_entry_short;
-
-typedef struct {
- __le32 a_version;
-} ext3_acl_header;
-
-static inline size_t ext3_acl_size(int count)
-{
- if (count <= 4) {
- return sizeof(ext3_acl_header) +
- count * sizeof(ext3_acl_entry_short);
- } else {
- return sizeof(ext3_acl_header) +
- 4 * sizeof(ext3_acl_entry_short) +
- (count - 4) * sizeof(ext3_acl_entry);
- }
-}
-
-static inline int ext3_acl_count(size_t size)
-{
- ssize_t s;
- size -= sizeof(ext3_acl_header);
- s = size - 4 * sizeof(ext3_acl_entry_short);
- if (s < 0) {
- if (size % sizeof(ext3_acl_entry_short))
- return -1;
- return size / sizeof(ext3_acl_entry_short);
- } else {
- if (s % sizeof(ext3_acl_entry))
- return -1;
- return s / sizeof(ext3_acl_entry) + 4;
- }
-}
-
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-
-/* acl.c */
-extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
-extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
-
-#else /* CONFIG_EXT3_FS_POSIX_ACL */
-#include <linux/sched.h>
-#define ext3_get_acl NULL
-#define ext3_set_acl NULL
-
-static inline int
-ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
-{
- return 0;
-}
-#endif /* CONFIG_EXT3_FS_POSIX_ACL */
-
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
deleted file mode 100644
index 158b5d4..0000000
--- a/fs/ext3/balloc.c
+++ /dev/null
@@ -1,2158 +0,0 @@
-/*
- * linux/fs/ext3/balloc.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-#include <linux/quotaops.h>
-#include <linux/blkdev.h>
-#include "ext3.h"
-
-/*
- * balloc.c contains the blocks allocation and deallocation routines
- */
-
-/*
- * The free blocks are managed by bitmaps. A file system contains several
- * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
- * block for inodes, N blocks for the inode table and data blocks.
- *
- * The file system contains group descriptors which are located after the
- * super block. Each descriptor contains the number of the bitmap block and
- * the free blocks count in the block. The descriptors are loaded in memory
- * when a file system is mounted (see ext3_fill_super).
- */
-
-
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
-/*
- * Calculate the block group number and offset, given a block number
- */
-static void ext3_get_group_no_and_offset(struct super_block *sb,
- ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
-{
- struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-
- blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
- if (offsetp)
- *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
- if (blockgrpp)
- *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
-}
-
-/**
- * ext3_get_group_desc() -- load group descriptor from disk
- * @sb: super block
- * @block_group: given block group
- * @bh: pointer to the buffer head to store the block
- * group descriptor
- */
-struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
- unsigned int block_group,
- struct buffer_head ** bh)
-{
- unsigned long group_desc;
- unsigned long offset;
- struct ext3_group_desc * desc;
- struct ext3_sb_info *sbi = EXT3_SB(sb);
-
- if (block_group >= sbi->s_groups_count) {
- ext3_error (sb, "ext3_get_group_desc",
- "block_group >= groups_count - "
- "block_group = %d, groups_count = %lu",
- block_group, sbi->s_groups_count);
-
- return NULL;
- }
- smp_rmb();
-
- group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
- offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
- if (!sbi->s_group_desc[group_desc]) {
- ext3_error (sb, "ext3_get_group_desc",
- "Group descriptor not loaded - "
- "block_group = %d, group_desc = %lu, desc = %lu",
- block_group, group_desc, offset);
- return NULL;
- }
-
- desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
- if (bh)
- *bh = sbi->s_group_desc[group_desc];
- return desc + offset;
-}
-
-static int ext3_valid_block_bitmap(struct super_block *sb,
- struct ext3_group_desc *desc,
- unsigned int block_group,
- struct buffer_head *bh)
-{
- ext3_grpblk_t offset;
- ext3_grpblk_t next_zero_bit;
- ext3_fsblk_t bitmap_blk;
- ext3_fsblk_t group_first_block;
-
- group_first_block = ext3_group_first_block_no(sb, block_group);
-
- /* check whether block bitmap block number is set */
- bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
- offset = bitmap_blk - group_first_block;
- if (!ext3_test_bit(offset, bh->b_data))
- /* bad block bitmap */
- goto err_out;
-
- /* check whether the inode bitmap block number is set */
- bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
- offset = bitmap_blk - group_first_block;
- if (!ext3_test_bit(offset, bh->b_data))
- /* bad block bitmap */
- goto err_out;
-
- /* check whether the inode table block number is set */
- bitmap_blk = le32_to_cpu(desc->bg_inode_table);
- offset = bitmap_blk - group_first_block;
- next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
- offset + EXT3_SB(sb)->s_itb_per_group,
- offset);
- if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
- /* good bitmap for inode tables */
- return 1;
-
-err_out:
- ext3_error(sb, __func__,
- "Invalid block bitmap - "
- "block_group = %d, block = %lu",
- block_group, bitmap_blk);
- return 0;
-}
-
-/**
- * read_block_bitmap()
- * @sb: super block
- * @block_group: given block group
- *
- * Read the bitmap for a given block_group,and validate the
- * bits for block/inode/inode tables are set in the bitmaps
- *
- * Return buffer_head on success or NULL in case of failure.
- */
-static struct buffer_head *
-read_block_bitmap(struct super_block *sb, unsigned int block_group)
-{
- struct ext3_group_desc * desc;
- struct buffer_head * bh = NULL;
- ext3_fsblk_t bitmap_blk;
-
- desc = ext3_get_group_desc(sb, block_group, NULL);
- if (!desc)
- return NULL;
- trace_ext3_read_block_bitmap(sb, block_group);
- bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
- bh = sb_getblk(sb, bitmap_blk);
- if (unlikely(!bh)) {
- ext3_error(sb, __func__,
- "Cannot read block bitmap - "
- "block_group = %d, block_bitmap = %u",
- block_group, le32_to_cpu(desc->bg_block_bitmap));
- return NULL;
- }
- if (likely(bh_uptodate_or_lock(bh)))
- return bh;
-
- if (bh_submit_read(bh) < 0) {
- brelse(bh);
- ext3_error(sb, __func__,
- "Cannot read block bitmap - "
- "block_group = %d, block_bitmap = %u",
- block_group, le32_to_cpu(desc->bg_block_bitmap));
- return NULL;
- }
- ext3_valid_block_bitmap(sb, desc, block_group, bh);
- /*
- * file system mounted not to panic on error, continue with corrupt
- * bitmap
- */
- return bh;
-}
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root: root of per-filesystem reservation rb tree
- * @verbose: verbose mode
- * @fn: function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end). Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
- const char *fn)
-{
- struct rb_node *n;
- struct ext3_reserve_window_node *rsv, *prev;
- int bad;
-
-restart:
- n = rb_first(root);
- bad = 0;
- prev = NULL;
-
- printk("Block Allocation Reservation Windows Map (%s):\n", fn);
- while (n) {
- rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
- if (verbose)
- printk("reservation window 0x%p "
- "start: %lu, end: %lu\n",
- rsv, rsv->rsv_start, rsv->rsv_end);
- if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
- printk("Bad reservation %p (start >= end)\n",
- rsv);
- bad = 1;
- }
- if (prev && prev->rsv_end >= rsv->rsv_start) {
- printk("Bad reservation %p (prev->end >= start)\n",
- rsv);
- bad = 1;
- }
- if (bad) {
- if (!verbose) {
- printk("Restarting reservation walk in verbose mode\n");
- verbose = 1;
- goto restart;
- }
- }
- n = rb_next(n);
- prev = rsv;
- }
- printk("Window map complete.\n");
- BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
- __rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv: inode's reservation window
- * @grp_goal: given goal block relative to the allocation block group
- * @group: the current allocation block group
- * @sb: filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
- unsigned int group, struct super_block * sb)
-{
- ext3_fsblk_t group_first_block, group_last_block;
-
- group_first_block = ext3_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
- if ((rsv->_rsv_start > group_last_block) ||
- (rsv->_rsv_end < group_first_block))
- return 0;
- if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
- || (grp_goal + group_first_block > rsv->_rsv_end)))
- return 0;
- return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root: root of reservation tree
- * @goal: target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
-{
- struct rb_node *n = root->rb_node;
- struct ext3_reserve_window_node *rsv;
-
- if (!n)
- return NULL;
-
- do {
- rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
-
- if (goal < rsv->rsv_start)
- n = n->rb_left;
- else if (goal > rsv->rsv_end)
- n = n->rb_right;
- else
- return rsv;
- } while (n);
- /*
- * We've fallen off the end of the tree: the goal wasn't inside
- * any particular node. OK, the previous node must be to one
- * side of the interval containing the goal. If it's the RHS,
- * we need to back up one.
- */
- if (rsv->rsv_start > goal) {
- n = rb_prev(&rsv->rsv_node);
- rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
- }
- return rsv;
-}
-
-/**
- * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb: super block
- * @rsv: reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext3_rsv_window_add(struct super_block *sb,
- struct ext3_reserve_window_node *rsv)
-{
- struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
- struct rb_node *node = &rsv->rsv_node;
- ext3_fsblk_t start = rsv->rsv_start;
-
- struct rb_node ** p = &root->rb_node;
- struct rb_node * parent = NULL;
- struct ext3_reserve_window_node *this;
-
- trace_ext3_rsv_window_add(sb, rsv);
- while (*p)
- {
- parent = *p;
- this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
-
- if (start < this->rsv_start)
- p = &(*p)->rb_left;
- else if (start > this->rsv_end)
- p = &(*p)->rb_right;
- else {
- rsv_window_dump(root, 1);
- BUG();
- }
- }
-
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
-}
-
-/**
- * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb: super block
- * @rsv: reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
- struct ext3_reserve_window_node *rsv)
-{
- rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_alloc_hit = 0;
- rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv: given reservation window to check
- *
- * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
-{
- /* a valid reservation end block could not be 0 */
- return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext3_init_block_alloc_info()
- * @inode: file inode structure
- *
- * Allocate and initialize the reservation window structure, and
- * link the window to the ext3 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext3 inode the first time the open file
- * needs a new block. So, before every ext3_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext3_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs truncate_mutex protection prior to call this function.
- */
-void ext3_init_block_alloc_info(struct inode *inode)
-{
- struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_block_alloc_info *block_i;
- struct super_block *sb = inode->i_sb;
-
- block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
- if (block_i) {
- struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
-
- rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
-
- /*
- * if filesystem is mounted with NORESERVATION, the goal
- * reservation window size is set to zero to indicate
- * block reservation is off
- */
- if (!test_opt(sb, RESERVATION))
- rsv->rsv_goal_size = 0;
- else
- rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
- rsv->rsv_alloc_hit = 0;
- block_i->last_alloc_logical_block = 0;
- block_i->last_alloc_physical_block = 0;
- }
- ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext3_discard_reservation()
- * @inode: inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- * ext3_release_file(): last writer close the file
- * ext3_clear_inode(): last iput(), when nobody link to this file.
- * ext3_truncate(): when the block indirect map is about to change.
- *
- */
-void ext3_discard_reservation(struct inode *inode)
-{
- struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
- struct ext3_reserve_window_node *rsv;
- spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
-
- if (!block_i)
- return;
-
- rsv = &block_i->rsv_window_node;
- if (!rsv_is_empty(&rsv->rsv_window)) {
- spin_lock(rsv_lock);
- if (!rsv_is_empty(&rsv->rsv_window)) {
- trace_ext3_discard_reservation(inode, rsv);
- rsv_window_remove(inode->i_sb, rsv);
- }
- spin_unlock(rsv_lock);
- }
-}
-
-/**
- * ext3_free_blocks_sb() -- Free given blocks and update quota
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physical block to free
- * @count: number of blocks to free
- * @pdquot_freed_blocks: pointer to quota
- */
-void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext3_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- unsigned long block_group;
- ext3_grpblk_t bit;
- unsigned long i;
- unsigned long overflow;
- struct ext3_group_desc * desc;
- struct ext3_super_block * es;
- struct ext3_sb_info *sbi;
- int err = 0, ret;
- ext3_grpblk_t group_freed;
-
- *pdquot_freed_blocks = 0;
- sbi = EXT3_SB(sb);
- es = sbi->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > le32_to_cpu(es->s_blocks_count)) {
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks not in datazone - "
- "block = "E3FSBLK", count = %lu", block, count);
- goto error_return;
- }
-
- ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
-
-do_more:
- overflow = 0;
- block_group = (block - le32_to_cpu(es->s_first_data_block)) /
- EXT3_BLOCKS_PER_GROUP(sb);
- bit = (block - le32_to_cpu(es->s_first_data_block)) %
- EXT3_BLOCKS_PER_GROUP(sb);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
- count -= overflow;
- }
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext3_get_group_desc (sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
- in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
- in_range (block, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group) ||
- in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
- sbi->s_itb_per_group)) {
- ext3_error (sb, "ext3_free_blocks",
- "Freeing blocks in system zones - "
- "Block = "E3FSBLK", count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to start releasing blocks in the bitmap,
- * so we need undo access.
- */
- /* @@@ check errors */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext3_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
-
- jbd_lock_bh_state(bitmap_bh);
-
- for (i = 0, group_freed = 0; i < count; i++) {
- /*
- * An HJ special. This is expensive...
- */
-#ifdef CONFIG_JBD_DEBUG
- jbd_unlock_bh_state(bitmap_bh);
- {
- struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "Deleted!");
- if (!bh2jh(bitmap_bh)->b_committed_data)
- BUFFER_TRACE(debug_bh,
- "No committed data in bitmap");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
- __brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
-#endif
- if (need_resched()) {
- jbd_unlock_bh_state(bitmap_bh);
- cond_resched();
- jbd_lock_bh_state(bitmap_bh);
- }
- /* @@@ This prevents newly-allocated data from being
- * freed and then reallocated within the same
- * transaction.
- *
- * Ideally we would want to allow that to happen, but to
- * do so requires making journal_forget() capable of
- * revoking the queued write of a data block, which
- * implies blocking on the journal lock. *forget()
- * cannot block due to truncate races.
- *
- * Eventually we can fix this by making journal_forget()
- * return a status indicating whether or not it was able
- * to revoke the buffer. On successful revoke, it is
- * safe not to set the allocation bit in the committed
- * bitmap, because we know that there is no outstanding
- * activity on the buffer any more and so it is safe to
- * reallocate it.
- */
- BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
- J_ASSERT_BH(bitmap_bh,
- bh2jh(bitmap_bh)->b_committed_data != NULL);
- ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
- bh2jh(bitmap_bh)->b_committed_data);
-
- /*
- * We clear the bit in the bitmap after setting the committed
- * data bit, because this is the reverse order to that which
- * the allocator uses.
- */
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit + i, bitmap_bh->b_data)) {
- jbd_unlock_bh_state(bitmap_bh);
- ext3_error(sb, __func__,
- "bit already cleared for block "E3FSBLK,
- block + i);
- jbd_lock_bh_state(bitmap_bh);
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- group_freed++;
- }
- }
- jbd_unlock_bh_state(bitmap_bh);
-
- spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext3_journal_dirty_metadata(handle, gd_bh);
- if (!err) err = ret;
- *pdquot_freed_blocks += group_freed;
-
- if (overflow && !err) {
- block += count;
- count = overflow;
- goto do_more;
- }
-
-error_return:
- brelse(bitmap_bh);
- ext3_std_error(sb, err);
- return;
-}
-
-/**
- * ext3_free_blocks() -- Free given blocks and update quota
- * @handle: handle for this transaction
- * @inode: inode
- * @block: start physical block to free
- * @count: number of blocks to count
- */
-void ext3_free_blocks(handle_t *handle, struct inode *inode,
- ext3_fsblk_t block, unsigned long count)
-{
- struct super_block *sb = inode->i_sb;
- unsigned long dquot_freed_blocks;
-
- trace_ext3_free_blocks(inode, block, count);
- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
- if (dquot_freed_blocks)
- dquot_free_block(inode, dquot_freed_blocks);
- return;
-}
-
-/**
- * ext3_test_allocatable()
- * @nr: given allocation block group
- * @bh: bufferhead contains the bitmap of the given block group
- *
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
-{
- int ret;
- struct journal_head *jh = bh2jh(bh);
-
- if (ext3_test_bit(nr, bh->b_data))
- return 0;
-
- jbd_lock_bh_state(bh);
- if (!jh->b_committed_data)
- ret = 1;
- else
- ret = !ext3_test_bit(nr, jh->b_committed_data);
- jbd_unlock_bh_state(bh);
- return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start: the starting block (group relative) of the search
- * @bh: bufferhead contains the block group bitmap
- * @maxblocks: the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext3_grpblk_t
-bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
- ext3_grpblk_t maxblocks)
-{
- ext3_grpblk_t next;
- struct journal_head *jh = bh2jh(bh);
-
- while (start < maxblocks) {
- next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
- if (next >= maxblocks)
- return -1;
- if (ext3_test_allocatable(next, bh))
- return next;
- jbd_lock_bh_state(bh);
- if (jh->b_committed_data)
- start = ext3_find_next_zero_bit(jh->b_committed_data,
- maxblocks, next);
- jbd_unlock_bh_state(bh);
- }
- return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start: the starting block (group relative) to find next
- * allocatable block in bitmap.
- * @bh: bufferhead contains the block group bitmap
- * @maxblocks: the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap. We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext3_grpblk_t
-find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
- ext3_grpblk_t maxblocks)
-{
- ext3_grpblk_t here, next;
- char *p, *r;
-
- if (start > 0) {
- /*
- * The goal was occupied; search forward for a free
- * block within the next XX blocks.
- *
- * end_goal is more or less random, but it has to be
- * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
- * next 64-bit boundary is simple..
- */
- ext3_grpblk_t end_goal = (start + 63) & ~63;
- if (end_goal > maxblocks)
- end_goal = maxblocks;
- here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
- if (here < end_goal && ext3_test_allocatable(here, bh))
- return here;
- ext3_debug("Bit not found near goal\n");
- }
-
- here = start;
- if (here < 0)
- here = 0;
-
- p = bh->b_data + (here >> 3);
- r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
- next = (r - bh->b_data) << 3;
-
- if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
- return next;
-
- /*
- * The bitmap search --- search forward alternately through the actual
- * bitmap and the last-committed copy until we find a bit free in
- * both
- */
- here = bitmap_search_next_usable_block(here, bh, maxblocks);
- return here;
-}
-
-/**
- * claim_block()
- * @lock: the spin lock for this block group
- * @block: the free block (group relative) to allocate
- * @bh: the buffer_head contains the block group bitmap
- *
- * We think we can allocate this block in this bitmap. Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data. If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
-{
- struct journal_head *jh = bh2jh(bh);
- int ret;
-
- if (ext3_set_bit_atomic(lock, block, bh->b_data))
- return 0;
- jbd_lock_bh_state(bh);
- if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
- ext3_clear_bit_atomic(lock, block, bh->b_data);
- ret = 0;
- } else {
- ret = 1;
- }
- jbd_unlock_bh_state(bh);
- return ret;
-}
-
-/**
- * ext3_try_to_allocate()
- * @sb: superblock
- * @handle: handle to this transaction
- * @group: given allocation block group
- * @bitmap_bh: bufferhead holds the block bitmap
- * @grp_goal: given target block within the group
- * @count: target number of blocks to allocate
- * @my_rsv: reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- * if there is a reservation window, only try to allocate block(s) from the
- * file's own reservation window;
- * Otherwise, the allocation range starts from the give goal block, ends at
- * the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap. In that case we must release write access to the old one via
- * ext3_journal_release_buffer(), else we'll run out of credits.
- */
-static ext3_grpblk_t
-ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
- struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
- unsigned long *count, struct ext3_reserve_window *my_rsv)
-{
- ext3_fsblk_t group_first_block;
- ext3_grpblk_t start, end;
- unsigned long num = 0;
-
- /* we do allocation within the reservation window if we have a window */
- if (my_rsv) {
- group_first_block = ext3_group_first_block_no(sb, group);
- if (my_rsv->_rsv_start >= group_first_block)
- start = my_rsv->_rsv_start - group_first_block;
- else
- /* reservation window cross group boundary */
- start = 0;
- end = my_rsv->_rsv_end - group_first_block + 1;
- if (end > EXT3_BLOCKS_PER_GROUP(sb))
- /* reservation window crosses group boundary */
- end = EXT3_BLOCKS_PER_GROUP(sb);
- if ((start <= grp_goal) && (grp_goal < end))
- start = grp_goal;
- else
- grp_goal = -1;
- } else {
- if (grp_goal > 0)
- start = grp_goal;
- else
- start = 0;
- end = EXT3_BLOCKS_PER_GROUP(sb);
- }
-
- BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
-
-repeat:
- if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
- grp_goal = find_next_usable_block(start, bitmap_bh, end);
- if (grp_goal < 0)
- goto fail_access;
- if (!my_rsv) {
- int i;
-
- for (i = 0; i < 7 && grp_goal > start &&
- ext3_test_allocatable(grp_goal - 1,
- bitmap_bh);
- i++, grp_goal--)
- ;
- }
- }
- start = grp_goal;
-
- if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
- grp_goal, bitmap_bh)) {
- /*
- * The block was allocated by another thread, or it was
- * allocated and then freed by another thread
- */
- start++;
- grp_goal++;
- if (start >= end)
- goto fail_access;
- goto repeat;
- }
- num++;
- grp_goal++;
- while (num < *count && grp_goal < end
- && ext3_test_allocatable(grp_goal, bitmap_bh)
- && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
- grp_goal, bitmap_bh)) {
- num++;
- grp_goal++;
- }
- *count = num;
- return grp_goal - num;
-fail_access:
- *count = num;
- return -1;
-}
-
-/**
- * find_next_reservable_window():
- * find a reservable space within the given range.
- * It does not allocate the reservation window for now:
- * alloc_new_reservation() will do the work later.
- *
- * @search_head: the head of the searching list;
- * This is not necessarily the list head of the whole filesystem
- *
- * We have both head and start_block to assist the search
- * for the reservable space. The list starts from head,
- * but we will shift to the place where start_block is,
- * then start from there, when looking for a reservable space.
- *
- * @my_rsv: the reservation window
- *
- * @sb: the super block
- *
- * @start_block: the first block we consider to start
- * the real search from
- *
- * @last_block:
- * the maximum block number that our goal reservable space
- * could start from. This is normally the last block in this
- * group. The search will end when we found the start of next
- * possible reservable space is out of this boundary.
- * This could handle the cross boundary reservation window
- * request.
- *
- * basically we search from the given range, rather than the whole
- * reservation double linked list, (start_block, last_block)
- * to find a free region that is of my size and has not
- * been reserved.
- *
- */
-static int find_next_reservable_window(
- struct ext3_reserve_window_node *search_head,
- struct ext3_reserve_window_node *my_rsv,
- struct super_block * sb,
- ext3_fsblk_t start_block,
- ext3_fsblk_t last_block)
-{
- struct rb_node *next;
- struct ext3_reserve_window_node *rsv, *prev;
- ext3_fsblk_t cur;
- int size = my_rsv->rsv_goal_size;
-
- /* TODO: make the start of the reservation window byte-aligned */
- /* cur = *start_block & ~7;*/
- cur = start_block;
- rsv = search_head;
- if (!rsv)
- return -1;
-
- while (1) {
- if (cur <= rsv->rsv_end)
- cur = rsv->rsv_end + 1;
-
- /* TODO?
- * in the case we could not find a reservable space
- * that is what is expected, during the re-search, we could
- * remember what's the largest reservable space we could have
- * and return that one.
- *
- * For now it will fail if we could not find the reservable
- * space with expected-size (or more)...
- */
- if (cur > last_block)
- return -1; /* fail */
-
- prev = rsv;
- next = rb_next(&rsv->rsv_node);
- rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
-
- /*
- * Reached the last reservation, we can just append to the
- * previous one.
- */
- if (!next)
- break;
-
- if (cur + size <= rsv->rsv_start) {
- /*
- * Found a reserveable space big enough. We could
- * have a reservation across the group boundary here
- */
- break;
- }
- }
- /*
- * we come here either :
- * when we reach the end of the whole list,
- * and there is empty reservable space after last entry in the list.
- * append it to the end of the list.
- *
- * or we found one reservable space in the middle of the list,
- * return the reservation window that we could append to.
- * succeed.
- */
-
- if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
- rsv_window_remove(sb, my_rsv);
-
- /*
- * Let's book the whole available window for now. We will check the
- * disk bitmap later and then, if there are free blocks then we adjust
- * the window size if it's larger than requested.
- * Otherwise, we will remove this node from the tree next time
- * call find_next_reservable_window.
- */
- my_rsv->rsv_start = cur;
- my_rsv->rsv_end = cur + size - 1;
- my_rsv->rsv_alloc_hit = 0;
-
- if (prev != my_rsv)
- ext3_rsv_window_add(sb, my_rsv);
-
- return 0;
-}
-
-/**
- * alloc_new_reservation()--allocate a new reservation window
- *
- * To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group). We try to
- * allocate a new reservation window near the allocation goal,
- * or the beginning of the group, if there is no goal.
- *
- * We first find a reservable space after the goal, then from
- * there, we check the bitmap for the first free block after
- * it. If there is no free block until the end of group, then the
- * whole group is full, we failed. Otherwise, check if the free
- * block is inside the expected reservable space, if so, we
- * succeed.
- * If the first free block is outside the reservable space, then
- * start from the first free block, we search for next available
- * space, and go on.
- *
- * on succeed, a new reservation will be found and inserted into the list
- * It contains at least one free block, and it does not overlap with other
- * reservation windows.
- *
- * failed: we failed to find a reservation window in this group
- *
- * @my_rsv: the reservation window
- *
- * @grp_goal: The goal (group-relative). It is where the search for a
- * free reservable space should start from.
- * if we have a grp_goal(grp_goal >0 ), then start from there,
- * no grp_goal(grp_goal = -1), we start from the first block
- * of the group.
- *
- * @sb: the super block
- * @group: the group we are trying to allocate in
- * @bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
- ext3_grpblk_t grp_goal, struct super_block *sb,
- unsigned int group, struct buffer_head *bitmap_bh)
-{
- struct ext3_reserve_window_node *search_head;
- ext3_fsblk_t group_first_block, group_end_block, start_block;
- ext3_grpblk_t first_free_block;
- struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
- unsigned long size;
- int ret;
- spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
-
- group_first_block = ext3_group_first_block_no(sb, group);
- group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
- if (grp_goal < 0)
- start_block = group_first_block;
- else
- start_block = grp_goal + group_first_block;
-
- trace_ext3_alloc_new_reservation(sb, start_block);
- size = my_rsv->rsv_goal_size;
-
- if (!rsv_is_empty(&my_rsv->rsv_window)) {
- /*
- * if the old reservation is cross group boundary
- * and if the goal is inside the old reservation window,
- * we will come here when we just failed to allocate from
- * the first part of the window. We still have another part
- * that belongs to the next group. In this case, there is no
- * point to discard our window and try to allocate a new one
- * in this group(which will fail). we should
- * keep the reservation window, just simply move on.
- *
- * Maybe we could shift the start block of the reservation
- * window to the first block of next group.
- */
-
- if ((my_rsv->rsv_start <= group_end_block) &&
- (my_rsv->rsv_end > group_end_block) &&
- (start_block >= my_rsv->rsv_start))
- return -1;
-
- if ((my_rsv->rsv_alloc_hit >
- (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
- /*
- * if the previously allocation hit ratio is
- * greater than 1/2, then we double the size of
- * the reservation window the next time,
- * otherwise we keep the same size window
- */
- size = size * 2;
- if (size > EXT3_MAX_RESERVE_BLOCKS)
- size = EXT3_MAX_RESERVE_BLOCKS;
- my_rsv->rsv_goal_size= size;
- }
- }
-
- spin_lock(rsv_lock);
- /*
- * shift the search start to the window near the goal block
- */
- search_head = search_reserve_window(fs_rsv_root, start_block);
-
- /*
- * find_next_reservable_window() simply finds a reservable window
- * inside the given range(start_block, group_end_block).
- *
- * To make sure the reservation window has a free bit inside it, we
- * need to check the bitmap after we found a reservable window.
- */
-retry:
- ret = find_next_reservable_window(search_head, my_rsv, sb,
- start_block, group_end_block);
-
- if (ret == -1) {
- if (!rsv_is_empty(&my_rsv->rsv_window))
- rsv_window_remove(sb, my_rsv);
- spin_unlock(rsv_lock);
- return -1;
- }
-
- /*
- * On success, find_next_reservable_window() returns the
- * reservation window where there is a reservable space after it.
- * Before we reserve this reservable space, we need
- * to make sure there is at least a free block inside this region.
- *
- * searching the first free bit on the block bitmap and copy of
- * last committed bitmap alternatively, until we found a allocatable
- * block. Search start from the start block of the reservable space
- * we just found.
- */
- spin_unlock(rsv_lock);
- first_free_block = bitmap_search_next_usable_block(
- my_rsv->rsv_start - group_first_block,
- bitmap_bh, group_end_block - group_first_block + 1);
-
- if (first_free_block < 0) {
- /*
- * no free block left on the bitmap, no point
- * to reserve the space. return failed.
- */
- spin_lock(rsv_lock);
- if (!rsv_is_empty(&my_rsv->rsv_window))
- rsv_window_remove(sb, my_rsv);
- spin_unlock(rsv_lock);
- return -1; /* failed */
- }
-
- start_block = first_free_block + group_first_block;
- /*
- * check if the first free block is within the
- * free space we just reserved
- */
- if (start_block >= my_rsv->rsv_start &&
- start_block <= my_rsv->rsv_end) {
- trace_ext3_reserved(sb, start_block, my_rsv);
- return 0; /* success */
- }
- /*
- * if the first free bit we found is out of the reservable space
- * continue search for next reservable space,
- * start from where the free block is,
- * we also shift the list head to where we stopped last time
- */
- search_head = my_rsv;
- spin_lock(rsv_lock);
- goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv: given reservation window
- * @sb: super block
- * @size: the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext3_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext3_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
- struct super_block *sb, int size)
-{
- struct ext3_reserve_window_node *next_rsv;
- struct rb_node *next;
- spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
-
- if (!spin_trylock(rsv_lock))
- return;
-
- next = rb_next(&my_rsv->rsv_node);
-
- if (!next)
- my_rsv->rsv_end += size;
- else {
- next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
-
- if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
- my_rsv->rsv_end += size;
- else
- my_rsv->rsv_end = next_rsv->rsv_start - 1;
- }
- spin_unlock(rsv_lock);
-}
-
-/**
- * ext3_try_to_allocate_with_rsv()
- * @sb: superblock
- * @handle: handle to this transaction
- * @group: given allocation block group
- * @bitmap_bh: bufferhead holds the block bitmap
- * @grp_goal: given target block within the group
- * @my_rsv: reservation window
- * @count: target number of blocks to allocate
- * @errp: pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation. If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext3_grpblk_t
-ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
- unsigned int group, struct buffer_head *bitmap_bh,
- ext3_grpblk_t grp_goal,
- struct ext3_reserve_window_node * my_rsv,
- unsigned long *count, int *errp)
-{
- ext3_fsblk_t group_first_block, group_last_block;
- ext3_grpblk_t ret = 0;
- int fatal;
- unsigned long num = *count;
-
- *errp = 0;
-
- /*
- * Make sure we use undo access for the bitmap, because it is critical
- * that we do the frozen_data COW on bitmap buffers in all cases even
- * if the buffer is in BJ_Forget state in the committing transaction.
- */
- BUFFER_TRACE(bitmap_bh, "get undo access for new block");
- fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
- if (fatal) {
- *errp = fatal;
- return -1;
- }
-
- /*
- * we don't deal with reservation when
- * filesystem is mounted without reservation
- * or the file is not a regular file
- * or last attempt to allocate a block with reservation turned on failed
- */
- if (my_rsv == NULL ) {
- ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
- grp_goal, count, NULL);
- goto out;
- }
- /*
- * grp_goal is a group relative block number (if there is a goal)
- * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
- * first block is a filesystem wide block number
- * first block is the block number of the first block in this group
- */
- group_first_block = ext3_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
- /*
- * Basically we will allocate a new block from inode's reservation
- * window.
- *
- * We need to allocate a new reservation window, if:
- * a) inode does not have a reservation window; or
- * b) last attempt to allocate a block from existing reservation
- * failed; or
- * c) we come here with a goal and with a reservation window
- *
- * We do not need to allocate a new reservation window if we come here
- * at the beginning with a goal and the goal is inside the window, or
- * we don't have a goal but already have a reservation window.
- * then we could go to allocate from the reservation window directly.
- */
- while (1) {
- if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
- !goal_in_my_reservation(&my_rsv->rsv_window,
- grp_goal, group, sb)) {
- if (my_rsv->rsv_goal_size < *count)
- my_rsv->rsv_goal_size = *count;
- ret = alloc_new_reservation(my_rsv, grp_goal, sb,
- group, bitmap_bh);
- if (ret < 0)
- break; /* failed */
-
- if (!goal_in_my_reservation(&my_rsv->rsv_window,
- grp_goal, group, sb))
- grp_goal = -1;
- } else if (grp_goal >= 0) {
- int curr = my_rsv->rsv_end -
- (grp_goal + group_first_block) + 1;
-
- if (curr < *count)
- try_to_extend_reservation(my_rsv, sb,
- *count - curr);
- }
-
- if ((my_rsv->rsv_start > group_last_block) ||
- (my_rsv->rsv_end < group_first_block)) {
- rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
- BUG();
- }
- ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
- grp_goal, &num, &my_rsv->rsv_window);
- if (ret >= 0) {
- my_rsv->rsv_alloc_hit += num;
- *count = num;
- break; /* succeed */
- }
- num = *count;
- }
-out:
- if (ret >= 0) {
- BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
- "bitmap block");
- fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
- if (fatal) {
- *errp = fatal;
- return -1;
- }
- return ret;
- }
-
- BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
- ext3_journal_release_buffer(handle, bitmap_bh);
- return ret;
-}
-
-/**
- * ext3_has_free_blocks()
- * @sbi: in-core super block structure.
- *
- * Check if filesystem has at least 1 free block available for allocation.
- */
-static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
-{
- ext3_fsblk_t free_blocks, root_blocks;
-
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
- (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
- !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
-
-/**
- * ext3_should_retry_alloc()
- * @sb: super block
- * @retries number of attemps has been made
- *
- * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
- * it is profitable to retry the operation, this function will wait
- * for the current or committing transaction to complete, and then
- * return TRUE.
- *
- * if the total number of retries exceed three times, return FALSE.
- */
-int ext3_should_retry_alloc(struct super_block *sb, int *retries)
-{
- if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
- return 0;
-
- jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
-
- return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
-}
-
-/**
- * ext3_new_blocks() -- core block(s) allocation function
- * @handle: handle to this transaction
- * @inode: file inode
- * @goal: given target block(filesystem wide)
- * @count: target number of blocks to allocate
- * @errp: error code
- *
- * ext3_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
- *
- */
-ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, unsigned long *count, int *errp)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gdp_bh;
- int group_no;
- int goal_group;
- ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */
- ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
- ext3_fsblk_t ret_block; /* filesyetem-wide allocated block */
- int bgi; /* blockgroup iteration index */
- int fatal = 0, err;
- int performed_allocation = 0;
- ext3_grpblk_t free_blocks; /* number of free blocks in a group */
- struct super_block *sb;
- struct ext3_group_desc *gdp;
- struct ext3_super_block *es;
- struct ext3_sb_info *sbi;
- struct ext3_reserve_window_node *my_rsv = NULL;
- struct ext3_block_alloc_info *block_i;
- unsigned short windowsz = 0;
-#ifdef EXT3FS_DEBUG
- static int goal_hits, goal_attempts;
-#endif
- unsigned long ngroups;
- unsigned long num = *count;
-
- *errp = -ENOSPC;
- sb = inode->i_sb;
-
- /*
- * Check quota for allocation of this block.
- */
- err = dquot_alloc_block(inode, num);
- if (err) {
- *errp = err;
- return 0;
- }
-
- trace_ext3_request_blocks(inode, goal, num);
-
- sbi = EXT3_SB(sb);
- es = sbi->s_es;
- ext3_debug("goal=%lu.\n", goal);
- /*
- * Allocate a block from reservation only when
- * filesystem is mounted with reservation(default,-o reservation), and
- * it's a regular file, and
- * the desired window size is greater than 0 (One could use ioctl
- * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
- * reservation on that particular file)
- */
- block_i = EXT3_I(inode)->i_block_alloc_info;
- if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
- my_rsv = &block_i->rsv_window_node;
-
- if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
- *errp = -ENOSPC;
- goto out;
- }
-
- /*
- * First, test whether the goal block is free.
- */
- if (goal < le32_to_cpu(es->s_first_data_block) ||
- goal >= le32_to_cpu(es->s_blocks_count))
- goal = le32_to_cpu(es->s_first_data_block);
- group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
- EXT3_BLOCKS_PER_GROUP(sb);
- goal_group = group_no;
-retry_alloc:
- gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp)
- goto io_error;
-
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * if there is not enough free blocks to make a new resevation
- * turn off reservation for this allocation
- */
- if (my_rsv && (free_blocks < windowsz)
- && (free_blocks > 0)
- && (rsv_is_empty(&my_rsv->rsv_window)))
- my_rsv = NULL;
-
- if (free_blocks > 0) {
- grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
- EXT3_BLOCKS_PER_GROUP(sb));
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
- group_no, bitmap_bh, grp_target_blk,
- my_rsv, &num, &fatal);
- if (fatal)
- goto out;
- if (grp_alloc_blk >= 0)
- goto allocated;
- }
-
- ngroups = EXT3_SB(sb)->s_groups_count;
- smp_rmb();
-
- /*
- * Now search the rest of the groups. We assume that
- * group_no and gdp correctly point to the last group visited.
- */
- for (bgi = 0; bgi < ngroups; bgi++) {
- group_no++;
- if (group_no >= ngroups)
- group_no = 0;
- gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp)
- goto io_error;
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * skip this group (and avoid loading bitmap) if there
- * are no free blocks
- */
- if (!free_blocks)
- continue;
- /*
- * skip this group if the number of
- * free blocks is less than half of the reservation
- * window size.
- */
- if (my_rsv && (free_blocks <= (windowsz/2)))
- continue;
-
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- /*
- * try to allocate block(s) from this group, without a goal(-1).
- */
- grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
- group_no, bitmap_bh, -1, my_rsv,
- &num, &fatal);
- if (fatal)
- goto out;
- if (grp_alloc_blk >= 0)
- goto allocated;
- }
- /*
- * We may end up a bogus earlier ENOSPC error due to
- * filesystem is "full" of reservations, but
- * there maybe indeed free blocks available on disk
- * In this case, we just forget about the reservations
- * just do block allocation as without reservations.
- */
- if (my_rsv) {
- my_rsv = NULL;
- windowsz = 0;
- group_no = goal_group;
- goto retry_alloc;
- }
- /* No space left on the device */
- *errp = -ENOSPC;
- goto out;
-
-allocated:
-
- ext3_debug("using block group %d(%d)\n",
- group_no, gdp->bg_free_blocks_count);
-
- BUFFER_TRACE(gdp_bh, "get_write_access");
- fatal = ext3_journal_get_write_access(handle, gdp_bh);
- if (fatal)
- goto out;
-
- ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
-
- if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
- in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
- in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group) ||
- in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
- EXT3_SB(sb)->s_itb_per_group)) {
- ext3_error(sb, "ext3_new_block",
- "Allocating block in system zone - "
- "blocks from "E3FSBLK", length %lu",
- ret_block, num);
- /*
- * claim_block() marked the blocks we allocated as in use. So we
- * may want to selectively mark some of the blocks as free.
- */
- goto retry_alloc;
- }
-
- performed_allocation = 1;
-
-#ifdef CONFIG_JBD_DEBUG
- {
- struct buffer_head *debug_bh;
-
- /* Record bitmap buffer state in the newly allocated block */
- debug_bh = sb_find_get_block(sb, ret_block);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "state when allocated");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
- brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
- spin_lock(sb_bgl_lock(sbi, group_no));
- if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
- int i;
-
- for (i = 0; i < num; i++) {
- if (ext3_test_bit(grp_alloc_blk+i,
- bh2jh(bitmap_bh)->b_committed_data)) {
- printk("%s: block was unexpectedly set in "
- "b_committed_data\n", __func__);
- }
- }
- }
- ext3_debug("found bit %d\n", grp_alloc_blk);
- spin_unlock(sb_bgl_lock(sbi, group_no));
- jbd_unlock_bh_state(bitmap_bh);
-#endif
-
- if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
- ext3_error(sb, "ext3_new_block",
- "block("E3FSBLK") >= blocks count(%d) - "
- "block_group = %d, es == %p ", ret_block,
- le32_to_cpu(es->s_blocks_count), group_no, es);
- goto out;
- }
-
- /*
- * It is up to the caller to add the new buffer to a journal
- * list of some description. We don't know in advance whether
- * the caller wants to use it as metadata or data.
- */
- ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
- ret_block, goal_hits, goal_attempts);
-
- spin_lock(sb_bgl_lock(sbi, group_no));
- le16_add_cpu(&gdp->bg_free_blocks_count, -num);
- spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
- BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
- if (fatal)
- goto out;
-
- *errp = 0;
- brelse(bitmap_bh);
-
- if (num < *count) {
- dquot_free_block(inode, *count-num);
- *count = num;
- }
-
- trace_ext3_allocate_blocks(inode, goal, num,
- (unsigned long long)ret_block);
-
- return ret_block;
-
-io_error:
- *errp = -EIO;
-out:
- if (fatal) {
- *errp = fatal;
- ext3_std_error(sb, fatal);
- }
- /*
- * Undo the block allocation
- */
- if (!performed_allocation)
- dquot_free_block(inode, *count);
- brelse(bitmap_bh);
- return 0;
-}
-
-ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, int *errp)
-{
- unsigned long count = 1;
-
- return ext3_new_blocks(handle, inode, goal, &count, errp);
-}
-
-/**
- * ext3_count_free_blocks() -- count filesystem free blocks
- * @sb: superblock
- *
- * Adds up the number of free blocks from each block group.
- */
-ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
-{
- ext3_fsblk_t desc_count;
- struct ext3_group_desc *gdp;
- int i;
- unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
-#ifdef EXT3FS_DEBUG
- struct ext3_super_block *es;
- ext3_fsblk_t bitmap_count;
- unsigned long x;
- struct buffer_head *bitmap_bh = NULL;
-
- es = EXT3_SB(sb)->s_es;
- desc_count = 0;
- bitmap_count = 0;
- gdp = NULL;
-
- smp_rmb();
- for (i = 0; i < ngroups; i++) {
- gdp = ext3_get_group_desc(sb, i, NULL);
- if (!gdp)
- continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
- if (bitmap_bh == NULL)
- continue;
-
- x = ext3_count_free(bitmap_bh, sb->s_blocksize);
- printk("group %d: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_blocks_count), x);
- bitmap_count += x;
- }
- brelse(bitmap_bh);
- printk("ext3_count_free_blocks: stored = "E3FSBLK
- ", computed = "E3FSBLK", "E3FSBLK"\n",
- (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
- desc_count, bitmap_count);
- return bitmap_count;
-#else
- desc_count = 0;
- smp_rmb();
- for (i = 0; i < ngroups; i++) {
- gdp = ext3_get_group_desc(sb, i, NULL);
- if (!gdp)
- continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
- }
-
- return desc_count;
-#endif
-}
-
-static inline int test_root(int a, int b)
-{
- int num = b;
-
- while (a > num)
- num *= b;
- return num == a;
-}
-
-static int ext3_group_sparse(int group)
-{
- if (group <= 1)
- return 1;
- if (!(group & 1))
- return 0;
- return (test_root(group, 7) || test_root(group, 5) ||
- test_root(group, 3));
-}
-
-/**
- * ext3_bg_has_super - number of blocks used by the superblock in group
- * @sb: superblock for filesystem
- * @group: group number to check
- *
- * Return the number of blocks used by the superblock (primary or backup)
- * in this group. Currently this will be only 0 or 1.
- */
-int ext3_bg_has_super(struct super_block *sb, int group)
-{
- if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
- EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
- !ext3_group_sparse(group))
- return 0;
- return 1;
-}
-
-static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
-{
- unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
- unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
- unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
-
- if (group == first || group == first + 1 || group == last)
- return 1;
- return 0;
-}
-
-static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
-{
- return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
-}
-
-/**
- * ext3_bg_num_gdb - number of blocks used by the group table in group
- * @sb: superblock for filesystem
- * @group: group number to check
- *
- * Return the number of blocks used by the group descriptor table
- * (primary or backup) in this group. In the future there may be a
- * different number of descriptor blocks in each group.
- */
-unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
-{
- unsigned long first_meta_bg =
- le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
- unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
-
- if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
- metagroup < first_meta_bg)
- return ext3_bg_num_gdb_nometa(sb,group);
-
- return ext3_bg_num_gdb_meta(sb,group);
-
-}
-
-/**
- * ext3_trim_all_free -- function to trim all free space in alloc. group
- * @sb: super block for file system
- * @group: allocation group to trim
- * @start: first group block to examine
- * @max: last group block to examine
- * @gdp: allocation group description structure
- * @minblocks: minimum extent block count
- *
- * ext3_trim_all_free walks through group's block bitmap searching for free
- * blocks. When the free block is found, it tries to allocate this block and
- * consequent free block to get the biggest free extent possible, until it
- * reaches any used block. Then issue a TRIM command on this extent and free
- * the extent in the block bitmap. This is done until whole group is scanned.
- */
-static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
- unsigned int group,
- ext3_grpblk_t start, ext3_grpblk_t max,
- ext3_grpblk_t minblocks)
-{
- handle_t *handle;
- ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
- ext3_fsblk_t discard_block;
- struct ext3_sb_info *sbi;
- struct buffer_head *gdp_bh, *bitmap_bh = NULL;
- struct ext3_group_desc *gdp;
- int err = 0, ret = 0;
-
- /*
- * We will update one block bitmap, and one group descriptor
- */
- handle = ext3_journal_start_sb(sb, 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- bitmap_bh = read_block_bitmap(sb, group);
- if (!bitmap_bh) {
- err = -EIO;
- goto err_out;
- }
-
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext3_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto err_out;
-
- gdp = ext3_get_group_desc(sb, group, &gdp_bh);
- if (!gdp) {
- err = -EIO;
- goto err_out;
- }
-
- BUFFER_TRACE(gdp_bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, gdp_bh);
- if (err)
- goto err_out;
-
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- sbi = EXT3_SB(sb);
-
- /* Walk through the whole group */
- while (start <= max) {
- start = bitmap_search_next_usable_block(start, bitmap_bh, max);
- if (start < 0)
- break;
- next = start;
-
- /*
- * Allocate contiguous free extents by setting bits in the
- * block bitmap
- */
- while (next <= max
- && claim_block(sb_bgl_lock(sbi, group),
- next, bitmap_bh)) {
- next++;
- }
-
- /* We did not claim any blocks */
- if (next == start)
- continue;
-
- discard_block = (ext3_fsblk_t)start +
- ext3_group_first_block_no(sb, group);
-
- /* Update counters */
- spin_lock(sb_bgl_lock(sbi, group));
- le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
- spin_unlock(sb_bgl_lock(sbi, group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
-
- free_blocks -= next - start;
- /* Do not issue a TRIM on extents smaller than minblocks */
- if ((next - start) < minblocks)
- goto free_extent;
-
- trace_ext3_discard_blocks(sb, discard_block, next - start);
- /* Send the TRIM command down to the device */
- err = sb_issue_discard(sb, discard_block, next - start,
- GFP_NOFS, 0);
- count += (next - start);
-free_extent:
- freed = 0;
-
- /*
- * Clear bits in the bitmap
- */
- for (bit = start; bit < next; bit++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
- bit, bitmap_bh->b_data)) {
- ext3_error(sb, __func__,
- "bit already cleared for block "E3FSBLK,
- (unsigned long)bit);
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- freed++;
- }
- }
-
- /* Update couters */
- spin_lock(sb_bgl_lock(sbi, group));
- le16_add_cpu(&gdp->bg_free_blocks_count, freed);
- spin_unlock(sb_bgl_lock(sbi, group));
- percpu_counter_add(&sbi->s_freeblocks_counter, freed);
-
- start = next;
- if (err < 0) {
- if (err != -EOPNOTSUPP)
- ext3_warning(sb, __func__, "Discard command "
- "returned error %d\n", err);
- break;
- }
-
- if (fatal_signal_pending(current)) {
- err = -ERESTARTSYS;
- break;
- }
-
- cond_resched();
-
- /* No more suitable extents */
- if (free_blocks < minblocks)
- break;
- }
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
- if (!err)
- err = ret;
-
- /* And the group descriptor block */
- BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
- ret = ext3_journal_dirty_metadata(handle, gdp_bh);
- if (!err)
- err = ret;
-
- ext3_debug("trimmed %d blocks in the group %d\n",
- count, group);
-
-err_out:
- if (err)
- count = err;
- ext3_journal_stop(handle);
- brelse(bitmap_bh);
-
- return count;
-}
-
-/**
- * ext3_trim_fs() -- trim ioctl handle function
- * @sb: superblock for filesystem
- * @start: First Byte to trim
- * @len: number of Bytes to trim from start
- * @minlen: minimum extent length in Bytes
- *
- * ext3_trim_fs goes through all allocation groups containing Bytes from
- * start to start+len. For each such a group ext3_trim_all_free function
- * is invoked to trim all free space.
- */
-int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
-{
- ext3_grpblk_t last_block, first_block;
- unsigned long group, first_group, last_group;
- struct ext3_group_desc *gdp;
- struct ext3_super_block *es = EXT3_SB(sb)->s_es;
- uint64_t start, minlen, end, trimmed = 0;
- ext3_fsblk_t first_data_blk =
- le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
- ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
- int ret = 0;
-
- start = range->start >> sb->s_blocksize_bits;
- end = start + (range->len >> sb->s_blocksize_bits) - 1;
- minlen = range->minlen >> sb->s_blocksize_bits;
-
- if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
- start >= max_blks ||
- range->len < sb->s_blocksize)
- return -EINVAL;
- if (end >= max_blks)
- end = max_blks - 1;
- if (end <= first_data_blk)
- goto out;
- if (start < first_data_blk)
- start = first_data_blk;
-
- smp_rmb();
-
- /* Determine first and last group to examine based on start and len */
- ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
- &first_group, &first_block);
- ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
- &last_group, &last_block);
-
- /* end now represents the last block to discard in this group */
- end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
-
- for (group = first_group; group <= last_group; group++) {
- gdp = ext3_get_group_desc(sb, group, NULL);
- if (!gdp)
- break;
-
- /*
- * For all the groups except the last one, last block will
- * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
- * change it for the last group, note that last_block is
- * already computed earlier by ext3_get_group_no_and_offset()
- */
- if (group == last_group)
- end = last_block;
-
- if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
- ret = ext3_trim_all_free(sb, group, first_block,
- end, minlen);
- if (ret < 0)
- break;
- trimmed += ret;
- }
-
- /*
- * For every group except the first one, we are sure
- * that the first block to discard will be block #0.
- */
- first_block = 0;
- }
-
- if (ret > 0)
- ret = 0;
-
-out:
- range->len = trimmed * sb->s_blocksize;
- return ret;
-}
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
deleted file mode 100644
index ef9c643..0000000
--- a/fs/ext3/bitmap.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * linux/fs/ext3/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#include "ext3.h"
-
-#ifdef EXT3FS_DEBUG
-
-unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
-{
- return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
-}
-
-#endif /* EXT3FS_DEBUG */
-
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
deleted file mode 100644
index 17742ee..0000000
--- a/fs/ext3/dir.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * linux/fs/ext3/dir.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/dir.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext3 directory handling functions
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- *
- * Hash Tree Directory indexing (c) 2001 Daniel Phillips
- *
- */
-
-#include <linux/compat.h>
-#include "ext3.h"
-
-static unsigned char ext3_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
-static int ext3_dx_readdir(struct file *, struct dir_context *);
-
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
- if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT3_FT_MAX))
- return DT_UNKNOWN;
-
- return (ext3_filetype_table[filetype]);
-}
-
-/**
- * Check if the given dir-inode refers to an htree-indexed directory
- * (or a directory which could potentially get converted to use htree
- * indexing).
- *
- * Return 1 if it is a dx dir, 0 if not
- */
-static int is_dx_dir(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
-
- if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT3_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
- ((inode->i_size >> sb->s_blocksize_bits) == 1)))
- return 1;
-
- return 0;
-}
-
-int ext3_check_dir_entry (const char * function, struct inode * dir,
- struct ext3_dir_entry_2 * de,
- struct buffer_head * bh,
- unsigned long offset)
-{
- const char * error_msg = NULL;
- const int rlen = ext3_rec_len_from_disk(de->rec_len);
-
- if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
- error_msg = "rec_len is smaller than minimal";
- else if (unlikely(rlen % 4 != 0))
- error_msg = "rec_len % 4 != 0";
- else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
- error_msg = "rec_len is too small for name_len";
- else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
- error_msg = "directory entry across blocks";
- else if (unlikely(le32_to_cpu(de->inode) >
- le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
- error_msg = "inode out of bounds";
-
- if (unlikely(error_msg != NULL))
- ext3_error (dir->i_sb, function,
- "bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg, offset,
- (unsigned long) le32_to_cpu(de->inode),
- rlen, de->name_len);
-
- return error_msg == NULL ? 1 : 0;
-}
-
-static int ext3_readdir(struct file *file, struct dir_context *ctx)
-{
- unsigned long offset;
- int i;
- struct ext3_dir_entry_2 *de;
- int err;
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- int dir_has_error = 0;
-
- if (is_dx_dir(inode)) {
- err = ext3_dx_readdir(file, ctx);
- if (err != ERR_BAD_DX_DIR)
- return err;
- /*
- * We don't set the inode dirty flag since it's not
- * critical that it get flushed back to the disk.
- */
- EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
- }
- offset = ctx->pos & (sb->s_blocksize - 1);
-
- while (ctx->pos < inode->i_size) {
- unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
- struct buffer_head map_bh;
- struct buffer_head *bh = NULL;
-
- map_bh.b_state = 0;
- err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
- if (err > 0) {
- pgoff_t index = map_bh.b_blocknr >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!ra_has_index(&file->f_ra, index))
- page_cache_sync_readahead(
- sb->s_bdev->bd_inode->i_mapping,
- &file->f_ra, file,
- index, 1);
- file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
- bh = ext3_bread(NULL, inode, blk, 0, &err);
- }
-
- /*
- * We ignore I/O errors on directories so users have a chance
- * of recovering data when there's a bad sector
- */
- if (!bh) {
- if (!dir_has_error) {
- ext3_error(sb, __func__, "directory #%lu "
- "contains a hole at offset %lld",
- inode->i_ino, ctx->pos);
- dir_has_error = 1;
- }
- /* corrupt size? Maybe no more blocks to read */
- if (ctx->pos > inode->i_blocks << 9)
- break;
- ctx->pos += sb->s_blocksize - offset;
- continue;
- }
-
- /* If the dir block has changed since the last call to
- * readdir(2), then we might be pointing to an invalid
- * dirent right now. Scan from the start of the block
- * to make sure. */
- if (offset && file->f_version != inode->i_version) {
- for (i = 0; i < sb->s_blocksize && i < offset; ) {
- de = (struct ext3_dir_entry_2 *)
- (bh->b_data + i);
- /* It's too expensive to do a full
- * dirent test each time round this
- * loop, but we do have to test at
- * least that it is non-zero. A
- * failure will be detected in the
- * dirent test below. */
- if (ext3_rec_len_from_disk(de->rec_len) <
- EXT3_DIR_REC_LEN(1))
- break;
- i += ext3_rec_len_from_disk(de->rec_len);
- }
- offset = i;
- ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
- | offset;
- file->f_version = inode->i_version;
- }
-
- while (ctx->pos < inode->i_size
- && offset < sb->s_blocksize) {
- de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
- if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
- bh, offset)) {
- /* On error, skip the to the
- next block. */
- ctx->pos = (ctx->pos |
- (sb->s_blocksize - 1)) + 1;
- break;
- }
- offset += ext3_rec_len_from_disk(de->rec_len);
- if (le32_to_cpu(de->inode)) {
- if (!dir_emit(ctx, de->name, de->name_len,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type))) {
- brelse(bh);
- return 0;
- }
- }
- ctx->pos += ext3_rec_len_from_disk(de->rec_len);
- }
- offset = 0;
- brelse (bh);
- if (ctx->pos < inode->i_size)
- if (!dir_relax(inode))
- return 0;
- }
- return 0;
-}
-
-static inline int is_32bit_api(void)
-{
-#ifdef CONFIG_COMPAT
- return is_compat_task();
-#else
- return (BITS_PER_LONG == 32);
-#endif
-}
-
-/*
- * These functions convert from the major/minor hash to an f_pos
- * value for dx directories
- *
- * Upper layer (for example NFS) should specify FMODE_32BITHASH or
- * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
- * directly on both 32-bit and 64-bit nodes, under such case, neither
- * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
- */
-static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
-{
- if ((filp->f_mode & FMODE_32BITHASH) ||
- (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
- return major >> 1;
- else
- return ((__u64)(major >> 1) << 32) | (__u64)minor;
-}
-
-static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
-{
- if ((filp->f_mode & FMODE_32BITHASH) ||
- (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
- return (pos << 1) & 0xffffffff;
- else
- return ((pos >> 32) << 1) & 0xffffffff;
-}
-
-static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
-{
- if ((filp->f_mode & FMODE_32BITHASH) ||
- (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
- return 0;
- else
- return pos & 0xffffffff;
-}
-
-/*
- * Return 32- or 64-bit end-of-file for dx directories
- */
-static inline loff_t ext3_get_htree_eof(struct file *filp)
-{
- if ((filp->f_mode & FMODE_32BITHASH) ||
- (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
- return EXT3_HTREE_EOF_32BIT;
- else
- return EXT3_HTREE_EOF_64BIT;
-}
-
-
-/*
- * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
- * non-htree and htree directories, where the "offset" is in terms
- * of the filename hash value instead of the byte offset.
- *
- * Because we may return a 64-bit hash that is well beyond s_maxbytes,
- * we need to pass the max hash as the maximum allowable offset in
- * the htree directory case.
- *
- * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
- * will be invalid once the directory was converted into a dx directory
- */
-static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
-{
- struct inode *inode = file->f_mapping->host;
- int dx_dir = is_dx_dir(inode);
- loff_t htree_max = ext3_get_htree_eof(file);
-
- if (likely(dx_dir))
- return generic_file_llseek_size(file, offset, whence,
- htree_max, htree_max);
- else
- return generic_file_llseek(file, offset, whence);
-}
-
-/*
- * This structure holds the nodes of the red-black tree used to store
- * the directory entry in hash order.
- */
-struct fname {
- __u32 hash;
- __u32 minor_hash;
- struct rb_node rb_hash;
- struct fname *next;
- __u32 inode;
- __u8 name_len;
- __u8 file_type;
- char name[0];
-};
-
-/*
- * This functoin implements a non-recursive way of freeing all of the
- * nodes in the red-black tree.
- */
-static void free_rb_tree_fname(struct rb_root *root)
-{
- struct fname *fname, *next;
-
- rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
- do {
- struct fname *old = fname;
- fname = fname->next;
- kfree(old);
- } while (fname);
-
- *root = RB_ROOT;
-}
-
-static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
- loff_t pos)
-{
- struct dir_private_info *p;
-
- p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
- if (!p)
- return NULL;
- p->curr_hash = pos2maj_hash(filp, pos);
- p->curr_minor_hash = pos2min_hash(filp, pos);
- return p;
-}
-
-void ext3_htree_free_dir_info(struct dir_private_info *p)
-{
- free_rb_tree_fname(&p->root);
- kfree(p);
-}
-
-/*
- * Given a directory entry, enter it into the fname rb tree.
- */
-int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
- __u32 minor_hash,
- struct ext3_dir_entry_2 *dirent)
-{
- struct rb_node **p, *parent = NULL;
- struct fname * fname, *new_fn;
- struct dir_private_info *info;
- int len;
-
- info = (struct dir_private_info *) dir_file->private_data;
- p = &info->root.rb_node;
-
- /* Create and allocate the fname structure */
- len = sizeof(struct fname) + dirent->name_len + 1;
- new_fn = kzalloc(len, GFP_KERNEL);
- if (!new_fn)
- return -ENOMEM;
- new_fn->hash = hash;
- new_fn->minor_hash = minor_hash;
- new_fn->inode = le32_to_cpu(dirent->inode);
- new_fn->name_len = dirent->name_len;
- new_fn->file_type = dirent->file_type;
- memcpy(new_fn->name, dirent->name, dirent->name_len);
- new_fn->name[dirent->name_len] = 0;
-
- while (*p) {
- parent = *p;
- fname = rb_entry(parent, struct fname, rb_hash);
-
- /*
- * If the hash and minor hash match up, then we put
- * them on a linked list. This rarely happens...
- */
- if ((new_fn->hash == fname->hash) &&
- (new_fn->minor_hash == fname->minor_hash)) {
- new_fn->next = fname->next;
- fname->next = new_fn;
- return 0;
- }
-
- if (new_fn->hash < fname->hash)
- p = &(*p)->rb_left;
- else if (new_fn->hash > fname->hash)
- p = &(*p)->rb_right;
- else if (new_fn->minor_hash < fname->minor_hash)
- p = &(*p)->rb_left;
- else /* if (new_fn->minor_hash > fname->minor_hash) */
- p = &(*p)->rb_right;
- }
-
- rb_link_node(&new_fn->rb_hash, parent, p);
- rb_insert_color(&new_fn->rb_hash, &info->root);
- return 0;
-}
-
-
-
-/*
- * This is a helper function for ext3_dx_readdir. It calls filldir
- * for all entres on the fname linked list. (Normally there is only
- * one entry on the linked list, unless there are 62 bit hash collisions.)
- */
-static bool call_filldir(struct file *file, struct dir_context *ctx,
- struct fname *fname)
-{
- struct dir_private_info *info = file->private_data;
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
-
- if (!fname) {
- printk("call_filldir: called with null fname?!?\n");
- return true;
- }
- ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
- while (fname) {
- if (!dir_emit(ctx, fname->name, fname->name_len,
- fname->inode,
- get_dtype(sb, fname->file_type))) {
- info->extra_fname = fname;
- return false;
- }
- fname = fname->next;
- }
- return true;
-}
-
-static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
-{
- struct dir_private_info *info = file->private_data;
- struct inode *inode = file_inode(file);
- struct fname *fname;
- int ret;
-
- if (!info) {
- info = ext3_htree_create_dir_info(file, ctx->pos);
- if (!info)
- return -ENOMEM;
- file->private_data = info;
- }
-
- if (ctx->pos == ext3_get_htree_eof(file))
- return 0; /* EOF */
-
- /* Some one has messed with f_pos; reset the world */
- if (info->last_pos != ctx->pos) {
- free_rb_tree_fname(&info->root);
- info->curr_node = NULL;
- info->extra_fname = NULL;
- info->curr_hash = pos2maj_hash(file, ctx->pos);
- info->curr_minor_hash = pos2min_hash(file, ctx->pos);
- }
-
- /*
- * If there are any leftover names on the hash collision
- * chain, return them first.
- */
- if (info->extra_fname) {
- if (!call_filldir(file, ctx, info->extra_fname))
- goto finished;
- info->extra_fname = NULL;
- goto next_node;
- } else if (!info->curr_node)
- info->curr_node = rb_first(&info->root);
-
- while (1) {
- /*
- * Fill the rbtree if we have no more entries,
- * or the inode has changed since we last read in the
- * cached entries.
- */
- if ((!info->curr_node) ||
- (file->f_version != inode->i_version)) {
- info->curr_node = NULL;
- free_rb_tree_fname(&info->root);
- file->f_version = inode->i_version;
- ret = ext3_htree_fill_tree(file, info->curr_hash,
- info->curr_minor_hash,
- &info->next_hash);
- if (ret < 0)
- return ret;
- if (ret == 0) {
- ctx->pos = ext3_get_htree_eof(file);
- break;
- }
- info->curr_node = rb_first(&info->root);
- }
-
- fname = rb_entry(info->curr_node, struct fname, rb_hash);
- info->curr_hash = fname->hash;
- info->curr_minor_hash = fname->minor_hash;
- if (!call_filldir(file, ctx, fname))
- break;
- next_node:
- info->curr_node = rb_next(info->curr_node);
- if (info->curr_node) {
- fname = rb_entry(info->curr_node, struct fname,
- rb_hash);
- info->curr_hash = fname->hash;
- info->curr_minor_hash = fname->minor_hash;
- } else {
- if (info->next_hash == ~0) {
- ctx->pos = ext3_get_htree_eof(file);
- break;
- }
- info->curr_hash = info->next_hash;
- info->curr_minor_hash = 0;
- }
- }
-finished:
- info->last_pos = ctx->pos;
- return 0;
-}
-
-static int ext3_release_dir (struct inode * inode, struct file * filp)
-{
- if (filp->private_data)
- ext3_htree_free_dir_info(filp->private_data);
-
- return 0;
-}
-
-const struct file_operations ext3_dir_operations = {
- .llseek = ext3_dir_llseek,
- .read = generic_read_dir,
- .iterate = ext3_readdir,
- .unlocked_ioctl = ext3_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext3_compat_ioctl,
-#endif
- .fsync = ext3_sync_file,
- .release = ext3_release_dir,
-};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
deleted file mode 100644
index f483a80..0000000
--- a/fs/ext3/ext3.h
+++ /dev/null
@@ -1,1332 +0,0 @@
-/*
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
- *
- * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/include/linux/minix_fs.h
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/magic.h>
-#include <linux/bug.h>
-#include <linux/blockgroup_lock.h>
-
-/*
- * The second extended filesystem constants/structures
- */
-
-/*
- * Define EXT3FS_DEBUG to produce debug messages
- */
-#undef EXT3FS_DEBUG
-
-/*
- * Define EXT3_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT3_DEFAULT_RESERVE_BLOCKS 8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT3_MAX_RESERVE_BLOCKS 1027
-#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
- * Debug code
- */
-#ifdef EXT3FS_DEBUG
-#define ext3_debug(f, a...) \
- do { \
- printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __func__); \
- printk (KERN_DEBUG f, ## a); \
- } while (0)
-#else
-#define ext3_debug(f, a...) do {} while (0)
-#endif
-
-/*
- * Special inodes numbers
- */
-#define EXT3_BAD_INO 1 /* Bad blocks inode */
-#define EXT3_ROOT_INO 2 /* Root inode */
-#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
-#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
-#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
-#define EXT3_JOURNAL_INO 8 /* Journal inode */
-
-/* First non-reserved inode for old ext3 filesystems */
-#define EXT3_GOOD_OLD_FIRST_INO 11
-
-/*
- * Maximal count of links to a file
- */
-#define EXT3_LINK_MAX 32000
-
-/*
- * Macro-instructions used to manage several block sizes
- */
-#define EXT3_MIN_BLOCK_SIZE 1024
-#define EXT3_MAX_BLOCK_SIZE 65536
-#define EXT3_MIN_BLOCK_LOG_SIZE 10
-#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize)
-#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
-#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
-#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits)
-#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size)
-#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino)
-
-/*
- * Macro-instructions used to manage fragments
- */
-#define EXT3_MIN_FRAG_SIZE 1024
-#define EXT3_MAX_FRAG_SIZE 4096
-#define EXT3_MIN_FRAG_LOG_SIZE 10
-#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size)
-#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block)
-
-/*
- * Structure of a blocks group descriptor
- */
-struct ext3_group_desc
-{
- __le32 bg_block_bitmap; /* Blocks bitmap block */
- __le32 bg_inode_bitmap; /* Inodes bitmap block */
- __le32 bg_inode_table; /* Inodes table block */
- __le16 bg_free_blocks_count; /* Free blocks count */
- __le16 bg_free_inodes_count; /* Free inodes count */
- __le16 bg_used_dirs_count; /* Directories count */
- __u16 bg_pad;
- __le32 bg_reserved[3];
-};
-
-/*
- * Macro-instructions used to manage group descriptors
- */
-#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group)
-#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block)
-#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group)
-#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits)
-
-/*
- * Constants relative to the data blocks
- */
-#define EXT3_NDIR_BLOCKS 12
-#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS
-#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1)
-#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1)
-#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1)
-
-/*
- * Inode flags
- */
-#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */
-#define EXT3_UNRM_FL 0x00000002 /* Undelete */
-#define EXT3_COMPR_FL 0x00000004 /* Compress file */
-#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */
-#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */
-#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */
-#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */
-#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */
-/* Reserved for compression usage... */
-#define EXT3_DIRTY_FL 0x00000100
-#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
-#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */
-#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */
-/* End compression flags --- maybe not all used */
-#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */
-#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */
-#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
-#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */
-#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
-#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
-#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
-
-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
-#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
-
-/* Flags that should be inherited by new inodes from their parent. */
-#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
- EXT3_SYNC_FL | EXT3_NODUMP_FL |\
- EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
- EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
- EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
-
-/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
-{
- if (S_ISDIR(mode))
- return flags;
- else if (S_ISREG(mode))
- return flags & EXT3_REG_FLMASK;
- else
- return flags & EXT3_OTHER_FLMASK;
-}
-
-/* Used to pass group descriptor data when online resize is done */
-struct ext3_new_group_input {
- __u32 group; /* Group number for this data */
- __u32 block_bitmap; /* Absolute block number of block bitmap */
- __u32 inode_bitmap; /* Absolute block number of inode bitmap */
- __u32 inode_table; /* Absolute block number of inode table start */
- __u32 blocks_count; /* Total number of blocks in this group */
- __u16 reserved_blocks; /* Number of reserved blocks in this group */
- __u16 unused;
-};
-
-/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
-struct ext3_new_group_data {
- __u32 group;
- __u32 block_bitmap;
- __u32 inode_bitmap;
- __u32 inode_table;
- __u32 blocks_count;
- __u16 reserved_blocks;
- __u16 unused;
- __u32 free_blocks_count;
-};
-
-
-/*
- * ioctl commands
- */
-#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS
-#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
-#define EXT3_IOC_SETVERSION _IOW('f', 4, long)
-#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
-#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
-#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION
-#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION
-#ifdef CONFIG_JBD_DEBUG
-#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
-#endif
-#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
-#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
-
-/*
- * ioctl commands in 32 bit emulation
- */
-#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS
-#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
-#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
-#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
-#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
-#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
-#ifdef CONFIG_JBD_DEBUG
-#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
-#endif
-#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
-#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
-
-/* Number of supported quota types */
-#define EXT3_MAXQUOTAS 2
-
-/*
- * Mount options
- */
-struct ext3_mount_options {
- unsigned long s_mount_opt;
- kuid_t s_resuid;
- kgid_t s_resgid;
- unsigned long s_commit_interval;
-#ifdef CONFIG_QUOTA
- int s_jquota_fmt;
- char *s_qf_names[EXT3_MAXQUOTAS];
-#endif
-};
-
-/*
- * Structure of an inode on the disk
- */
-struct ext3_inode {
- __le16 i_mode; /* File mode */
- __le16 i_uid; /* Low 16 bits of Owner Uid */
- __le32 i_size; /* Size in bytes */
- __le32 i_atime; /* Access time */
- __le32 i_ctime; /* Creation time */
- __le32 i_mtime; /* Modification time */
- __le32 i_dtime; /* Deletion Time */
- __le16 i_gid; /* Low 16 bits of Group Id */
- __le16 i_links_count; /* Links count */
- __le32 i_blocks; /* Blocks count */
- __le32 i_flags; /* File flags */
- union {
- struct {
- __u32 l_i_reserved1;
- } linux1;
- struct {
- __u32 h_i_translator;
- } hurd1;
- struct {
- __u32 m_i_reserved1;
- } masix1;
- } osd1; /* OS dependent 1 */
- __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
- __le32 i_generation; /* File version (for NFS) */
- __le32 i_file_acl; /* File ACL */
- __le32 i_dir_acl; /* Directory ACL */
- __le32 i_faddr; /* Fragment address */
- union {
- struct {
- __u8 l_i_frag; /* Fragment number */
- __u8 l_i_fsize; /* Fragment size */
- __u16 i_pad1;
- __le16 l_i_uid_high; /* these 2 fields */
- __le16 l_i_gid_high; /* were reserved2[0] */
- __u32 l_i_reserved2;
- } linux2;
- struct {
- __u8 h_i_frag; /* Fragment number */
- __u8 h_i_fsize; /* Fragment size */
- __u16 h_i_mode_high;
- __u16 h_i_uid_high;
- __u16 h_i_gid_high;
- __u32 h_i_author;
- } hurd2;
- struct {
- __u8 m_i_frag; /* Fragment number */
- __u8 m_i_fsize; /* Fragment size */
- __u16 m_pad1;
- __u32 m_i_reserved2[2];
- } masix2;
- } osd2; /* OS dependent 2 */
- __le16 i_extra_isize;
- __le16 i_pad1;
-};
-
-#define i_size_high i_dir_acl
-
-#define i_reserved1 osd1.linux1.l_i_reserved1
-#define i_frag osd2.linux2.l_i_frag
-#define i_fsize osd2.linux2.l_i_fsize
-#define i_uid_low i_uid
-#define i_gid_low i_gid
-#define i_uid_high osd2.linux2.l_i_uid_high
-#define i_gid_high osd2.linux2.l_i_gid_high
-#define i_reserved2 osd2.linux2.l_i_reserved2
-
-/*
- * File system states
- */
-#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */
-#define EXT3_ERROR_FS 0x0002 /* Errors detected */
-#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
-
-/*
- * Misc. filesystem flags
- */
-#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
-#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
-#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
-
-/*
- * Mount flags
- */
-#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
-/* EXT3_MOUNT_OLDALLOC was there */
-#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
-#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
-#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
-#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
-#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
-#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
-#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
-#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
-#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
-#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
-#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
-#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
-#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
-#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
-#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
-#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
-#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
-#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
-#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
-#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
-#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
-#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
- * error in ordered mode */
-
-/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
-#ifndef _LINUX_EXT2_FS_H
-#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
-#define set_opt(o, opt) o |= EXT3_MOUNT_##opt
-#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \
- EXT3_MOUNT_##opt)
-#else
-#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD
-#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
-#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
-#endif
-
-#define ext3_set_bit __set_bit_le
-#define ext3_set_bit_atomic ext2_set_bit_atomic
-#define ext3_clear_bit __clear_bit_le
-#define ext3_clear_bit_atomic ext2_clear_bit_atomic
-#define ext3_test_bit test_bit_le
-#define ext3_find_next_zero_bit find_next_zero_bit_le
-
-/*
- * Maximal mount counts between two filesystem checks
- */
-#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
-#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */
-
-/*
- * Behaviour when detecting errors
- */
-#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */
-#define EXT3_ERRORS_RO 2 /* Remount fs read-only */
-#define EXT3_ERRORS_PANIC 3 /* Panic */
-#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE
-
-/*
- * Structure of the super block
- */
-struct ext3_super_block {
-/*00*/ __le32 s_inodes_count; /* Inodes count */
- __le32 s_blocks_count; /* Blocks count */
- __le32 s_r_blocks_count; /* Reserved blocks count */
- __le32 s_free_blocks_count; /* Free blocks count */
-/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
- __le32 s_first_data_block; /* First Data Block */
- __le32 s_log_block_size; /* Block size */
- __le32 s_log_frag_size; /* Fragment size */
-/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
- __le32 s_frags_per_group; /* # Fragments per group */
- __le32 s_inodes_per_group; /* # Inodes per group */
- __le32 s_mtime; /* Mount time */
-/*30*/ __le32 s_wtime; /* Write time */
- __le16 s_mnt_count; /* Mount count */
- __le16 s_max_mnt_count; /* Maximal mount count */
- __le16 s_magic; /* Magic signature */
- __le16 s_state; /* File system state */
- __le16 s_errors; /* Behaviour when detecting errors */
- __le16 s_minor_rev_level; /* minor revision level */
-/*40*/ __le32 s_lastcheck; /* time of last check */
- __le32 s_checkinterval; /* max. time between checks */
- __le32 s_creator_os; /* OS */
- __le32 s_rev_level; /* Revision level */
-/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
- __le16 s_def_resgid; /* Default gid for reserved blocks */
- /*
- * These fields are for EXT3_DYNAMIC_REV superblocks only.
- *
- * Note: the difference between the compatible feature set and
- * the incompatible feature set is that if there is a bit set
- * in the incompatible feature set that the kernel doesn't
- * know about, it should refuse to mount the filesystem.
- *
- * e2fsck's requirements are more strict; if it doesn't know
- * about a feature in either the compatible or incompatible
- * feature set, it must abort and not try to meddle with
- * things it doesn't understand...
- */
- __le32 s_first_ino; /* First non-reserved inode */
- __le16 s_inode_size; /* size of inode structure */
- __le16 s_block_group_nr; /* block group # of this superblock */
- __le32 s_feature_compat; /* compatible feature set */
-/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
- __le32 s_feature_ro_compat; /* readonly-compatible feature set */
-/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[16]; /* volume name */
-/*88*/ char s_last_mounted[64]; /* directory where last mounted */
-/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
- /*
- * Performance hints. Directory preallocation should only
- * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
- */
- __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
- __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
- __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
- /*
- * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
- */
-/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
-/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
- __le32 s_journal_dev; /* device number of journal file */
- __le32 s_last_orphan; /* start of list of inodes to delete */
- __le32 s_hash_seed[4]; /* HTREE hash seed */
- __u8 s_def_hash_version; /* Default hash version to use */
- __u8 s_reserved_char_pad;
- __u16 s_reserved_word_pad;
- __le32 s_default_mount_opts;
- __le32 s_first_meta_bg; /* First metablock block group */
- __le32 s_mkfs_time; /* When the filesystem was created */
- __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
- /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
-/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
- __le32 s_r_blocks_count_hi; /* Reserved blocks count */
- __le32 s_free_blocks_count_hi; /* Free blocks count */
- __le16 s_min_extra_isize; /* All inodes have at least # bytes */
- __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
- __le32 s_flags; /* Miscellaneous flags */
- __le16 s_raid_stride; /* RAID stride */
- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
- __le64 s_mmp_block; /* Block for multi-mount protection */
- __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u8 s_log_groups_per_flex; /* FLEX_BG group size */
- __u8 s_reserved_char_pad2;
- __le16 s_reserved_pad;
- __u32 s_reserved[162]; /* Padding to the end of the block */
-};
-
-/* data type for block offset of block group */
-typedef int ext3_grpblk_t;
-
-/* data type for filesystem-wide blocks number */
-typedef unsigned long ext3_fsblk_t;
-
-#define E3FSBLK "%lu"
-
-struct ext3_reserve_window {
- ext3_fsblk_t _rsv_start; /* First byte reserved */
- ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
-};
-
-struct ext3_reserve_window_node {
- struct rb_node rsv_node;
- __u32 rsv_goal_size;
- __u32 rsv_alloc_hit;
- struct ext3_reserve_window rsv_window;
-};
-
-struct ext3_block_alloc_info {
- /* information about reservation window */
- struct ext3_reserve_window_node rsv_window_node;
- /*
- * was i_next_alloc_block in ext3_inode_info
- * is the logical (file-relative) number of the
- * most-recently-allocated block in this file.
- * We use this for detecting linearly ascending allocation requests.
- */
- __u32 last_alloc_logical_block;
- /*
- * Was i_next_alloc_goal in ext3_inode_info
- * is the *physical* companion to i_next_alloc_block.
- * it the physical block number of the block which was most-recentl
- * allocated to this file. This give us the goal (target) for the next
- * allocation when we detect linearly ascending requests.
- */
- ext3_fsblk_t last_alloc_physical_block;
-};
-
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
-/*
- * third extended file system inode data in memory
- */
-struct ext3_inode_info {
- __le32 i_data[15]; /* unconverted */
- __u32 i_flags;
-#ifdef EXT3_FRAGMENTS
- __u32 i_faddr;
- __u8 i_frag_no;
- __u8 i_frag_size;
-#endif
- ext3_fsblk_t i_file_acl;
- __u32 i_dir_acl;
- __u32 i_dtime;
-
- /*
- * i_block_group is the number of the block group which contains
- * this file's inode. Constant across the lifetime of the inode,
- * it is ued for making block allocation decisions - we try to
- * place a file's data blocks near its inode block, and new inodes
- * near to their parent directory's inode.
- */
- __u32 i_block_group;
- unsigned long i_state_flags; /* Dynamic state flags for ext3 */
-
- /* block reservation info */
- struct ext3_block_alloc_info *i_block_alloc_info;
-
- __u32 i_dir_start_lookup;
-#ifdef CONFIG_EXT3_FS_XATTR
- /*
- * Extended attributes can be read independently of the main file
- * data. Taking i_mutex even when reading would cause contention
- * between readers of EAs and writers of regular file data, so
- * instead we synchronize on xattr_sem when reading or changing
- * EAs.
- */
- struct rw_semaphore xattr_sem;
-#endif
-
- struct list_head i_orphan; /* unlinked but open inodes */
-
- /*
- * i_disksize keeps track of what the inode size is ON DISK, not
- * in memory. During truncate, i_size is set to the new size by
- * the VFS prior to calling ext3_truncate(), but the filesystem won't
- * set i_disksize to 0 until the truncate is actually under way.
- *
- * The intent is that i_disksize always represents the blocks which
- * are used by this file. This allows recovery to restart truncate
- * on orphans if we crash during truncate. We actually write i_disksize
- * into the on-disk inode when writing inodes out, instead of i_size.
- *
- * The only time when i_disksize and i_size may be different is when
- * a truncate is in progress. The only things which change i_disksize
- * are ext3_get_block (growth) and ext3_truncate (shrinkth).
- */
- loff_t i_disksize;
-
- /* on-disk additional length */
- __u16 i_extra_isize;
-
- /*
- * truncate_mutex is for serialising ext3_truncate() against
- * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
- * data tree are chopped off during truncate. We can't do that in
- * ext3 because whenever we perform intermediate commits during
- * truncate, the inode and all the metadata blocks *must* be in a
- * consistent state which allows truncation of the orphans to restart
- * during recovery. Hence we must fix the get_block-vs-truncate race
- * by other means, so we have truncate_mutex.
- */
- struct mutex truncate_mutex;
-
- /*
- * Transactions that contain inode's metadata needed to complete
- * fsync and fdatasync, respectively.
- */
- atomic_t i_sync_tid;
- atomic_t i_datasync_tid;
-
-#ifdef CONFIG_QUOTA
- struct dquot *i_dquot[MAXQUOTAS];
-#endif
-
- struct inode vfs_inode;
-};
-
-/*
- * third extended-fs super-block data in memory
- */
-struct ext3_sb_info {
- unsigned long s_frag_size; /* Size of a fragment in bytes */
- unsigned long s_frags_per_block;/* Number of fragments per block */
- unsigned long s_inodes_per_block;/* Number of inodes per block */
- unsigned long s_frags_per_group;/* Number of fragments in a group */
- unsigned long s_blocks_per_group;/* Number of blocks in a group */
- unsigned long s_inodes_per_group;/* Number of inodes in a group */
- unsigned long s_itb_per_group; /* Number of inode table blocks per group */
- unsigned long s_gdb_count; /* Number of group descriptor blocks */
- unsigned long s_desc_per_block; /* Number of group descriptors per block */
- unsigned long s_groups_count; /* Number of groups in the fs */
- unsigned long s_overhead_last; /* Last calculated overhead */
- unsigned long s_blocks_last; /* Last seen block count */
- struct buffer_head * s_sbh; /* Buffer containing the super block */
- struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
- struct buffer_head ** s_group_desc;
- unsigned long s_mount_opt;
- ext3_fsblk_t s_sb_block;
- kuid_t s_resuid;
- kgid_t s_resgid;
- unsigned short s_mount_state;
- unsigned short s_pad;
- int s_addr_per_block_bits;
- int s_desc_per_block_bits;
- int s_inode_size;
- int s_first_ino;
- spinlock_t s_next_gen_lock;
- u32 s_next_generation;
- u32 s_hash_seed[4];
- int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
- struct percpu_counter s_freeblocks_counter;
- struct percpu_counter s_freeinodes_counter;
- struct percpu_counter s_dirs_counter;
- struct blockgroup_lock *s_blockgroup_lock;
-
- /* root of the per fs reservation window tree */
- spinlock_t s_rsv_window_lock;
- struct rb_root s_rsv_window_root;
- struct ext3_reserve_window_node s_rsv_window_head;
-
- /* Journaling */
- struct inode * s_journal_inode;
- struct journal_s * s_journal;
- struct list_head s_orphan;
- struct mutex s_orphan_lock;
- struct mutex s_resize_lock;
- unsigned long s_commit_interval;
- struct block_device *journal_bdev;
-#ifdef CONFIG_QUOTA
- char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
-#endif
-};
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
-{
- return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-
-static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
-{
- return container_of(inode, struct ext3_inode_info, vfs_inode);
-}
-
-static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
-{
- return ino == EXT3_ROOT_INO ||
- ino == EXT3_JOURNAL_INO ||
- ino == EXT3_RESIZE_INO ||
- (ino >= EXT3_FIRST_INO(sb) &&
- ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
-}
-
-/*
- * Inode dynamic state flags
- */
-enum {
- EXT3_STATE_JDATA, /* journaled data exists */
- EXT3_STATE_NEW, /* inode is newly created */
- EXT3_STATE_XATTR, /* has in-inode xattrs */
- EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
-};
-
-static inline int ext3_test_inode_state(struct inode *inode, int bit)
-{
- return test_bit(bit, &EXT3_I(inode)->i_state_flags);
-}
-
-static inline void ext3_set_inode_state(struct inode *inode, int bit)
-{
- set_bit(bit, &EXT3_I(inode)->i_state_flags);
-}
-
-static inline void ext3_clear_inode_state(struct inode *inode, int bit)
-{
- clear_bit(bit, &EXT3_I(inode)->i_state_flags);
-}
-
-#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
-
-/*
- * Codes for operating systems
- */
-#define EXT3_OS_LINUX 0
-#define EXT3_OS_HURD 1
-#define EXT3_OS_MASIX 2
-#define EXT3_OS_FREEBSD 3
-#define EXT3_OS_LITES 4
-
-/*
- * Revision levels
- */
-#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */
-#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
-
-#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
-#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV
-
-#define EXT3_GOOD_OLD_INODE_SIZE 128
-
-/*
- * Feature set definitions
- */
-
-#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \
- ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
-#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \
- ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
-#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \
- ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
-#define EXT3_SET_COMPAT_FEATURE(sb,mask) \
- EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
-#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \
- EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
-#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \
- EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
-#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \
- EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
-#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
- EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
-#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \
- EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
-
-#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001
-#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002
-#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
-#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008
-#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010
-#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020
-
-#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
-#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
-#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
-
-#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001
-#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002
-#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
-#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
-#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
-
-#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
-#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
- EXT3_FEATURE_INCOMPAT_RECOVER| \
- EXT3_FEATURE_INCOMPAT_META_BG)
-#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
- EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
- EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
-
-/*
- * Default values for user and/or group using reserved blocks
- */
-#define EXT3_DEF_RESUID 0
-#define EXT3_DEF_RESGID 0
-
-/*
- * Default mount options
- */
-#define EXT3_DEFM_DEBUG 0x0001
-#define EXT3_DEFM_BSDGROUPS 0x0002
-#define EXT3_DEFM_XATTR_USER 0x0004
-#define EXT3_DEFM_ACL 0x0008
-#define EXT3_DEFM_UID16 0x0010
-#define EXT3_DEFM_JMODE 0x0060
-#define EXT3_DEFM_JMODE_DATA 0x0020
-#define EXT3_DEFM_JMODE_ORDERED 0x0040
-#define EXT3_DEFM_JMODE_WBACK 0x0060
-
-/*
- * Structure of a directory entry
- */
-#define EXT3_NAME_LEN 255
-
-struct ext3_dir_entry {
- __le32 inode; /* Inode number */
- __le16 rec_len; /* Directory entry length */
- __le16 name_len; /* Name length */
- char name[EXT3_NAME_LEN]; /* File name */
-};
-
-/*
- * The new version of the directory entry. Since EXT3 structures are
- * stored in intel byte order, and the name_len field could never be
- * bigger than 255 chars, it's safe to reclaim the extra byte for the
- * file_type field.
- */
-struct ext3_dir_entry_2 {
- __le32 inode; /* Inode number */
- __le16 rec_len; /* Directory entry length */
- __u8 name_len; /* Name length */
- __u8 file_type;
- char name[EXT3_NAME_LEN]; /* File name */
-};
-
-/*
- * Ext3 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-#define EXT3_FT_UNKNOWN 0
-#define EXT3_FT_REG_FILE 1
-#define EXT3_FT_DIR 2
-#define EXT3_FT_CHRDEV 3
-#define EXT3_FT_BLKDEV 4
-#define EXT3_FT_FIFO 5
-#define EXT3_FT_SOCK 6
-#define EXT3_FT_SYMLINK 7
-
-#define EXT3_FT_MAX 8
-
-/*
- * EXT3_DIR_PAD defines the directory entries boundaries
- *
- * NOTE: It must be a multiple of 4
- */
-#define EXT3_DIR_PAD 4
-#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
-#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
- ~EXT3_DIR_ROUND)
-#define EXT3_MAX_REC_LEN ((1<<16)-1)
-
-/*
- * Tests against MAX_REC_LEN etc were put in place for 64k block
- * sizes; if that is not possible on this arch, we can skip
- * those tests and speed things up.
- */
-static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
-{
- unsigned len = le16_to_cpu(dlen);
-
-#if (PAGE_CACHE_SIZE >= 65536)
- if (len == EXT3_MAX_REC_LEN)
- return 1 << 16;
-#endif
- return len;
-}
-
-static inline __le16 ext3_rec_len_to_disk(unsigned len)
-{
-#if (PAGE_CACHE_SIZE >= 65536)
- if (len == (1 << 16))
- return cpu_to_le16(EXT3_MAX_REC_LEN);
- else if (len > (1 << 16))
- BUG();
-#endif
- return cpu_to_le16(len);
-}
-
-/*
- * Hash Tree Directory indexing
- * (c) Daniel Phillips, 2001
- */
-
-#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
-
-/* Legal values for the dx_root hash_version field: */
-
-#define DX_HASH_LEGACY 0
-#define DX_HASH_HALF_MD4 1
-#define DX_HASH_TEA 2
-#define DX_HASH_LEGACY_UNSIGNED 3
-#define DX_HASH_HALF_MD4_UNSIGNED 4
-#define DX_HASH_TEA_UNSIGNED 5
-
-/* hash info structure used by the directory hash */
-struct dx_hash_info
-{
- u32 hash;
- u32 minor_hash;
- int hash_version;
- u32 *seed;
-};
-
-
-/* 32 and 64 bit signed EOF for dx directories */
-#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
-#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
-
-
-/*
- * Control parameters used by ext3_htree_next_block
- */
-#define HASH_NB_ALWAYS 1
-
-
-/*
- * Describe an inode's exact location on disk and in memory
- */
-struct ext3_iloc
-{
- struct buffer_head *bh;
- unsigned long offset;
- unsigned long block_group;
-};
-
-static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
-{
- return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
-}
-
-/*
- * This structure is stuffed into the struct file's private_data field
- * for directories. It is where we put information so that we can do
- * readdir operations in hash tree order.
- */
-struct dir_private_info {
- struct rb_root root;
- struct rb_node *curr_node;
- struct fname *extra_fname;
- loff_t last_pos;
- __u32 curr_hash;
- __u32 curr_minor_hash;
- __u32 next_hash;
-};
-
-/* calculate the first block number of the group */
-static inline ext3_fsblk_t
-ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
-{
- return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
- le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
-}
-
-/*
- * Special error return code only used by dx_probe() and its callers.
- */
-#define ERR_BAD_DX_DIR -75000
-
-/*
- * Function prototypes
- */
-
-/*
- * Ok, these declarations are also in <linux/kernel.h> but none of the
- * ext3 source programs needs to include it so they are duplicated here.
- */
-# define NORET_TYPE /**/
-# define ATTRIB_NORET __attribute__((noreturn))
-# define NORET_AND noreturn,
-
-/* balloc.c */
-extern int ext3_bg_has_super(struct super_block *sb, int group);
-extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
-extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, int *errp);
-extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, unsigned long *count, int *errp);
-extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
- ext3_fsblk_t block, unsigned long count);
-extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
- ext3_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks);
-extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
-extern void ext3_check_blocks_bitmap (struct super_block *);
-extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
- unsigned int block_group,
- struct buffer_head ** bh);
-extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
-extern void ext3_init_block_alloc_info(struct inode *);
-extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
-extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
-
-/* dir.c */
-extern int ext3_check_dir_entry(const char *, struct inode *,
- struct ext3_dir_entry_2 *,
- struct buffer_head *, unsigned long);
-extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
- __u32 minor_hash,
- struct ext3_dir_entry_2 *dirent);
-extern void ext3_htree_free_dir_info(struct dir_private_info *p);
-
-/* fsync.c */
-extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
-
-/* hash.c */
-extern int ext3fs_dirhash(const char *name, int len, struct
- dx_hash_info *hinfo);
-
-/* ialloc.c */
-extern struct inode * ext3_new_inode (handle_t *, struct inode *,
- const struct qstr *, umode_t);
-extern void ext3_free_inode (handle_t *, struct inode *);
-extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
-extern unsigned long ext3_count_free_inodes (struct super_block *);
-extern unsigned long ext3_count_dirs (struct super_block *);
-extern void ext3_check_inodes_bitmap (struct super_block *);
-extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
-
-
-/* inode.c */
-int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext3_fsblk_t blocknr);
-struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
-struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
-int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
- sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
- int create);
-
-extern struct inode *ext3_iget(struct super_block *, unsigned long);
-extern int ext3_write_inode (struct inode *, struct writeback_control *);
-extern int ext3_setattr (struct dentry *, struct iattr *);
-extern void ext3_evict_inode (struct inode *);
-extern int ext3_sync_inode (handle_t *, struct inode *);
-extern void ext3_discard_reservation (struct inode *);
-extern void ext3_dirty_inode(struct inode *, int);
-extern int ext3_change_inode_journal_flag(struct inode *, int);
-extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
-extern int ext3_can_truncate(struct inode *inode);
-extern void ext3_truncate(struct inode *inode);
-extern void ext3_set_inode_flags(struct inode *);
-extern void ext3_get_inode_flags(struct ext3_inode_info *);
-extern void ext3_set_aops(struct inode *inode);
-extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len);
-
-/* ioctl.c */
-extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
-extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
-
-/* namei.c */
-extern int ext3_orphan_add(handle_t *, struct inode *);
-extern int ext3_orphan_del(handle_t *, struct inode *);
-extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
- __u32 start_minor_hash, __u32 *next_hash);
-
-/* resize.c */
-extern int ext3_group_add(struct super_block *sb,
- struct ext3_new_group_data *input);
-extern int ext3_group_extend(struct super_block *sb,
- struct ext3_super_block *es,
- ext3_fsblk_t n_blocks_count);
-
-/* super.c */
-extern __printf(3, 4)
-void ext3_error(struct super_block *, const char *, const char *, ...);
-extern void __ext3_std_error (struct super_block *, const char *, int);
-extern __printf(3, 4)
-void ext3_abort(struct super_block *, const char *, const char *, ...);
-extern __printf(3, 4)
-void ext3_warning(struct super_block *, const char *, const char *, ...);
-extern __printf(3, 4)
-void ext3_msg(struct super_block *, const char *, const char *, ...);
-extern void ext3_update_dynamic_rev (struct super_block *sb);
-
-#define ext3_std_error(sb, errno) \
-do { \
- if ((errno)) \
- __ext3_std_error((sb), __func__, (errno)); \
-} while (0)
-
-/*
- * Inodes and files operations
- */
-
-/* dir.c */
-extern const struct file_operations ext3_dir_operations;
-
-/* file.c */
-extern const struct inode_operations ext3_file_inode_operations;
-extern const struct file_operations ext3_file_operations;
-
-/* namei.c */
-extern const struct inode_operations ext3_dir_inode_operations;
-extern const struct inode_operations ext3_special_inode_operations;
-
-/* symlink.c */
-extern const struct inode_operations ext3_symlink_inode_operations;
-extern const struct inode_operations ext3_fast_symlink_inode_operations;
-
-#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal)
-
-/* Define the number of blocks we need to account to a transaction to
- * modify one block of data.
- *
- * We may have to touch one inode, one bitmap buffer, up to three
- * indirection blocks, the group and superblock summaries, and the data
- * block to complete the transaction. */
-
-#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
-
-/* Extended attribute operations touch at most two data buffers,
- * two bitmap buffers, and two group summaries, in addition to the inode
- * and the superblock, which are already accounted for. */
-
-#define EXT3_XATTR_TRANS_BLOCKS 6U
-
-/* Define the minimum size for a transaction which modifies data. This
- * needs to take into account the fact that we may end up modifying two
- * quota files too (one for the group, one for the user quota). The
- * superblock only gets updated once, of course, so don't bother
- * counting that again for the quota updates. */
-
-#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \
- EXT3_XATTR_TRANS_BLOCKS - 2 + \
- EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
-
-/* Delete operations potentially hit one directory's namespace plus an
- * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
- * generous. We can grow the delete transaction later if necessary. */
-
-#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
-
-/* Define an arbitrary limit for the amount of data we will anticipate
- * writing to any given transaction. For unbounded transactions such as
- * write(2) and truncate(2) we can write more than this, but we always
- * start off at the maximum transaction size and grow the transaction
- * optimistically as we go. */
-
-#define EXT3_MAX_TRANS_DATA 64U
-
-/* We break up a large truncate or write transaction once the handle's
- * buffer credits gets this low, we need either to extend the
- * transaction or to start a new one. Reserve enough space here for
- * inode, bitmap, superblock, group and indirection updates for at least
- * one block, plus two quota updates. Quota allocations are not
- * needed. */
-
-#define EXT3_RESERVE_TRANS_BLOCKS 12U
-
-#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
-
-#ifdef CONFIG_QUOTA
-/* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
-/* Amount of blocks needed for quota insert/delete - we do some block writes
- * but inode, sb and group updates are done only once */
-#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
- (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
-#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
- (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
-#else
-#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
-#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
-#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
-#endif
-#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
-#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
-
-int
-ext3_mark_iloc_dirty(handle_t *handle,
- struct inode *inode,
- struct ext3_iloc *iloc);
-
-/*
- * On success, We end up with an outstanding reference count against
- * iloc->bh. This _must_ be cleaned up later.
- */
-
-int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
- struct ext3_iloc *iloc);
-
-int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
-
-/*
- * Wrapper functions with which ext3 calls into JBD. The intent here is
- * to allow these to be turned into appropriate stubs so ext3 can control
- * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't
- * been done yet.
- */
-
-static inline void ext3_journal_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- journal_release_buffer(handle, bh);
-}
-
-void ext3_journal_abort_handle(const char *caller, const char *err_fn,
- struct buffer_head *bh, handle_t *handle, int err);
-
-int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
- struct buffer_head *bh);
-
-int __ext3_journal_get_write_access(const char *where, handle_t *handle,
- struct buffer_head *bh);
-
-int __ext3_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh);
-
-int __ext3_journal_revoke(const char *where, handle_t *handle,
- unsigned long blocknr, struct buffer_head *bh);
-
-int __ext3_journal_get_create_access(const char *where,
- handle_t *handle, struct buffer_head *bh);
-
-int __ext3_journal_dirty_metadata(const char *where,
- handle_t *handle, struct buffer_head *bh);
-
-#define ext3_journal_get_undo_access(handle, bh) \
- __ext3_journal_get_undo_access(__func__, (handle), (bh))
-#define ext3_journal_get_write_access(handle, bh) \
- __ext3_journal_get_write_access(__func__, (handle), (bh))
-#define ext3_journal_revoke(handle, blocknr, bh) \
- __ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
-#define ext3_journal_get_create_access(handle, bh) \
- __ext3_journal_get_create_access(__func__, (handle), (bh))
-#define ext3_journal_dirty_metadata(handle, bh) \
- __ext3_journal_dirty_metadata(__func__, (handle), (bh))
-#define ext3_journal_forget(handle, bh) \
- __ext3_journal_forget(__func__, (handle), (bh))
-
-int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
-
-handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
-int __ext3_journal_stop(const char *where, handle_t *handle);
-
-static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
-{
- return ext3_journal_start_sb(inode->i_sb, nblocks);
-}
-
-#define ext3_journal_stop(handle) \
- __ext3_journal_stop(__func__, (handle))
-
-static inline handle_t *ext3_journal_current_handle(void)
-{
- return journal_current_handle();
-}
-
-static inline int ext3_journal_extend(handle_t *handle, int nblocks)
-{
- return journal_extend(handle, nblocks);
-}
-
-static inline int ext3_journal_restart(handle_t *handle, int nblocks)
-{
- return journal_restart(handle, nblocks);
-}
-
-static inline int ext3_journal_blocks_per_page(struct inode *inode)
-{
- return journal_blocks_per_page(inode);
-}
-
-static inline int ext3_journal_force_commit(journal_t *journal)
-{
- return journal_force_commit(journal);
-}
-
-/* super.c */
-int ext3_force_commit(struct super_block *sb);
-
-static inline int ext3_should_journal_data(struct inode *inode)
-{
- if (!S_ISREG(inode->i_mode))
- return 1;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
- return 1;
- if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
- return 1;
- return 0;
-}
-
-static inline int ext3_should_order_data(struct inode *inode)
-{
- if (!S_ISREG(inode->i_mode))
- return 0;
- if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
- return 0;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
- return 1;
- return 0;
-}
-
-static inline int ext3_should_writeback_data(struct inode *inode)
-{
- if (!S_ISREG(inode->i_mode))
- return 0;
- if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
- return 0;
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
- return 1;
- return 0;
-}
-
-#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
deleted file mode 100644
index 785a326..0000000
--- a/fs/ext3/ext3_jbd.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Interface between ext3 and JBD
- */
-
-#include "ext3.h"
-
-int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
- struct buffer_head *bh)
-{
- int err = journal_get_undo_access(handle, bh);
- if (err)
- ext3_journal_abort_handle(where, __func__, bh, handle,err);
- return err;
-}
-
-int __ext3_journal_get_write_access(const char *where, handle_t *handle,
- struct buffer_head *bh)
-{
- int err = journal_get_write_access(handle, bh);
- if (err)
- ext3_journal_abort_handle(where, __func__, bh, handle,err);
- return err;
-}
-
-int __ext3_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh)
-{
- int err = journal_forget(handle, bh);
- if (err)
- ext3_journal_abort_handle(where, __func__, bh, handle,err);
- return err;
-}
-
-int __ext3_journal_revoke(const char *where, handle_t *handle,
- unsigned long blocknr, struct buffer_head *bh)
-{
- int err = journal_revoke(handle, blocknr, bh);
- if (err)
- ext3_journal_abort_handle(where, __func__, bh, handle,err);
- return err;
-}
-
-int __ext3_journal_get_create_access(const char *where,
- handle_t *handle, struct buffer_head *bh)
-{
- int err = journal_get_create_access(handle, bh);
- if (err)
- ext3_journal_abort_handle(where, __func__, bh, handle,err);
- return err;
-}
-
-int __ext3_journal_dirty_metadata(const char *where,
- handle_t *handle, struct buffer_head *bh)
-{
- int err = journal_dirty_metadata(handle, bh);
- if (err)
- ext3_journal_abort_handle(where, __func__, bh, handle,err);
- return err;
-}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
deleted file mode 100644
index 3b8f650..0000000
--- a/fs/ext3/file.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * linux/fs/ext3/file.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/file.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext3 fs regular file handling primitives
- *
- * 64-bit file support on 64-bit platforms by Jakub Jelinek
- * (jj@sunsite.ms.mff.cuni.cz)
- */
-
-#include <linux/quotaops.h>
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * Called when an inode is released. Note that this is different
- * from ext3_file_open: open gets called at every open, but release
- * gets called only when /all/ the files are closed.
- */
-static int ext3_release_file (struct inode * inode, struct file * filp)
-{
- if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
- filemap_flush(inode->i_mapping);
- ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
- }
- /* if we are the last writer on the inode, drop the block reservation */
- if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
- {
- mutex_lock(&EXT3_I(inode)->truncate_mutex);
- ext3_discard_reservation(inode);
- mutex_unlock(&EXT3_I(inode)->truncate_mutex);
- }
- if (is_dx(inode) && filp->private_data)
- ext3_htree_free_dir_info(filp->private_data);
-
- return 0;
-}
-
-const struct file_operations ext3_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .unlocked_ioctl = ext3_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext3_compat_ioctl,
-#endif
- .mmap = generic_file_mmap,
- .open = dquot_file_open,
- .release = ext3_release_file,
- .fsync = ext3_sync_file,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
-};
-
-const struct inode_operations ext3_file_inode_operations = {
- .setattr = ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ext3_listxattr,
- .removexattr = generic_removexattr,
-#endif
- .get_acl = ext3_get_acl,
- .set_acl = ext3_set_acl,
- .fiemap = ext3_fiemap,
-};
-
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
deleted file mode 100644
index 1cb9c7e..0000000
--- a/fs/ext3/fsync.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * linux/fs/ext3/fsync.c
- *
- * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
- * from
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext3fs fsync primitive
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- *
- * Removed unnecessary code duplication for little endian machines
- * and excessive __inline__s.
- * Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-
-#include <linux/blkdev.h>
-#include <linux/writeback.h>
-#include "ext3.h"
-
-/*
- * akpm: A new design for ext3_sync_file().
- *
- * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
- * There cannot be a transaction open by this task.
- * Another task could have dirtied this inode. Its data can be in any
- * state in the journalling system.
- *
- * What we do is just kick off a commit and wait on it. This will snapshot the
- * inode to disk.
- */
-
-int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
-{
- struct inode *inode = file->f_mapping->host;
- struct ext3_inode_info *ei = EXT3_I(inode);
- journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
- int ret, needs_barrier = 0;
- tid_t commit_tid;
-
- trace_ext3_sync_file_enter(file, datasync);
-
- if (inode->i_sb->s_flags & MS_RDONLY) {
- /* Make sure that we read updated state */
- smp_rmb();
- if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
- return -EROFS;
- return 0;
- }
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- goto out;
-
- J_ASSERT(ext3_journal_current_handle() == NULL);
-
- /*
- * data=writeback,ordered:
- * The caller's filemap_fdatawrite()/wait will sync the data.
- * Metadata is in the journal, we wait for a proper transaction
- * to commit here.
- *
- * data=journal:
- * filemap_fdatawrite won't do anything (the buffers are clean).
- * ext3_force_commit will write the file data into the journal and
- * will wait on that.
- * filemap_fdatawait() will encounter a ton of newly-dirtied pages
- * (they were dirtied by commit). But that's OK - the blocks are
- * safe in-journal, which is all fsync() needs to ensure.
- */
- if (ext3_should_journal_data(inode)) {
- ret = ext3_force_commit(inode->i_sb);
- goto out;
- }
-
- if (datasync)
- commit_tid = atomic_read(&ei->i_datasync_tid);
- else
- commit_tid = atomic_read(&ei->i_sync_tid);
-
- if (test_opt(inode->i_sb, BARRIER) &&
- !journal_trans_will_send_data_barrier(journal, commit_tid))
- needs_barrier = 1;
- log_start_commit(journal, commit_tid);
- ret = log_wait_commit(journal, commit_tid);
-
- /*
- * In case we didn't commit a transaction, we have to flush
- * disk caches manually so that data really is on persistent
- * storage
- */
- if (needs_barrier) {
- int err;
-
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- if (!ret)
- ret = err;
- }
-out:
- trace_ext3_sync_file_exit(inode, ret);
- return ret;
-}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
deleted file mode 100644
index ede315c..0000000
--- a/fs/ext3/hash.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * linux/fs/ext3/hash.c
- *
- * Copyright (C) 2002 by Theodore Ts'o
- *
- * This file is released under the GPL v2.
- *
- * This file may be redistributed under the terms of the GNU Public
- * License.
- */
-
-#include "ext3.h"
-#include <linux/cryptohash.h>
-
-#define DELTA 0x9E3779B9
-
-static void TEA_transform(__u32 buf[4], __u32 const in[])
-{
- __u32 sum = 0;
- __u32 b0 = buf[0], b1 = buf[1];
- __u32 a = in[0], b = in[1], c = in[2], d = in[3];
- int n = 16;
-
- do {
- sum += DELTA;
- b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
- b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
- } while(--n);
-
- buf[0] += b0;
- buf[1] += b1;
-}
-
-
-/* The old legacy hash */
-static __u32 dx_hack_hash_unsigned(const char *name, int len)
-{
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const unsigned char *ucp = (const unsigned char *) name;
-
- while (len--) {
- hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
-
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
- hash1 = hash0;
- hash0 = hash;
- }
- return hash0 << 1;
-}
-
-static __u32 dx_hack_hash_signed(const char *name, int len)
-{
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const signed char *scp = (const signed char *) name;
-
- while (len--) {
- hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
-
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
- hash1 = hash0;
- hash0 = hash;
- }
- return hash0 << 1;
-}
-
-static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
-{
- __u32 pad, val;
- int i;
- const signed char *scp = (const signed char *) msg;
-
- pad = (__u32)len | ((__u32)len << 8);
- pad |= pad << 16;
-
- val = pad;
- if (len > num*4)
- len = num * 4;
- for (i = 0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
- }
- if (--num >= 0)
- *buf++ = val;
- while (--num >= 0)
- *buf++ = pad;
-}
-
-static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
-{
- __u32 pad, val;
- int i;
- const unsigned char *ucp = (const unsigned char *) msg;
-
- pad = (__u32)len | ((__u32)len << 8);
- pad |= pad << 16;
-
- val = pad;
- if (len > num*4)
- len = num * 4;
- for (i=0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
- val = ((int) ucp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
- }
- if (--num >= 0)
- *buf++ = val;
- while (--num >= 0)
- *buf++ = pad;
-}
-
-/*
- * Returns the hash of a filename. If len is 0 and name is NULL, then
- * this function can be used to test whether or not a hash version is
- * supported.
- *
- * The seed is an 4 longword (32 bits) "secret" which can be used to
- * uniquify a hash. If the seed is all zero's, then some default seed
- * may be used.
- *
- * A particular hash version specifies whether or not the seed is
- * represented, and whether or not the returned hash is 32 bits or 64
- * bits. 32 bit hashes will return 0 for the minor hash.
- */
-int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
-{
- __u32 hash;
- __u32 minor_hash = 0;
- const char *p;
- int i;
- __u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;
-
- /* Initialize the default seed for the hash checksum functions */
- buf[0] = 0x67452301;
- buf[1] = 0xefcdab89;
- buf[2] = 0x98badcfe;
- buf[3] = 0x10325476;
-
- /* Check to see if the seed is all zero's */
- if (hinfo->seed) {
- for (i=0; i < 4; i++) {
- if (hinfo->seed[i])
- break;
- }
- if (i < 4)
- memcpy(buf, hinfo->seed, sizeof(buf));
- }
-
- switch (hinfo->hash_version) {
- case DX_HASH_LEGACY_UNSIGNED:
- hash = dx_hack_hash_unsigned(name, len);
- break;
- case DX_HASH_LEGACY:
- hash = dx_hack_hash_signed(name, len);
- break;
- case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
- case DX_HASH_HALF_MD4:
- p = name;
- while (len > 0) {
- (*str2hashbuf)(p, len, in, 8);
- half_md4_transform(buf, in);
- len -= 32;
- p += 32;
- }
- minor_hash = buf[2];
- hash = buf[1];
- break;
- case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
- case DX_HASH_TEA:
- p = name;
- while (len > 0) {
- (*str2hashbuf)(p, len, in, 4);
- TEA_transform(buf, in);
- len -= 16;
- p += 16;
- }
- hash = buf[0];
- minor_hash = buf[1];
- break;
- default:
- hinfo->hash = 0;
- return -1;
- }
- hash = hash & ~1;
- if (hash == (EXT3_HTREE_EOF_32BIT << 1))
- hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
- hinfo->hash = hash;
- hinfo->minor_hash = minor_hash;
- return 0;
-}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
deleted file mode 100644
index 3ad242e..0000000
--- a/fs/ext3/ialloc.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*
- * linux/fs/ext3/ialloc.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * BSD ufs-inspired inode and directory allocation by
- * Stephen Tweedie (sct@redhat.com), 1993
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-#include <linux/quotaops.h>
-#include <linux/random.h>
-
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * ialloc.c contains the inodes allocation and deallocation routines
- */
-
-/*
- * The free inodes are managed by bitmaps. A file system contains several
- * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
- * block for inodes, N blocks for the inode table and data blocks.
- *
- * The file system contains group descriptors which are located after the
- * super block. Each descriptor contains the number of the bitmap block and
- * the free blocks count in the block.
- */
-
-
-/*
- * Read the inode allocation bitmap for a given block_group, reading
- * into the specified slot in the superblock's bitmap cache.
- *
- * Return buffer_head of bitmap on success or NULL.
- */
-static struct buffer_head *
-read_inode_bitmap(struct super_block * sb, unsigned long block_group)
-{
- struct ext3_group_desc *desc;
- struct buffer_head *bh = NULL;
-
- desc = ext3_get_group_desc(sb, block_group, NULL);
- if (!desc)
- goto error_out;
-
- bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
- if (!bh)
- ext3_error(sb, "read_inode_bitmap",
- "Cannot read inode bitmap - "
- "block_group = %lu, inode_bitmap = %u",
- block_group, le32_to_cpu(desc->bg_inode_bitmap));
-error_out:
- return bh;
-}
-
-/*
- * NOTE! When we get the inode, we're the only people
- * that have access to it, and as such there are no
- * race conditions we have to worry about. The inode
- * is not on the hash-lists, and it cannot be reached
- * through the filesystem because the directory entry
- * has been deleted earlier.
- *
- * HOWEVER: we must make sure that we get no aliases,
- * which means that we have to call "clear_inode()"
- * _before_ we mark the inode not in use in the inode
- * bitmaps. Otherwise a newly created file might use
- * the same inode number (not actually the same pointer
- * though), and then we'd have two inodes sharing the
- * same inode number and space on the harddisk.
- */
-void ext3_free_inode (handle_t *handle, struct inode * inode)
-{
- struct super_block * sb = inode->i_sb;
- int is_directory;
- unsigned long ino;
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *bh2;
- unsigned long block_group;
- unsigned long bit;
- struct ext3_group_desc * gdp;
- struct ext3_super_block * es;
- struct ext3_sb_info *sbi;
- int fatal = 0, err;
-
- if (atomic_read(&inode->i_count) > 1) {
- printk ("ext3_free_inode: inode has count=%d\n",
- atomic_read(&inode->i_count));
- return;
- }
- if (inode->i_nlink) {
- printk ("ext3_free_inode: inode has nlink=%d\n",
- inode->i_nlink);
- return;
- }
- if (!sb) {
- printk("ext3_free_inode: inode on nonexistent device\n");
- return;
- }
- sbi = EXT3_SB(sb);
-
- ino = inode->i_ino;
- ext3_debug ("freeing inode %lu\n", ino);
- trace_ext3_free_inode(inode);
-
- is_directory = S_ISDIR(inode->i_mode);
-
- es = EXT3_SB(sb)->s_es;
- if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
- ext3_error (sb, "ext3_free_inode",
- "reserved or nonexistent inode %lu", ino);
- goto error_return;
- }
- block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
- bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
- bitmap_bh = read_inode_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
-
- BUFFER_TRACE(bitmap_bh, "get_write_access");
- fatal = ext3_journal_get_write_access(handle, bitmap_bh);
- if (fatal)
- goto error_return;
-
- /* Ok, now we can actually update the inode bitmaps.. */
- if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit, bitmap_bh->b_data))
- ext3_error (sb, "ext3_free_inode",
- "bit already cleared for inode %lu", ino);
- else {
- gdp = ext3_get_group_desc (sb, block_group, &bh2);
-
- BUFFER_TRACE(bh2, "get_write_access");
- fatal = ext3_journal_get_write_access(handle, bh2);
- if (fatal) goto error_return;
-
- if (gdp) {
- spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_inodes_count, 1);
- if (is_directory)
- le16_add_cpu(&gdp->bg_used_dirs_count, -1);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_inc(&sbi->s_freeinodes_counter);
- if (is_directory)
- percpu_counter_dec(&sbi->s_dirs_counter);
-
- }
- BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh2);
- if (!fatal) fatal = err;
- }
- BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bitmap_bh);
- if (!fatal)
- fatal = err;
-
-error_return:
- brelse(bitmap_bh);
- ext3_std_error(sb, fatal);
-}
-
-/*
- * Orlov's allocator for directories.
- *
- * We always try to spread first-level directories.
- *
- * If there are blockgroups with both free inodes and free blocks counts
- * not worse than average we return one with smallest directory count.
- * Otherwise we simply return a random group.
- *
- * For the rest rules look so:
- *
- * It's OK to put directory into a group unless
- * it has too many directories already (max_dirs) or
- * it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks).
- * Parent's group is preferred, if it doesn't satisfy these
- * conditions we search cyclically through the rest. If none
- * of the groups look good we just look for a group with more
- * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
- */
-
-static int find_group_orlov(struct super_block *sb, struct inode *parent)
-{
- int parent_group = EXT3_I(parent)->i_block_group;
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- int ngroups = sbi->s_groups_count;
- int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
- unsigned int freei, avefreei;
- ext3_fsblk_t freeb, avefreeb;
- unsigned int ndirs;
- int max_dirs, min_inodes;
- ext3_grpblk_t min_blocks;
- int group = -1, i;
- struct ext3_group_desc *desc;
-
- freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
- avefreei = freei / ngroups;
- freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- avefreeb = freeb / ngroups;
- ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
-
- if ((parent == d_inode(sb->s_root)) ||
- (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
- int best_ndir = inodes_per_group;
- int best_group = -1;
-
- group = prandom_u32();
- parent_group = (unsigned)group % ngroups;
- for (i = 0; i < ngroups; i++) {
- group = (parent_group + i) % ngroups;
- desc = ext3_get_group_desc (sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
- continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
- continue;
- best_group = group;
- best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
- }
- if (best_group >= 0)
- return best_group;
- goto fallback;
- }
-
- max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
-
- for (i = 0; i < ngroups; i++) {
- group = (parent_group + i) % ngroups;
- desc = ext3_get_group_desc (sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
- continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
- continue;
- return group;
- }
-
-fallback:
- for (i = 0; i < ngroups; i++) {
- group = (parent_group + i) % ngroups;
- desc = ext3_get_group_desc (sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
- continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
- return group;
- }
-
- if (avefreei) {
- /*
- * The free-inodes counter is approximate, and for really small
- * filesystems the above test can fail to find any blockgroups
- */
- avefreei = 0;
- goto fallback;
- }
-
- return -1;
-}
-
-static int find_group_other(struct super_block *sb, struct inode *parent)
-{
- int parent_group = EXT3_I(parent)->i_block_group;
- int ngroups = EXT3_SB(sb)->s_groups_count;
- struct ext3_group_desc *desc;
- int group, i;
-
- /*
- * Try to place the inode in its parent directory
- */
- group = parent_group;
- desc = ext3_get_group_desc (sb, group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
- return group;
-
- /*
- * We're going to place this inode in a different blockgroup from its
- * parent. We want to cause files in a common directory to all land in
- * the same blockgroup. But we want files which are in a different
- * directory which shares a blockgroup with our parent to land in a
- * different blockgroup.
- *
- * So add our directory's i_ino into the starting point for the hash.
- */
- group = (group + parent->i_ino) % ngroups;
-
- /*
- * Use a quadratic hash to find a group with a free inode and some free
- * blocks.
- */
- for (i = 1; i < ngroups; i <<= 1) {
- group += i;
- if (group >= ngroups)
- group -= ngroups;
- desc = ext3_get_group_desc (sb, group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
- return group;
- }
-
- /*
- * That failed: try linear search for a free inode, even if that group
- * has no free blocks.
- */
- group = parent_group;
- for (i = 0; i < ngroups; i++) {
- if (++group >= ngroups)
- group = 0;
- desc = ext3_get_group_desc (sb, group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count))
- return group;
- }
-
- return -1;
-}
-
-/*
- * There are two policies for allocating an inode. If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory's block
- * group to find a free inode.
- */
-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
- const struct qstr *qstr, umode_t mode)
-{
- struct super_block *sb;
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *bh2;
- int group;
- unsigned long ino = 0;
- struct inode * inode;
- struct ext3_group_desc * gdp = NULL;
- struct ext3_super_block * es;
- struct ext3_inode_info *ei;
- struct ext3_sb_info *sbi;
- int err = 0;
- struct inode *ret;
- int i;
-
- /* Cannot create files in a deleted directory */
- if (!dir || !dir->i_nlink)
- return ERR_PTR(-EPERM);
-
- sb = dir->i_sb;
- trace_ext3_request_inode(dir, mode);
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- ei = EXT3_I(inode);
-
- sbi = EXT3_SB(sb);
- es = sbi->s_es;
- if (S_ISDIR(mode))
- group = find_group_orlov(sb, dir);
- else
- group = find_group_other(sb, dir);
-
- err = -ENOSPC;
- if (group == -1)
- goto out;
-
- for (i = 0; i < sbi->s_groups_count; i++) {
- err = -EIO;
-
- gdp = ext3_get_group_desc(sb, group, &bh2);
- if (!gdp)
- goto fail;
-
- brelse(bitmap_bh);
- bitmap_bh = read_inode_bitmap(sb, group);
- if (!bitmap_bh)
- goto fail;
-
- ino = 0;
-
-repeat_in_this_group:
- ino = ext3_find_next_zero_bit((unsigned long *)
- bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
- if (ino < EXT3_INODES_PER_GROUP(sb)) {
-
- BUFFER_TRACE(bitmap_bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, bitmap_bh);
- if (err)
- goto fail;
-
- if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
- ino, bitmap_bh->b_data)) {
- /* we won it */
- BUFFER_TRACE(bitmap_bh,
- "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle,
- bitmap_bh);
- if (err)
- goto fail;
- goto got;
- }
- /* we lost it */
- journal_release_buffer(handle, bitmap_bh);
-
- if (++ino < EXT3_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
- }
-
- /*
- * This case is possible in concurrent environment. It is very
- * rare. We cannot repeat the find_group_xxx() call because
- * that will simply return the same blockgroup, because the
- * group descriptor metadata has not yet been updated.
- * So we just go onto the next blockgroup.
- */
- if (++group == sbi->s_groups_count)
- group = 0;
- }
- err = -ENOSPC;
- goto out;
-
-got:
- ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
- if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
- ext3_error (sb, "ext3_new_inode",
- "reserved inode or inode > inodes count - "
- "block_group = %d, inode=%lu", group, ino);
- err = -EIO;
- goto fail;
- }
-
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext3_journal_get_write_access(handle, bh2);
- if (err) goto fail;
- spin_lock(sb_bgl_lock(sbi, group));
- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
- if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
- }
- spin_unlock(sb_bgl_lock(sbi, group));
- BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh2);
- if (err) goto fail;
-
- percpu_counter_dec(&sbi->s_freeinodes_counter);
- if (S_ISDIR(mode))
- percpu_counter_inc(&sbi->s_dirs_counter);
-
-
- if (test_opt(sb, GRPID)) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = dir->i_gid;
- } else
- inode_init_owner(inode, dir, mode);
-
- inode->i_ino = ino;
- /* This is the optimal IO size (for stat), not the fs block size */
- inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-
- memset(ei->i_data, 0, sizeof(ei->i_data));
- ei->i_dir_start_lookup = 0;
- ei->i_disksize = 0;
-
- ei->i_flags =
- ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
-#ifdef EXT3_FRAGMENTS
- ei->i_faddr = 0;
- ei->i_frag_no = 0;
- ei->i_frag_size = 0;
-#endif
- ei->i_file_acl = 0;
- ei->i_dir_acl = 0;
- ei->i_dtime = 0;
- ei->i_block_alloc_info = NULL;
- ei->i_block_group = group;
-
- ext3_set_inode_flags(inode);
- if (IS_DIRSYNC(inode))
- handle->h_sync = 1;
- if (insert_inode_locked(inode) < 0) {
- /*
- * Likely a bitmap corruption causing inode to be allocated
- * twice.
- */
- err = -EIO;
- goto fail;
- }
- spin_lock(&sbi->s_next_gen_lock);
- inode->i_generation = sbi->s_next_generation++;
- spin_unlock(&sbi->s_next_gen_lock);
-
- ei->i_state_flags = 0;
- ext3_set_inode_state(inode, EXT3_STATE_NEW);
-
- /* See comment in ext3_iget for explanation */
- if (ino >= EXT3_FIRST_INO(sb) + 1 &&
- EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
- ei->i_extra_isize =
- sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
- } else {
- ei->i_extra_isize = 0;
- }
-
- ret = inode;
- dquot_initialize(inode);
- err = dquot_alloc_inode(inode);
- if (err)
- goto fail_drop;
-
- err = ext3_init_acl(handle, inode, dir);
- if (err)
- goto fail_free_drop;
-
- err = ext3_init_security(handle, inode, dir, qstr);
- if (err)
- goto fail_free_drop;
-
- err = ext3_mark_inode_dirty(handle, inode);
- if (err) {
- ext3_std_error(sb, err);
- goto fail_free_drop;
- }
-
- ext3_debug("allocating inode %lu\n", inode->i_ino);
- trace_ext3_allocate_inode(inode, dir, mode);
- goto really_out;
-fail:
- ext3_std_error(sb, err);
-out:
- iput(inode);
- ret = ERR_PTR(err);
-really_out:
- brelse(bitmap_bh);
- return ret;
-
-fail_free_drop:
- dquot_free_inode(inode);
-
-fail_drop:
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
- clear_nlink(inode);
- unlock_new_inode(inode);
- iput(inode);
- brelse(bitmap_bh);
- return ERR_PTR(err);
-}
-
-/* Verify that we are loading a valid orphan from disk */
-struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
-{
- unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
- unsigned long block_group;
- int bit;
- struct buffer_head *bitmap_bh;
- struct inode *inode = NULL;
- long err = -EIO;
-
- /* Error cases - e2fsck has already cleaned up for us */
- if (ino > max_ino) {
- ext3_warning(sb, __func__,
- "bad orphan ino %lu! e2fsck was run?", ino);
- goto error;
- }
-
- block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
- bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
- bitmap_bh = read_inode_bitmap(sb, block_group);
- if (!bitmap_bh) {
- ext3_warning(sb, __func__,
- "inode bitmap error for orphan %lu", ino);
- goto error;
- }
-
- /* Having the inode bit set should be a 100% indicator that this
- * is a valid orphan (no e2fsck run on fs). Orphans also include
- * inodes that were being truncated, so we can't check i_nlink==0.
- */
- if (!ext3_test_bit(bit, bitmap_bh->b_data))
- goto bad_orphan;
-
- inode = ext3_iget(sb, ino);
- if (IS_ERR(inode))
- goto iget_failed;
-
- /*
- * If the orphans has i_nlinks > 0 then it should be able to be
- * truncated, otherwise it won't be removed from the orphan list
- * during processing and an infinite loop will result.
- */
- if (inode->i_nlink && !ext3_can_truncate(inode))
- goto bad_orphan;
-
- if (NEXT_ORPHAN(inode) > max_ino)
- goto bad_orphan;
- brelse(bitmap_bh);
- return inode;
-
-iget_failed:
- err = PTR_ERR(inode);
- inode = NULL;
-bad_orphan:
- ext3_warning(sb, __func__,
- "bad orphan inode %lu! e2fsck was run?", ino);
- printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
- bit, (unsigned long long)bitmap_bh->b_blocknr,
- ext3_test_bit(bit, bitmap_bh->b_data));
- printk(KERN_NOTICE "inode=%p\n", inode);
- if (inode) {
- printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
- is_bad_inode(inode));
- printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
- NEXT_ORPHAN(inode));
- printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
- printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
- /* Avoid freeing blocks if we got a bad deleted inode */
- if (inode->i_nlink == 0)
- inode->i_blocks = 0;
- iput(inode);
- }
- brelse(bitmap_bh);
-error:
- return ERR_PTR(err);
-}
-
-unsigned long ext3_count_free_inodes (struct super_block * sb)
-{
- unsigned long desc_count;
- struct ext3_group_desc *gdp;
- int i;
-#ifdef EXT3FS_DEBUG
- struct ext3_super_block *es;
- unsigned long bitmap_count, x;
- struct buffer_head *bitmap_bh = NULL;
-
- es = EXT3_SB(sb)->s_es;
- desc_count = 0;
- bitmap_count = 0;
- gdp = NULL;
- for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
- gdp = ext3_get_group_desc (sb, i, NULL);
- if (!gdp)
- continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
- brelse(bitmap_bh);
- bitmap_bh = read_inode_bitmap(sb, i);
- if (!bitmap_bh)
- continue;
-
- x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
- printk("group %d: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_inodes_count), x);
- bitmap_count += x;
- }
- brelse(bitmap_bh);
- printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
- le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
- return desc_count;
-#else
- desc_count = 0;
- for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
- gdp = ext3_get_group_desc (sb, i, NULL);
- if (!gdp)
- continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
- cond_resched();
- }
- return desc_count;
-#endif
-}
-
-/* Called at mount-time, super-block is locked */
-unsigned long ext3_count_dirs (struct super_block * sb)
-{
- unsigned long count = 0;
- int i;
-
- for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
- struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
- if (!gdp)
- continue;
- count += le16_to_cpu(gdp->bg_used_dirs_count);
- }
- return count;
-}
-
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
deleted file mode 100644
index 6c7e546..0000000
--- a/fs/ext3/inode.c
+++ /dev/null
@@ -1,3574 +0,0 @@
-/*
- * linux/fs/ext3/inode.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/inode.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Goal-directed block allocation by Stephen Tweedie
- * (sct@redhat.com), 1993, 1998
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- * 64-bit file support on 64-bit platforms by Jakub Jelinek
- * (jj@sunsite.ms.mff.cuni.cz)
- *
- * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
- */
-
-#include <linux/highuid.h>
-#include <linux/quotaops.h>
-#include <linux/writeback.h>
-#include <linux/mpage.h>
-#include <linux/namei.h>
-#include <linux/uio.h>
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-
-static int ext3_writepage_trans_blocks(struct inode *inode);
-static int ext3_block_truncate_page(struct inode *inode, loff_t from);
-
-/*
- * Test whether an inode is a fast symlink.
- */
-static int ext3_inode_is_fast_symlink(struct inode *inode)
-{
- int ea_blocks = EXT3_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
-
- return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
-}
-
-/*
- * The ext3 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- */
-int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext3_fsblk_t blocknr)
-{
- int err;
-
- might_sleep();
-
- trace_ext3_forget(inode, is_metadata, blocknr);
- BUFFER_TRACE(bh, "enter");
-
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %lx\n",
- bh, is_metadata, inode->i_mode,
- test_opt(inode->i_sb, DATA_FLAGS));
-
- /* Never use the revoke function if we are doing full data
- * journaling: there is no need to, and a V1 superblock won't
- * support it. Otherwise, only skip the revoke on un-journaled
- * data blocks. */
-
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
- (!is_metadata && !ext3_should_journal_data(inode))) {
- if (bh) {
- BUFFER_TRACE(bh, "call journal_forget");
- return ext3_journal_forget(handle, bh);
- }
- return 0;
- }
-
- /*
- * data!=journal && (is_metadata || should_journal_data(inode))
- */
- BUFFER_TRACE(bh, "call ext3_journal_revoke");
- err = ext3_journal_revoke(handle, blocknr, bh);
- if (err)
- ext3_abort(inode->i_sb, __func__,
- "error %d when attempting revoke", err);
- BUFFER_TRACE(bh, "exit");
- return err;
-}
-
-/*
- * Work out how many blocks we need to proceed with the next chunk of a
- * truncate transaction.
- */
-static unsigned long blocks_for_truncate(struct inode *inode)
-{
- unsigned long needed;
-
- needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
-
- /* Give ourselves just enough room to cope with inodes in which
- * i_blocks is corrupt: we've seen disk corruptions in the past
- * which resulted in random data in an inode which looked enough
- * like a regular file for ext3 to try to delete it. Things
- * will go a bit crazy if that happens, but at least we should
- * try not to panic the whole kernel. */
- if (needed < 2)
- needed = 2;
-
- /* But we need to bound the transaction so we don't overflow the
- * journal. */
- if (needed > EXT3_MAX_TRANS_DATA)
- needed = EXT3_MAX_TRANS_DATA;
-
- return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
-}
-
-/*
- * Truncate transactions can be complex and absolutely huge. So we need to
- * be able to restart the transaction at a conventient checkpoint to make
- * sure we don't overflow the journal.
- *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
- handle_t *result;
-
- result = ext3_journal_start(inode, blocks_for_truncate(inode));
- if (!IS_ERR(result))
- return result;
-
- ext3_std_error(inode->i_sb, PTR_ERR(result));
- return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
- */
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
-{
- if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
- return 0;
- if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
- return 0;
- return 1;
-}
-
-/*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
-{
- int ret;
-
- jbd_debug(2, "restarting handle %p\n", handle);
- /*
- * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
- * At this moment, get_block can be called only for blocks inside
- * i_size since page cache has been already dropped and writes are
- * blocked by i_mutex. So we can safely drop the truncate_mutex.
- */
- mutex_unlock(&EXT3_I(inode)->truncate_mutex);
- ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
- mutex_lock(&EXT3_I(inode)->truncate_mutex);
- return ret;
-}
-
-/*
- * Called at inode eviction from icache
- */
-void ext3_evict_inode (struct inode *inode)
-{
- struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_block_alloc_info *rsv;
- handle_t *handle;
- int want_delete = 0;
-
- trace_ext3_evict_inode(inode);
- if (!inode->i_nlink && !is_bad_inode(inode)) {
- dquot_initialize(inode);
- want_delete = 1;
- }
-
- /*
- * When journalling data dirty buffers are tracked only in the journal.
- * So although mm thinks everything is clean and ready for reaping the
- * inode might still have some pages to write in the running
- * transaction or waiting to be checkpointed. Thus calling
- * journal_invalidatepage() (via truncate_inode_pages()) to discard
- * these buffers can cause data loss. Also even if we did not discard
- * these buffers, we would have no way to find them after the inode
- * is reaped and thus user could see stale data if he tries to read
- * them before the transaction is checkpointed. So be careful and
- * force everything to disk here... We use ei->i_datasync_tid to
- * store the newest transaction containing inode's data.
- *
- * Note that directories do not have this problem because they don't
- * use page cache.
- *
- * The s_journal check handles the case when ext3_get_journal() fails
- * and puts the journal inode.
- */
- if (inode->i_nlink && ext3_should_journal_data(inode) &&
- EXT3_SB(inode->i_sb)->s_journal &&
- (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
- inode->i_ino != EXT3_JOURNAL_INO) {
- tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
- journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
-
- log_start_commit(journal, commit_tid);
- log_wait_commit(journal, commit_tid);
- filemap_write_and_wait(&inode->i_data);
- }
- truncate_inode_pages_final(&inode->i_data);
-
- ext3_discard_reservation(inode);
- rsv = ei->i_block_alloc_info;
- ei->i_block_alloc_info = NULL;
- if (unlikely(rsv))
- kfree(rsv);
-
- if (!want_delete)
- goto no_delete;
-
- handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- /*
- * If we're going to skip the normal cleanup, we still need to
- * make sure that the in-core orphan linked list is properly
- * cleaned up.
- */
- ext3_orphan_del(NULL, inode);
- goto no_delete;
- }
-
- if (IS_SYNC(inode))
- handle->h_sync = 1;
- inode->i_size = 0;
- if (inode->i_blocks)
- ext3_truncate(inode);
- /*
- * Kill off the orphan record created when the inode lost the last
- * link. Note that ext3_orphan_del() has to be able to cope with the
- * deletion of a non-existent orphan - ext3_truncate() could
- * have removed the record.
- */
- ext3_orphan_del(handle, inode);
- ei->i_dtime = get_seconds();
-
- /*
- * One subtle ordering requirement: if anything has gone wrong
- * (transaction abort, IO errors, whatever), then we can still
- * do these next steps (the fs will already have been marked as
- * having errors), but we can't free the inode if the mark_dirty
- * fails.
- */
- if (ext3_mark_inode_dirty(handle, inode)) {
- /* If that failed, just dquot_drop() and be done with that */
- dquot_drop(inode);
- clear_inode(inode);
- } else {
- ext3_xattr_delete_inode(handle, inode);
- dquot_free_inode(inode);
- dquot_drop(inode);
- clear_inode(inode);
- ext3_free_inode(handle, inode);
- }
- ext3_journal_stop(handle);
- return;
-no_delete:
- clear_inode(inode);
- dquot_drop(inode);
-}
-
-typedef struct {
- __le32 *p;
- __le32 key;
- struct buffer_head *bh;
-} Indirect;
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
-{
- p->key = *(p->p = v);
- p->bh = bh;
-}
-
-static int verify_chain(Indirect *from, Indirect *to)
-{
- while (from <= to && from->key == *from->p)
- from++;
- return (from > to);
-}
-
-/**
- * ext3_block_to_path - parse the block number into array of offsets
- * @inode: inode in question (we are only interested in its superblock)
- * @i_block: block number to be parsed
- * @offsets: array to store the offsets in
- * @boundary: set this non-zero if the referred-to block is likely to be
- * followed (on disk) by an indirect block.
- *
- * To store the locations of file's data ext3 uses a data structure common
- * for UNIX filesystems - tree of pointers anchored in the inode, with
- * data blocks at leaves and indirect blocks in intermediate nodes.
- * This function translates the block number into path in that tree -
- * return value is the path length and @offsets[n] is the offset of
- * pointer to (n+1)th node in the nth one. If @block is out of range
- * (negative or too large) warning is printed and zero returned.
- *
- * Note: function doesn't find node addresses, so no IO is needed. All
- * we need to know is the capacity of indirect blocks (taken from the
- * inode->i_sb).
- */
-
-/*
- * Portability note: the last comparison (check that we fit into triple
- * indirect block) is spelled differently, because otherwise on an
- * architecture with 32-bit longs and 8Kb pages we might get into trouble
- * if our filesystem had 8Kb blocks. We might use long long, but that would
- * kill us on x86. Oh, well, at least the sign propagation does not matter -
- * i_block would have to be negative in the very beginning, so we would not
- * get there at all.
- */
-
-static int ext3_block_to_path(struct inode *inode,
- long i_block, int offsets[4], int *boundary)
-{
- int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
- int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
- const long direct_blocks = EXT3_NDIR_BLOCKS,
- indirect_blocks = ptrs,
- double_blocks = (1 << (ptrs_bits * 2));
- int n = 0;
- int final = 0;
-
- if (i_block < 0) {
- ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
- offsets[n++] = i_block;
- final = direct_blocks;
- } else if ( (i_block -= direct_blocks) < indirect_blocks) {
- offsets[n++] = EXT3_IND_BLOCK;
- offsets[n++] = i_block;
- final = ptrs;
- } else if ((i_block -= indirect_blocks) < double_blocks) {
- offsets[n++] = EXT3_DIND_BLOCK;
- offsets[n++] = i_block >> ptrs_bits;
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
- offsets[n++] = EXT3_TIND_BLOCK;
- offsets[n++] = i_block >> (ptrs_bits * 2);
- offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
- offsets[n++] = i_block & (ptrs - 1);
- final = ptrs;
- } else {
- ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
- }
- if (boundary)
- *boundary = final - 1 - (i_block & (ptrs - 1));
- return n;
-}
-
-/**
- * ext3_get_branch - read the chain of indirect blocks leading to data
- * @inode: inode in question
- * @depth: depth of the chain (1 - direct pointer, etc.)
- * @offsets: offsets of pointers in inode/indirect blocks
- * @chain: place to store the result
- * @err: here we store the error value
- *
- * Function fills the array of triples <key, p, bh> and returns %NULL
- * if everything went OK or the pointer to the last filled triple
- * (incomplete one) otherwise. Upon the return chain[i].key contains
- * the number of (i+1)-th block in the chain (as it is stored in memory,
- * i.e. little-endian 32-bit), chain[i].p contains the address of that
- * number (it points into struct inode for i==0 and into the bh->b_data
- * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
- * block for i>0 and NULL for i==0. In other words, it holds the block
- * numbers of the chain, addresses they were taken from (and where we can
- * verify that chain did not change) and buffer_heads hosting these
- * numbers.
- *
- * Function stops when it stumbles upon zero pointer (absent block)
- * (pointer to last triple returned, *@err == 0)
- * or when it gets an IO error reading an indirect block
- * (ditto, *@err == -EIO)
- * or when it notices that chain had been changed while it was reading
- * (ditto, *@err == -EAGAIN)
- * or when it reads all @depth-1 indirect blocks successfully and finds
- * the whole chain, all way to the data (returns %NULL, *err == 0).
- */
-static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
- Indirect chain[4], int *err)
-{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
-
- *err = 0;
- /* i_data is not going away, no lock needed */
- add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
- goto failure;
- /* Reader: pointers */
- if (!verify_chain(chain, p))
- goto changed;
- add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
- /* Reader: end */
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-changed:
- brelse(bh);
- *err = -EAGAIN;
- goto no_block;
-failure:
- *err = -EIO;
-no_block:
- return p;
-}
-
-/**
- * ext3_find_near - find a place for allocation with sufficient locality
- * @inode: owner
- * @ind: descriptor of indirect block.
- *
- * This function returns the preferred place for block allocation.
- * It is used when heuristic for sequential allocation fails.
- * Rules are:
- * + if there is a block to the left of our position - allocate near it.
- * + if pointer will live in indirect block - allocate near that block.
- * + if pointer will live in inode - allocate in the same
- * cylinder group.
- *
- * In the latter case we colour the starting block by the callers PID to
- * prevent it from clashing with concurrent allocations for a different inode
- * in the same block group. The PID is used here so that functionally related
- * files will be close-by on-disk.
- *
- * Caller must make sure that @ind is valid and will stay that way.
- */
-static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
-{
- struct ext3_inode_info *ei = EXT3_I(inode);
- __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
- __le32 *p;
- ext3_fsblk_t bg_start;
- ext3_grpblk_t colour;
-
- /* Try to find previous block */
- for (p = ind->p - 1; p >= start; p--) {
- if (*p)
- return le32_to_cpu(*p);
- }
-
- /* No such thing, so let's try location of indirect block */
- if (ind->bh)
- return ind->bh->b_blocknr;
-
- /*
- * It is going to be referred to from the inode itself? OK, just put it
- * into the same cylinder group then.
- */
- bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
- colour = (current->pid % 16) *
- (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
- return bg_start + colour;
-}
-
-/**
- * ext3_find_goal - find a preferred place for allocation.
- * @inode: owner
- * @block: block we want
- * @partial: pointer to the last triple within a chain
- *
- * Normally this function find the preferred place for block allocation,
- * returns it.
- */
-
-static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
- Indirect *partial)
-{
- struct ext3_block_alloc_info *block_i;
-
- block_i = EXT3_I(inode)->i_block_alloc_info;
-
- /*
- * try the heuristic for sequential allocation,
- * failing that at least try to get decent locality.
- */
- if (block_i && (block == block_i->last_alloc_logical_block + 1)
- && (block_i->last_alloc_physical_block != 0)) {
- return block_i->last_alloc_physical_block + 1;
- }
-
- return ext3_find_near(inode, partial);
-}
-
-/**
- * ext3_blks_to_allocate - Look up the block map and count the number
- * of direct blocks need to be allocated for the given branch.
- *
- * @branch: chain of indirect blocks
- * @k: number of blocks need for indirect blocks
- * @blks: number of data blocks to be mapped.
- * @blocks_to_boundary: the offset in the indirect block
- *
- * return the total number of blocks to be allocate, including the
- * direct and indirect blocks.
- */
-static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
- int blocks_to_boundary)
-{
- unsigned long count = 0;
-
- /*
- * Simple case, [t,d]Indirect block(s) has not allocated yet
- * then it's clear blocks on that path have not allocated
- */
- if (k > 0) {
- /* right now we don't handle cross boundary allocation */
- if (blks < blocks_to_boundary + 1)
- count += blks;
- else
- count += blocks_to_boundary + 1;
- return count;
- }
-
- count++;
- while (count < blks && count <= blocks_to_boundary &&
- le32_to_cpu(*(branch[0].p + count)) == 0) {
- count++;
- }
- return count;
-}
-
-/**
- * ext3_alloc_blocks - multiple allocate blocks needed for a branch
- * @handle: handle for this transaction
- * @inode: owner
- * @goal: preferred place for allocation
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- * @blks: number of blocks need to allocated for direct blocks
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
- * @err: here we store the error value
- *
- * return the number of direct blocks allocated
- */
-static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
- ext3_fsblk_t goal, int indirect_blks, int blks,
- ext3_fsblk_t new_blocks[4], int *err)
-{
- int target, i;
- unsigned long count = 0;
- int index = 0;
- ext3_fsblk_t current_block = 0;
- int ret = 0;
-
- /*
- * Here we try to allocate the requested multiple blocks at once,
- * on a best-effort basis.
- * To build a branch, we should allocate blocks for
- * the indirect blocks(if not allocated yet), and at least
- * the first direct block of this branch. That's the
- * minimum number of blocks need to allocate(required)
- */
- target = blks + indirect_blks;
-
- while (1) {
- count = target;
- /* allocating blocks for indirect blocks and direct blocks */
- current_block = ext3_new_blocks(handle,inode,goal,&count,err);
- if (*err)
- goto failed_out;
-
- target -= count;
- /* allocate blocks for indirect blocks */
- while (index < indirect_blks && count) {
- new_blocks[index++] = current_block++;
- count--;
- }
-
- if (count > 0)
- break;
- }
-
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
- /* total number of blocks allocated for direct blocks */
- ret = count;
- *err = 0;
- return ret;
-failed_out:
- for (i = 0; i <index; i++)
- ext3_free_blocks(handle, inode, new_blocks[i], 1);
- return ret;
-}
-
-/**
- * ext3_alloc_branch - allocate and set up a chain of blocks.
- * @handle: handle for this transaction
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @goal: preferred place for allocation
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
- *
- * This function allocates blocks, zeroes out all but the last one,
- * links them into chain and (if we are synchronous) writes them to disk.
- * In other words, it prepares a branch that can be spliced onto the
- * inode. It stores the information about that chain in the branch[], in
- * the same format as ext3_get_branch() would do. We are calling it after
- * we had read the existing part of chain and partial points to the last
- * triple of that (one with zero ->key). Upon the exit we have the same
- * picture as after the successful ext3_get_block(), except that in one
- * place chain is disconnected - *branch->p is still zero (we did not
- * set the last link), but branch->key contains the number that should
- * be placed into *branch->p to fill that gap.
- *
- * If allocation fails we free all blocks we've allocated (and forget
- * their buffer_heads) and return the error value the from failed
- * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
- * as described above and return 0.
- */
-static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext3_fsblk_t goal,
- int *offsets, Indirect *branch)
-{
- int blocksize = inode->i_sb->s_blocksize;
- int i, n = 0;
- int err = 0;
- struct buffer_head *bh;
- int num;
- ext3_fsblk_t new_blocks[4];
- ext3_fsblk_t current_block;
-
- num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
- *blks, new_blocks, &err);
- if (err)
- return err;
-
- branch[0].key = cpu_to_le32(new_blocks[0]);
- /*
- * metadata blocks and data blocks are allocated.
- */
- for (n = 1; n <= indirect_blks; n++) {
- /*
- * Get buffer_head for parent block, zero it out
- * and set the pointer to new one, then send
- * parent to disk.
- */
- bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
- if (unlikely(!bh)) {
- err = -ENOMEM;
- goto failed;
- }
- branch[n].bh = bh;
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- err = ext3_journal_get_create_access(handle, bh);
- if (err) {
- unlock_buffer(bh);
- brelse(bh);
- goto failed;
- }
-
- memset(bh->b_data, 0, blocksize);
- branch[n].p = (__le32 *) bh->b_data + offsets[n];
- branch[n].key = cpu_to_le32(new_blocks[n]);
- *branch[n].p = branch[n].key;
- if ( n == indirect_blks) {
- current_block = new_blocks[n];
- /*
- * End of chain, update the last new metablock of
- * the chain to point to the new allocated
- * data blocks numbers
- */
- for (i=1; i < num; i++)
- *(branch[n].p + i) = cpu_to_le32(++current_block);
- }
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
-
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh);
- if (err)
- goto failed;
- }
- *blks = num;
- return err;
-failed:
- /* Allocation failed, free what we already allocated */
- for (i = 1; i <= n ; i++) {
- BUFFER_TRACE(branch[i].bh, "call journal_forget");
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < indirect_blks; i++)
- ext3_free_blocks(handle, inode, new_blocks[i], 1);
-
- ext3_free_blocks(handle, inode, new_blocks[i], num);
-
- return err;
-}
-
-/**
- * ext3_splice_branch - splice the allocated branch onto inode.
- * @handle: handle for this transaction
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @where: location of missing link
- * @num: number of indirect blocks we are adding
- * @blks: number of direct blocks we are adding
- *
- * This function fills the missing link and does all housekeeping needed in
- * inode (->i_blocks, etc.). In case of success we end up with the full
- * chain to new block and return 0.
- */
-static int ext3_splice_branch(handle_t *handle, struct inode *inode,
- long block, Indirect *where, int num, int blks)
-{
- int i;
- int err = 0;
- struct ext3_block_alloc_info *block_i;
- ext3_fsblk_t current_block;
- struct ext3_inode_info *ei = EXT3_I(inode);
- struct timespec now;
-
- block_i = ei->i_block_alloc_info;
- /*
- * If we're splicing into a [td]indirect block (as opposed to the
- * inode) then we need to get write access to the [td]indirect block
- * before the splice.
- */
- if (where->bh) {
- BUFFER_TRACE(where->bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, where->bh);
- if (err)
- goto err_out;
- }
- /* That's it */
-
- *where->p = where->key;
-
- /*
- * Update the host buffer_head or inode to point to more just allocated
- * direct blocks blocks
- */
- if (num == 0 && blks > 1) {
- current_block = le32_to_cpu(where->key) + 1;
- for (i = 1; i < blks; i++)
- *(where->p + i ) = cpu_to_le32(current_block++);
- }
-
- /*
- * update the most recently allocated logical & physical block
- * in i_block_alloc_info, to assist find the proper goal block for next
- * allocation
- */
- if (block_i) {
- block_i->last_alloc_logical_block = block + blks - 1;
- block_i->last_alloc_physical_block =
- le32_to_cpu(where[num].key) + blks - 1;
- }
-
- /* We are done with atomic stuff, now do the rest of housekeeping */
- now = CURRENT_TIME_SEC;
- if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
- inode->i_ctime = now;
- ext3_mark_inode_dirty(handle, inode);
- }
- /* ext3_mark_inode_dirty already updated i_sync_tid */
- atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
-
- /* had we spliced it onto indirect block? */
- if (where->bh) {
- /*
- * If we spliced it onto an indirect block, we haven't
- * altered the inode. Note however that if it is being spliced
- * onto an indirect block at the very end of the file (the
- * file is growing) then we *will* alter the inode to reflect
- * the new i_size. But that is not done here - it is done in
- * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
- */
- jbd_debug(5, "splicing indirect only\n");
- BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, where->bh);
- if (err)
- goto err_out;
- } else {
- /*
- * OK, we spliced it into the inode itself on a direct block.
- * Inode was dirtied above.
- */
- jbd_debug(5, "splicing direct\n");
- }
- return err;
-
-err_out:
- for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call journal_forget");
- ext3_journal_forget(handle, where[i].bh);
- ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
- }
- ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
-
- return err;
-}
-
-/*
- * Allocation strategy is simple: if we have to allocate something, we will
- * have to go the whole way to leaf. So let's do it before attaching anything
- * to tree, set linkage between the newborn blocks, write them if sync is
- * required, recheck the path, free and repeat if check fails, otherwise
- * set the last missing link (that will protect us from any truncate-generated
- * removals - all blocks on the path are immune now) and possibly force the
- * write on the parent block.
- * That has a nice additional property: no special recovery from the failed
- * allocations is needed - we simply release blocks and do not touch anything
- * reachable from inode.
- *
- * `handle' can be NULL if create == 0.
- *
- * The BKL may not be held on entry here. Be sure to take it early.
- * return > 0, # of blocks mapped or allocated.
- * return = 0, if plain lookup failed.
- * return < 0, error case.
- */
-int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
- sector_t iblock, unsigned long maxblocks,
- struct buffer_head *bh_result,
- int create)
-{
- int err = -EIO;
- int offsets[4];
- Indirect chain[4];
- Indirect *partial;
- ext3_fsblk_t goal;
- int indirect_blks;
- int blocks_to_boundary = 0;
- int depth;
- struct ext3_inode_info *ei = EXT3_I(inode);
- int count = 0;
- ext3_fsblk_t first_block = 0;
-
-
- trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
- J_ASSERT(handle != NULL || create == 0);
- depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
-
- if (depth == 0)
- goto out;
-
- partial = ext3_get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
- first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result);
- count++;
- /*map more blocks*/
- while (count < maxblocks && count <= blocks_to_boundary) {
- ext3_fsblk_t blk;
-
- if (!verify_chain(chain, chain + depth - 1)) {
- /*
- * Indirect block might be removed by
- * truncate while we were reading it.
- * Handling of that case: forget what we've
- * got now. Flag the err as EAGAIN, so it
- * will reread.
- */
- err = -EAGAIN;
- count = 0;
- break;
- }
- blk = le32_to_cpu(*(chain[depth-1].p + count));
-
- if (blk == first_block + count)
- count++;
- else
- break;
- }
- if (err != -EAGAIN)
- goto got_it;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO)
- goto cleanup;
-
- /*
- * Block out ext3_truncate while we alter the tree
- */
- mutex_lock(&ei->truncate_mutex);
-
- /*
- * If the indirect block is missing while we are reading
- * the chain(ext3_get_branch() returns -EAGAIN err), or
- * if the chain has been changed after we grab the semaphore,
- * (either because another process truncated this branch, or
- * another get_block allocated this branch) re-grab the chain to see if
- * the request block has been allocated or not.
- *
- * Since we already block the truncate/other get_block
- * at this point, we will have the current copy of the chain when we
- * splice the branch into the tree.
- */
- if (err == -EAGAIN || !verify_chain(chain, partial)) {
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
- partial = ext3_get_branch(inode, depth, offsets, chain, &err);
- if (!partial) {
- count++;
- mutex_unlock(&ei->truncate_mutex);
- if (err)
- goto cleanup;
- clear_buffer_new(bh_result);
- goto got_it;
- }
- }
-
- /*
- * Okay, we need to do block allocation. Lazily initialize the block
- * allocation info here if necessary
- */
- if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
- ext3_init_block_alloc_info(inode);
-
- goal = ext3_find_goal(inode, iblock, partial);
-
- /* the number of blocks need to allocate for [d,t]indirect blocks */
- indirect_blks = (chain + depth) - partial - 1;
-
- /*
- * Next look up the indirect map to count the totoal number of
- * direct blocks to allocate for this branch.
- */
- count = ext3_blks_to_allocate(partial, indirect_blks,
- maxblocks, blocks_to_boundary);
- err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
-
- /*
- * The ext3_splice_branch call will free and forget any buffers
- * on the new chain if there is a failure, but that risks using
- * up transaction credits, especially for bitmaps where the
- * credits cannot be returned. Can we handle this somehow? We
- * may need to return -EAGAIN upwards in the worst case. --sct
- */
- if (!err)
- err = ext3_splice_branch(handle, inode, iblock,
- partial, indirect_blks, count);
- mutex_unlock(&ei->truncate_mutex);
- if (err)
- goto cleanup;
-
- set_buffer_new(bh_result);
-got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
- if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
- err = count;
- /* Clean up and exit */
- partial = chain + depth - 1; /* the whole chain */
-cleanup:
- while (partial > chain) {
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- partial--;
- }
- BUFFER_TRACE(bh_result, "returned");
-out:
- trace_ext3_get_blocks_exit(inode, iblock,
- depth ? le32_to_cpu(chain[depth-1].key) : 0,
- count, err);
- return err;
-}
-
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
-/*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
- */
-#define DIO_CREDITS 25
-
-static int ext3_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- handle_t *handle = ext3_journal_current_handle();
- int ret = 0, started = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
- if (create && !handle) { /* Direct IO write... */
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- handle = ext3_journal_start(inode, DIO_CREDITS +
- EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- started = 1;
- }
-
- ret = ext3_get_blocks_handle(handle, inode, iblock,
- max_blocks, bh_result, create);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
- }
- if (started)
- ext3_journal_stop(handle);
-out:
- return ret;
-}
-
-int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext3_get_block);
-}
-
-/*
- * `handle' can be NULL if create is zero
- */
-struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
- long block, int create, int *errp)
-{
- struct buffer_head dummy;
- int fatal = 0, err;
-
- J_ASSERT(handle != NULL || create == 0);
-
- dummy.b_state = 0;
- dummy.b_blocknr = -1000;
- buffer_trace_init(&dummy.b_history);
- err = ext3_get_blocks_handle(handle, inode, block, 1,
- &dummy, create);
- /*
- * ext3_get_blocks_handle() returns number of blocks
- * mapped. 0 in case of a HOLE.
- */
- if (err > 0) {
- WARN_ON(err > 1);
- err = 0;
- }
- *errp = err;
- if (!err && buffer_mapped(&dummy)) {
- struct buffer_head *bh;
- bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (unlikely(!bh)) {
- *errp = -ENOMEM;
- goto err;
- }
- if (buffer_new(&dummy)) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
-
- /*
- * Now that we do not always journal data, we should
- * keep in mind whether this should always journal the
- * new buffer as metadata. For now, regular file
- * writes use ext3_get_block instead, so it's not a
- * problem.
- */
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- fatal = ext3_journal_get_create_access(handle, bh);
- if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data,0,inode->i_sb->s_blocksize);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh);
- if (!fatal)
- fatal = err;
- } else {
- BUFFER_TRACE(bh, "not a new buffer");
- }
- if (fatal) {
- *errp = fatal;
- brelse(bh);
- bh = NULL;
- }
- return bh;
- }
-err:
- return NULL;
-}
-
-struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
- int block, int create, int *err)
-{
- struct buffer_head * bh;
-
- bh = ext3_getblk(handle, inode, block, create, err);
- if (!bh)
- return bh;
- if (bh_uptodate_or_lock(bh))
- return bh;
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ | REQ_META | REQ_PRIO, bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- *err = -EIO;
- return NULL;
-}
-
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
-{
- struct buffer_head *bh;
- unsigned block_start, block_end;
- unsigned blocksize = head->b_size;
- int err, ret = 0;
- struct buffer_head *next;
-
- for ( bh = head, block_start = 0;
- ret == 0 && (bh != head || !block_start);
- block_start = block_end, bh = next)
- {
- next = bh->b_this_page;
- block_end = block_start + blocksize;
- if (block_end <= from || block_start >= to) {
- if (partial && !buffer_uptodate(bh))
- *partial = 1;
- continue;
- }
- err = (*fn)(handle, bh);
- if (!ret)
- ret = err;
- }
- return ret;
-}
-
-/*
- * To preserve ordering, it is essential that the hole instantiation and
- * the data write be encapsulated in a single transaction. We cannot
- * close off a transaction and start a new one between the ext3_get_block()
- * and the commit_write(). So doing the journal_start at the start of
- * prepare_write() is the right place.
- *
- * Also, this function can nest inside ext3_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext3_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
- *
- * By accident, ext3 can be reentered when a transaction is open via
- * quota file writes. If we were to commit the transaction while thus
- * reentered, there can be a deadlock - we would be holding a quota
- * lock, and the commit would never complete if another thread had a
- * transaction open and was blocking on the quota lock - a ranking
- * violation.
- *
- * So what we do is to rely on the fact that journal_stop/journal_start
- * will _not_ run commit under these circumstances because handle->h_ref
- * is elevated. We'll still have enough credits for the tiny quotafile
- * write.
- */
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
-{
- int dirty = buffer_dirty(bh);
- int ret;
-
- if (!buffer_mapped(bh) || buffer_freed(bh))
- return 0;
- /*
- * __block_prepare_write() could have dirtied some buffers. Clean
- * the dirty bit as jbd2_journal_get_write_access() could complain
- * otherwise about fs integrity issues. Setting of the dirty bit
- * by __block_prepare_write() isn't a real problem here as we clear
- * the bit before releasing a page lock and thus writeback cannot
- * ever write the buffer.
- */
- if (dirty)
- clear_buffer_dirty(bh);
- ret = ext3_journal_get_write_access(handle, bh);
- if (!ret && dirty)
- ret = ext3_journal_dirty_metadata(handle, bh);
- return ret;
-}
-
-/*
- * Truncate blocks that were not used by write. We have to truncate the
- * pagecache as well so that corresponding buffers get properly unmapped.
- */
-static void ext3_truncate_failed_write(struct inode *inode)
-{
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- ext3_truncate(inode);
-}
-
-/*
- * Truncate blocks that were not used by direct IO write. We have to zero out
- * the last file block as well because direct IO might have written to it.
- */
-static void ext3_truncate_failed_direct_write(struct inode *inode)
-{
- ext3_block_truncate_page(inode, inode->i_size);
- ext3_truncate(inode);
-}
-
-static int ext3_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- struct inode *inode = mapping->host;
- int ret;
- handle_t *handle;
- int retries = 0;
- struct page *page;
- pgoff_t index;
- unsigned from, to;
- /* Reserve one block more for addition to orphan list in case
- * we allocate blocks but write fails for some reason */
- int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
-
- trace_ext3_write_begin(inode, pos, len, flags);
-
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
-
-retry:
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
- handle = ext3_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = __block_write_begin(page, pos, len, ext3_get_block);
- if (ret)
- goto write_begin_failed;
-
- if (ext3_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
- }
-write_begin_failed:
- if (ret) {
- /*
- * block_write_begin may have instantiated a few blocks
- * outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
- *
- * Add inode to orphan list in case we crash before truncate
- * finishes. Do this only if ext3_can_truncate() agrees so
- * that orphan processing code is happy.
- */
- if (pos + len > inode->i_size && ext3_can_truncate(inode))
- ext3_orphan_add(handle, inode);
- ext3_journal_stop(handle);
- unlock_page(page);
- page_cache_release(page);
- if (pos + len > inode->i_size)
- ext3_truncate_failed_write(inode);
- }
- if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-out:
- return ret;
-}
-
-
-int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = journal_dirty_data(handle, bh);
- if (err)
- ext3_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
-/* For ordered writepage and write_end functions */
-static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- /*
- * Write could have mapped the buffer but it didn't copy the data in
- * yet. So avoid filing such buffer into a transaction.
- */
- if (buffer_mapped(bh) && buffer_uptodate(bh))
- return ext3_journal_dirty_data(handle, bh);
- return 0;
-}
-
-/* For write_end() in data=journal mode */
-static int write_end_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (!buffer_mapped(bh) || buffer_freed(bh))
- return 0;
- set_buffer_uptodate(bh);
- return ext3_journal_dirty_metadata(handle, bh);
-}
-
-/*
- * This is nasty and subtle: ext3_write_begin() could have allocated blocks
- * for the whole page but later we failed to copy the data in. Update inode
- * size according to what we managed to copy. The rest is going to be
- * truncated in write_end function.
- */
-static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
-{
- /* What matters to us is i_disksize. We don't write i_size anywhere */
- if (pos + copied > inode->i_size)
- i_size_write(inode, pos + copied);
- if (pos + copied > EXT3_I(inode)->i_disksize) {
- EXT3_I(inode)->i_disksize = pos + copied;
- mark_inode_dirty(inode);
- }
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext3 never places buffers on inode->i_mapping->private_list. metadata
- * buffers are managed internally.
- */
-static int ext3_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- unsigned from, to;
- int ret = 0, ret2;
-
- trace_ext3_ordered_write_end(inode, pos, len, copied);
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + copied;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, journal_dirty_data_fn);
-
- if (ret == 0)
- update_file_sizes(inode, pos, copied);
- /*
- * There may be allocated blocks outside of i_size because
- * we failed to copy some data. Prepare for truncate.
- */
- if (pos + len > inode->i_size && ext3_can_truncate(inode))
- ext3_orphan_add(handle, inode);
- ret2 = ext3_journal_stop(handle);
- if (!ret)
- ret = ret2;
- unlock_page(page);
- page_cache_release(page);
-
- if (pos + len > inode->i_size)
- ext3_truncate_failed_write(inode);
- return ret ? ret : copied;
-}
-
-static int ext3_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- int ret;
-
- trace_ext3_writeback_write_end(inode, pos, len, copied);
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
- update_file_sizes(inode, pos, copied);
- /*
- * There may be allocated blocks outside of i_size because
- * we failed to copy some data. Prepare for truncate.
- */
- if (pos + len > inode->i_size && ext3_can_truncate(inode))
- ext3_orphan_add(handle, inode);
- ret = ext3_journal_stop(handle);
- unlock_page(page);
- page_cache_release(page);
-
- if (pos + len > inode->i_size)
- ext3_truncate_failed_write(inode);
- return ret ? ret : copied;
-}
-
-static int ext3_journalled_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- handle_t *handle = ext3_journal_current_handle();
- struct inode *inode = mapping->host;
- struct ext3_inode_info *ei = EXT3_I(inode);
- int ret = 0, ret2;
- int partial = 0;
- unsigned from, to;
-
- trace_ext3_journalled_write_end(inode, pos, len, copied);
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
-
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from + copied, to);
- to = from + copied;
- }
-
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
-
- if (pos + copied > inode->i_size)
- i_size_write(inode, pos + copied);
- /*
- * There may be allocated blocks outside of i_size because
- * we failed to copy some data. Prepare for truncate.
- */
- if (pos + len > inode->i_size && ext3_can_truncate(inode))
- ext3_orphan_add(handle, inode);
- ext3_set_inode_state(inode, EXT3_STATE_JDATA);
- atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
- if (inode->i_size > ei->i_disksize) {
- ei->i_disksize = inode->i_size;
- ret2 = ext3_mark_inode_dirty(handle, inode);
- if (!ret)
- ret = ret2;
- }
-
- ret2 = ext3_journal_stop(handle);
- if (!ret)
- ret = ret2;
- unlock_page(page);
- page_cache_release(page);
-
- if (pos + len > inode->i_size)
- ext3_truncate_failed_write(inode);
- return ret ? ret : copied;
-}
-
-/*
- * bmap() is special. It gets used by applications such as lilo and by
- * the swapper to find the on-disk block of a specific piece of data.
- *
- * Naturally, this is dangerous if the block concerned is still in the
- * journal. If somebody makes a swapfile on an ext3 data-journaling
- * filesystem and enables swap, then they may get a nasty shock when the
- * data getting swapped to that swapfile suddenly gets overwritten by
- * the original zero's written out previously to the journal and
- * awaiting writeback in the kernel's buffer cache.
- *
- * So, if we see any bmap calls here on a modified, data-journaled file,
- * take extra steps to flush any blocks which might be in the cache.
- */
-static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
-{
- struct inode *inode = mapping->host;
- journal_t *journal;
- int err;
-
- if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
- /*
- * This is a REALLY heavyweight approach, but the use of
- * bmap on dirty files is expected to be extremely rare:
- * only if we run lilo or swapon on a freshly made file
- * do we expect this to happen.
- *
- * (bmap requires CAP_SYS_RAWIO so this does not
- * represent an unprivileged user DOS attack --- we'd be
- * in trouble if mortal users could trigger this path at
- * will.)
- *
- * NB. EXT3_STATE_JDATA is not set on files other than
- * regular files. If somebody wants to bmap a directory
- * or symlink and gets confused because the buffer
- * hasn't yet been flushed to disk, they deserve
- * everything they get.
- */
-
- ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
- journal = EXT3_JOURNAL(inode);
- journal_lock_updates(journal);
- err = journal_flush(journal);
- journal_unlock_updates(journal);
-
- if (err)
- return 0;
- }
-
- return generic_block_bmap(mapping,block,ext3_get_block);
-}
-
-static int bget_one(handle_t *handle, struct buffer_head *bh)
-{
- get_bh(bh);
- return 0;
-}
-
-static int bput_one(handle_t *handle, struct buffer_head *bh)
-{
- put_bh(bh);
- return 0;
-}
-
-static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
-{
- return !buffer_mapped(bh);
-}
-
-/*
- * Note that whenever we need to map blocks we start a transaction even if
- * we're not journalling data. This is to preserve ordering: any hole
- * instantiation within __block_write_full_page -> ext3_get_block() should be
- * journalled along with the data so we don't crash and then get metadata which
- * refers to old data.
- *
- * In all journalling modes block_write_full_page() will start the I/O.
- *
- * We don't honour synchronous mounts for writepage(). That would be
- * disastrous. Any write() or metadata operation will sync the fs for
- * us.
- */
-static int ext3_ordered_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
- /*
- * We don't want to warn for emergency remount. The condition is
- * ordered to avoid dereferencing inode->i_sb in non-error case to
- * avoid slow-downs.
- */
- WARN_ON_ONCE(IS_RDONLY(inode) &&
- !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext3_journal_current_handle())
- goto out_fail;
-
- trace_ext3_ordered_writepage(page);
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- page_bufs = page_buffers(page);
- } else {
- page_bufs = page_buffers(page);
- if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, buffer_unmapped)) {
- /* Provide NULL get_block() to catch bugs if buffers
- * weren't really mapped */
- return block_write_full_page(page, NULL, wbc);
- }
- }
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
-
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
-
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext3_get_block, wbc);
-
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
-
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0)
- ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, journal_dirty_data_fn);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return ret;
-}
-
-static int ext3_writeback_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
- /*
- * We don't want to warn for emergency remount. The condition is
- * ordered to avoid dereferencing inode->i_sb in non-error case to
- * avoid slow-downs.
- */
- WARN_ON_ONCE(IS_RDONLY(inode) &&
- !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
-
- if (ext3_journal_current_handle())
- goto out_fail;
-
- trace_ext3_writeback_writepage(page);
- if (page_has_buffers(page)) {
- if (!walk_page_buffers(NULL, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
- /* Provide NULL get_block() to catch bugs if buffers
- * weren't really mapped */
- return block_write_full_page(page, NULL, wbc);
- }
- }
-
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
-
- ret = block_write_full_page(page, ext3_get_block, wbc);
-
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return ret;
-}
-
-static int ext3_journalled_writepage(struct page *page,
- struct writeback_control *wbc)
-{
- struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
- /*
- * We don't want to warn for emergency remount. The condition is
- * ordered to avoid dereferencing inode->i_sb in non-error case to
- * avoid slow-downs.
- */
- WARN_ON_ONCE(IS_RDONLY(inode) &&
- !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
-
- trace_ext3_journalled_writepage(page);
- if (!page_has_buffers(page) || PageChecked(page)) {
- if (ext3_journal_current_handle())
- goto no_write;
-
- handle = ext3_journal_start(inode,
- ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
- /*
- * It's mmapped pagecache. Add buffers and journal it. There
- * doesn't seem much point in redirtying the page here.
- */
- ClearPageChecked(page);
- ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
- ext3_get_block);
- if (ret != 0) {
- ext3_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- ext3_set_inode_state(inode, EXT3_STATE_JDATA);
- atomic_set(&EXT3_I(inode)->i_datasync_tid,
- handle->h_transaction->t_tid);
- unlock_page(page);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- } else {
- /*
- * It is a page full of checkpoint-mode buffers. Go and write
- * them. They should have been already mapped when they went
- * to the journal so provide NULL get_block function to catch
- * errors.
- */
- ret = block_write_full_page(page, NULL, wbc);
- }
-out:
- return ret;
-
-no_write:
- redirty_page_for_writepage(wbc, page);
-out_unlock:
- unlock_page(page);
- goto out;
-}
-
-static int ext3_readpage(struct file *file, struct page *page)
-{
- trace_ext3_readpage(page);
- return mpage_readpage(page, ext3_get_block);
-}
-
-static int
-ext3_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
-{
- return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
-}
-
-static void ext3_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- journal_t *journal = EXT3_JOURNAL(page->mapping->host);
-
- trace_ext3_invalidatepage(page, offset, length);
-
- /*
- * If it's a full truncate we just forget about the pending dirtying
- */
- if (offset == 0 && length == PAGE_CACHE_SIZE)
- ClearPageChecked(page);
-
- journal_invalidatepage(journal, page, offset, length);
-}
-
-static int ext3_releasepage(struct page *page, gfp_t wait)
-{
- journal_t *journal = EXT3_JOURNAL(page->mapping->host);
-
- trace_ext3_releasepage(page);
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
- return 0;
- return journal_try_to_free_buffers(journal, page, wait);
-}
-
-/*
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
- */
-static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext3_inode_info *ei = EXT3_I(inode);
- handle_t *handle;
- ssize_t ret;
- int orphan = 0;
- size_t count = iov_iter_count(iter);
- int retries = 0;
-
- trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-
- if (iov_iter_rw(iter) == WRITE) {
- loff_t final_size = offset + count;
-
- if (final_size > inode->i_size) {
- /* Credits for sb + inode write */
- handle = ext3_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext3_orphan_add(handle, inode);
- if (ret) {
- ext3_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ei->i_disksize = inode->i_size;
- ext3_journal_stop(handle);
- }
- }
-
-retry:
- ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block);
- /*
- * In case of error extending write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- */
- if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
-
- if (end > isize)
- ext3_truncate_failed_direct_write(inode);
- }
- if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
- if (orphan) {
- int err;
-
- /* Credits for sb + inode write */
- handle = ext3_journal_start(inode, 2);
- if (IS_ERR(handle)) {
- /* This is really bad luck. We've written the data
- * but cannot extend i_size. Truncate allocated blocks
- * and pretend the write failed... */
- ext3_truncate_failed_direct_write(inode);
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext3_orphan_del(NULL, inode);
- goto out;
- }
- if (inode->i_nlink)
- ext3_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size) {
- ei->i_disksize = end;
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext3_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext3_mark_inode_dirty(handle, inode);
- }
- }
- err = ext3_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
- return ret;
-}
-
-/*
- * Pages can be marked dirty completely asynchronously from ext3's journalling
- * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
- * much here because ->set_page_dirty is called under VFS locks. The page is
- * not necessarily locked.
- *
- * We cannot just dirty the page and leave attached buffers clean, because the
- * buffers' dirty state is "definitive". We cannot just set the buffers dirty
- * or jbddirty because all the journalling code will explode.
- *
- * So what we do is to mark the page "pending dirty" and next time writepage
- * is called, propagate that into the buffers appropriately.
- */
-static int ext3_journalled_set_page_dirty(struct page *page)
-{
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
-}
-
-static const struct address_space_operations ext3_ordered_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_ordered_writepage,
- .write_begin = ext3_write_begin,
- .write_end = ext3_ordered_write_end,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
- .is_dirty_writeback = buffer_check_dirty_writeback,
- .error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext3_writeback_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_writeback_writepage,
- .write_begin = ext3_write_begin,
- .write_end = ext3_writeback_write_end,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
- .migratepage = buffer_migrate_page,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
-};
-
-static const struct address_space_operations ext3_journalled_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_journalled_writepage,
- .write_begin = ext3_write_begin,
- .write_end = ext3_journalled_write_end,
- .set_page_dirty = ext3_journalled_set_page_dirty,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
-};
-
-void ext3_set_aops(struct inode *inode)
-{
- if (ext3_should_order_data(inode))
- inode->i_mapping->a_ops = &ext3_ordered_aops;
- else if (ext3_should_writeback_data(inode))
- inode->i_mapping->a_ops = &ext3_writeback_aops;
- else
- inode->i_mapping->a_ops = &ext3_journalled_aops;
-}
-
-/*
- * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-static int ext3_block_truncate_page(struct inode *inode, loff_t from)
-{
- ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE - 1);
- unsigned blocksize, iblock, length, pos;
- struct page *page;
- handle_t *handle = NULL;
- struct buffer_head *bh;
- int err = 0;
-
- /* Truncated on block boundary - nothing to do */
- blocksize = inode->i_sb->s_blocksize;
- if ((from & (blocksize - 1)) == 0)
- return 0;
-
- page = grab_cache_page(inode->i_mapping, index);
- if (!page)
- return -ENOMEM;
- length = blocksize - (offset & (blocksize - 1));
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
-
- /* Find the buffer that contains "offset" */
- bh = page_buffers(page);
- pos = blocksize;
- while (offset >= pos) {
- bh = bh->b_this_page;
- iblock++;
- pos += blocksize;
- }
-
- err = 0;
- if (buffer_freed(bh)) {
- BUFFER_TRACE(bh, "freed: skip");
- goto unlock;
- }
-
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "unmapped");
- ext3_get_block(inode, iblock, bh, 0);
- /* unmapped? It's a hole - nothing to do */
- if (!buffer_mapped(bh)) {
- BUFFER_TRACE(bh, "still unmapped");
- goto unlock;
- }
- }
-
- /* Ok, it's mapped. Make sure it's up-to-date */
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
-
- if (!bh_uptodate_or_lock(bh)) {
- err = bh_submit_read(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (err)
- goto unlock;
- }
-
- /* data=writeback mode doesn't need transaction to zero-out data */
- if (!ext3_should_writeback_data(inode)) {
- /* We journal at most one block */
- handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle)) {
- clear_highpage(page);
- flush_dcache_page(page);
- err = PTR_ERR(handle);
- goto unlock;
- }
- }
-
- if (ext3_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext3_journal_get_write_access(handle, bh);
- if (err)
- goto stop;
- }
-
- zero_user(page, offset, length);
- BUFFER_TRACE(bh, "zeroed end of block");
-
- err = 0;
- if (ext3_should_journal_data(inode)) {
- err = ext3_journal_dirty_metadata(handle, bh);
- } else {
- if (ext3_should_order_data(inode))
- err = ext3_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
- }
-stop:
- if (handle)
- ext3_journal_stop(handle);
-
-unlock:
- unlock_page(page);
- page_cache_release(page);
- return err;
-}
-
-/*
- * Probably it should be a library function... search for first non-zero word
- * or memcmp with zero_page, whatever is better for particular architecture.
- * Linus?
- */
-static inline int all_zeroes(__le32 *p, __le32 *q)
-{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
-}
-
-/**
- * ext3_find_shared - find the indirect blocks for partial truncation.
- * @inode: inode in question
- * @depth: depth of the affected branch
- * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
- * @chain: place to store the pointers to partial indirect blocks
- * @top: place to the (detached) top of branch
- *
- * This is a helper function used by ext3_truncate().
- *
- * When we do truncate() we may have to clean the ends of several
- * indirect blocks but leave the blocks themselves alive. Block is
- * partially truncated if some data below the new i_size is referred
- * from it (and it is on the path to the first completely truncated
- * data block, indeed). We have to free the top of that path along
- * with everything to the right of the path. Since no allocation
- * past the truncation point is possible until ext3_truncate()
- * finishes, we may safely do the latter, but top of branch may
- * require special attention - pageout below the truncation point
- * might try to populate it.
- *
- * We atomically detach the top of branch from the tree, store the
- * block number of its root in *@top, pointers to buffer_heads of
- * partially truncated blocks - in @chain[].bh and pointers to
- * their last elements that should not be removed - in
- * @chain[].p. Return value is the pointer to last filled element
- * of @chain.
- *
- * The work left to caller to do the actual freeing of subtrees:
- * a) free the subtree starting from *@top
- * b) free the subtrees whose roots are stored in
- * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
- * c) free the subtrees growing from the inode past the @chain[0].
- * (no partially truncated stuff there). */
-
-static Indirect *ext3_find_shared(struct inode *inode, int depth,
- int offsets[4], Indirect chain[4], __le32 *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- /* Make k index the deepest non-null offset + 1 */
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = ext3_get_branch(inode, k, offsets, chain, &err);
- /* Writer: pointers */
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p)
- /* Writer: end */
- goto no_top;
- for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- /* Nope, don't do this in ext3. Must leave the tree intact */
-#if 0
- *p->p = 0;
-#endif
- }
- /* Writer: end */
-
- while(partial > p) {
- brelse(partial->bh);
- partial--;
- }
-no_top:
- return partial;
-}
-
-/*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
- *
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
- */
-static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext3_fsblk_t block_to_free,
- unsigned long count, __le32 *first, __le32 *last)
-{
- __le32 *p;
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- if (ext3_journal_dirty_metadata(handle, bh))
- return;
- }
- ext3_mark_inode_dirty(handle, inode);
- truncate_restart_transaction(handle, inode);
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- if (ext3_journal_get_write_access(handle, bh))
- return;
- }
- }
-
- /*
- * Any buffers which are on the journal will be in memory. We find
- * them on the hash table so journal_revoke() will run journal_forget()
- * on them. We've already detached each block from the file, so
- * bforget() in journal_forget() should be safe.
- *
- * AKPM: turn on bforget in journal_forget()!!!
- */
- for (p = first; p < last; p++) {
- u32 nr = le32_to_cpu(*p);
- if (nr) {
- struct buffer_head *bh;
-
- *p = 0;
- bh = sb_find_get_block(inode->i_sb, nr);
- ext3_forget(handle, 0, inode, bh, nr);
- }
- }
-
- ext3_free_blocks(handle, inode, block_to_free, count);
-}
-
-/**
- * ext3_free_data - free a list of data blocks
- * @handle: handle for this transaction
- * @inode: inode we are dealing with
- * @this_bh: indirect buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: points immediately past the end of array
- *
- * We are freeing all blocks referred from that array (numbers are stored as
- * little-endian 32-bit) and updating @inode->i_blocks appropriately.
- *
- * We accumulate contiguous runs of blocks to free. Conveniently, if these
- * blocks are contiguous then releasing them at one time will only affect one
- * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
- * actually use a lot of journal space.
- *
- * @this_bh will be %NULL if @first and @last point into the inode's direct
- * block pointers.
- */
-static void ext3_free_data(handle_t *handle, struct inode *inode,
- struct buffer_head *this_bh,
- __le32 *first, __le32 *last)
-{
- ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
- unsigned long count = 0; /* Number of blocks in the run */
- __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
- corresponding to
- block_to_free */
- ext3_fsblk_t nr; /* Current block # */
- __le32 *p; /* Pointer into inode/ind
- for current block */
- int err;
-
- if (this_bh) { /* For indirect block */
- BUFFER_TRACE(this_bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, this_bh);
- /* Important: if we can't update the indirect pointers
- * to the blocks, we can't free them. */
- if (err)
- return;
- }
-
- for (p = first; p < last; p++) {
- nr = le32_to_cpu(*p);
- if (nr) {
- /* accumulate blocks to free if they're contiguous */
- if (count == 0) {
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- } else if (nr == block_to_free + count) {
- count++;
- } else {
- ext3_clear_blocks(handle, inode, this_bh,
- block_to_free,
- count, block_to_free_p, p);
- block_to_free = nr;
- block_to_free_p = p;
- count = 1;
- }
- }
- }
-
- if (count > 0)
- ext3_clear_blocks(handle, inode, this_bh, block_to_free,
- count, block_to_free_p, p);
-
- if (this_bh) {
- BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
-
- /*
- * The buffer head should have an attached journal head at this
- * point. However, if the data is corrupted and an indirect
- * block pointed to itself, it would have been detached when
- * the block was cleared. Check for this instead of OOPSing.
- */
- if (bh2jh(this_bh))
- ext3_journal_dirty_metadata(handle, this_bh);
- else
- ext3_error(inode->i_sb, "ext3_free_data",
- "circular indirect block detected, "
- "inode=%lu, block=%llu",
- inode->i_ino,
- (unsigned long long)this_bh->b_blocknr);
- }
-}
-
-/**
- * ext3_free_branches - free an array of branches
- * @handle: JBD handle for this transaction
- * @inode: inode we are dealing with
- * @parent_bh: the buffer_head which contains *@first and *@last
- * @first: array of block numbers
- * @last: pointer immediately past the end of array
- * @depth: depth of the branches to free
- *
- * We are freeing all blocks referred from these branches (numbers are
- * stored as little-endian 32-bit) and updating @inode->i_blocks
- * appropriately.
- */
-static void ext3_free_branches(handle_t *handle, struct inode *inode,
- struct buffer_head *parent_bh,
- __le32 *first, __le32 *last, int depth)
-{
- ext3_fsblk_t nr;
- __le32 *p;
-
- if (is_handle_aborted(handle))
- return;
-
- if (depth--) {
- struct buffer_head *bh;
- int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
- p = last;
- while (--p >= first) {
- nr = le32_to_cpu(*p);
- if (!nr)
- continue; /* A hole */
-
- /* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
-
- /*
- * A read failure? Report error and clear slot
- * (should be rare).
- */
- if (!bh) {
- ext3_error(inode->i_sb, "ext3_free_branches",
- "Read failure, inode=%lu, block="E3FSBLK,
- inode->i_ino, nr);
- continue;
- }
-
- /* This zaps the entire block. Bottom up. */
- BUFFER_TRACE(bh, "free child branches");
- ext3_free_branches(handle, inode, bh,
- (__le32*)bh->b_data,
- (__le32*)bh->b_data + addr_per_block,
- depth);
-
- /*
- * Everything below this this pointer has been
- * released. Now let this top-of-subtree go.
- *
- * We want the freeing of this indirect block to be
- * atomic in the journal with the updating of the
- * bitmap block which owns it. So make some room in
- * the journal.
- *
- * We zero the parent pointer *after* freeing its
- * pointee in the bitmaps, so if extend_transaction()
- * for some reason fails to put the bitmap changes and
- * the release into the same transaction, recovery
- * will merely complain about releasing a free block,
- * rather than leaking blocks.
- */
- if (is_handle_aborted(handle))
- return;
- if (try_to_extend_transaction(handle, inode)) {
- ext3_mark_inode_dirty(handle, inode);
- truncate_restart_transaction(handle, inode);
- }
-
- /*
- * We've probably journalled the indirect block several
- * times during the truncate. But it's no longer
- * needed and we now drop it from the transaction via
- * journal_revoke().
- *
- * That's easy if it's exclusively part of this
- * transaction. But if it's part of the committing
- * transaction then journal_forget() will simply
- * brelse() it. That means that if the underlying
- * block is reallocated in ext3_get_block(),
- * unmap_underlying_metadata() will find this block
- * and will try to get rid of it. damn, damn. Thus
- * we don't allow a block to be reallocated until
- * a transaction freeing it has fully committed.
- *
- * We also have to make sure journal replay after a
- * crash does not overwrite non-journaled data blocks
- * with old metadata when the block got reallocated for
- * data. Thus we have to store a revoke record for a
- * block in the same transaction in which we free the
- * block.
- */
- ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
-
- ext3_free_blocks(handle, inode, nr, 1);
-
- if (parent_bh) {
- /*
- * The block which we have just freed is
- * pointed to by an indirect block: journal it
- */
- BUFFER_TRACE(parent_bh, "get_write_access");
- if (!ext3_journal_get_write_access(handle,
- parent_bh)){
- *p = 0;
- BUFFER_TRACE(parent_bh,
- "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle,
- parent_bh);
- }
- }
- }
- } else {
- /* We have reached the bottom of the tree. */
- BUFFER_TRACE(parent_bh, "free data blocks");
- ext3_free_data(handle, inode, parent_bh, first, last);
- }
-}
-
-int ext3_can_truncate(struct inode *inode)
-{
- if (S_ISREG(inode->i_mode))
- return 1;
- if (S_ISDIR(inode->i_mode))
- return 1;
- if (S_ISLNK(inode->i_mode))
- return !ext3_inode_is_fast_symlink(inode);
- return 0;
-}
-
-/*
- * ext3_truncate()
- *
- * We block out ext3_get_block() block instantiations across the entire
- * transaction, and VFS/VM ensures that ext3_truncate() cannot run
- * simultaneously on behalf of the same inode.
- *
- * As we work through the truncate and commit bits of it to the journal there
- * is one core, guiding principle: the file's tree must always be consistent on
- * disk. We must be able to restart the truncate after a crash.
- *
- * The file's tree may be transiently inconsistent in memory (although it
- * probably isn't), but whenever we close off and commit a journal transaction,
- * the contents of (the filesystem + the journal) must be consistent and
- * restartable. It's pretty simple, really: bottom up, right to left (although
- * left-to-right works OK too).
- *
- * Note that at recovery time, journal replay occurs *before* the restart of
- * truncate against the orphan inode list.
- *
- * The committed inode has the new, desired i_size (which is the same as
- * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
- * that this inode's truncate did not complete and it will again call
- * ext3_truncate() to have another go. So there will be instantiated blocks
- * to the right of the truncation point in a crashed ext3 filesystem. But
- * that's fine - as long as they are linked from the inode, the post-crash
- * ext3_truncate() run will find them and release them.
- */
-void ext3_truncate(struct inode *inode)
-{
- handle_t *handle;
- struct ext3_inode_info *ei = EXT3_I(inode);
- __le32 *i_data = ei->i_data;
- int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
- int offsets[4];
- Indirect chain[4];
- Indirect *partial;
- __le32 nr = 0;
- int n;
- long last_block;
- unsigned blocksize = inode->i_sb->s_blocksize;
-
- trace_ext3_truncate_enter(inode);
-
- if (!ext3_can_truncate(inode))
- goto out_notrans;
-
- if (inode->i_size == 0 && ext3_should_writeback_data(inode))
- ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
-
- handle = start_transaction(inode);
- if (IS_ERR(handle))
- goto out_notrans;
-
- last_block = (inode->i_size + blocksize-1)
- >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
- n = ext3_block_to_path(inode, last_block, offsets, NULL);
- if (n == 0)
- goto out_stop; /* error */
-
- /*
- * OK. This truncate is going to happen. We add the inode to the
- * orphan list, so that if this truncate spans multiple transactions,
- * and we crash, we will resume the truncate when the filesystem
- * recovers. It also marks the inode dirty, to catch the new size.
- *
- * Implication: the file must always be in a sane, consistent
- * truncatable state while each transaction commits.
- */
- if (ext3_orphan_add(handle, inode))
- goto out_stop;
-
- /*
- * The orphan list entry will now protect us from any crash which
- * occurs before the truncate completes, so it is now safe to propagate
- * the new, shorter inode size (held for now in i_size) into the
- * on-disk inode. We do this via i_disksize, which is the value which
- * ext3 *really* writes onto the disk inode.
- */
- ei->i_disksize = inode->i_size;
-
- /*
- * From here we block out all ext3_get_block() callers who want to
- * modify the block allocation tree.
- */
- mutex_lock(&ei->truncate_mutex);
-
- if (n == 1) { /* direct blocks */
- ext3_free_data(handle, inode, NULL, i_data+offsets[0],
- i_data + EXT3_NDIR_BLOCKS);
- goto do_indirects;
- }
-
- partial = ext3_find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (not detached) */
- if (nr) {
- if (partial == chain) {
- /* Shared branch grows from the inode */
- ext3_free_branches(handle, inode, NULL,
- &nr, &nr+1, (chain+n-1) - partial);
- *partial->p = 0;
- /*
- * We mark the inode dirty prior to restart,
- * and prior to stop. No need for it here.
- */
- } else {
- /* Shared branch grows from an indirect block */
- ext3_free_branches(handle, inode, partial->bh,
- partial->p,
- partial->p+1, (chain+n-1) - partial);
- }
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
- (__le32*)partial->bh->b_data+addr_per_block,
- (chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse (partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees */
- switch (offsets[0]) {
- default:
- nr = i_data[EXT3_IND_BLOCK];
- if (nr) {
- ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
- i_data[EXT3_IND_BLOCK] = 0;
- }
- case EXT3_IND_BLOCK:
- nr = i_data[EXT3_DIND_BLOCK];
- if (nr) {
- ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
- i_data[EXT3_DIND_BLOCK] = 0;
- }
- case EXT3_DIND_BLOCK:
- nr = i_data[EXT3_TIND_BLOCK];
- if (nr) {
- ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
- i_data[EXT3_TIND_BLOCK] = 0;
- }
- case EXT3_TIND_BLOCK:
- ;
- }
-
- ext3_discard_reservation(inode);
-
- mutex_unlock(&ei->truncate_mutex);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
-
- /*
- * In a multi-transaction truncate, we only make the final transaction
- * synchronous
- */
- if (IS_SYNC(inode))
- handle->h_sync = 1;
-out_stop:
- /*
- * If this was a simple ftruncate(), and the file will remain alive
- * then we need to clear up the orphan record which we created above.
- * However, if this was a real unlink then we were called by
- * ext3_evict_inode(), and we allow that function to clean up the
- * orphan info for us.
- */
- if (inode->i_nlink)
- ext3_orphan_del(handle, inode);
-
- ext3_journal_stop(handle);
- trace_ext3_truncate_exit(inode);
- return;
-out_notrans:
- /*
- * Delete the inode from orphan list so that it doesn't stay there
- * forever and trigger assertion on umount.
- */
- if (inode->i_nlink)
- ext3_orphan_del(NULL, inode);
- trace_ext3_truncate_exit(inode);
-}
-
-static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
- unsigned long ino, struct ext3_iloc *iloc)
-{
- unsigned long block_group;
- unsigned long offset;
- ext3_fsblk_t block;
- struct ext3_group_desc *gdp;
-
- if (!ext3_valid_inum(sb, ino)) {
- /*
- * This error is already checked for in namei.c unless we are
- * looking at an NFS filehandle, in which case no error
- * report is needed
- */
- return 0;
- }
-
- block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
- gdp = ext3_get_group_desc(sb, block_group, NULL);
- if (!gdp)
- return 0;
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
- EXT3_INODE_SIZE(sb);
- block = le32_to_cpu(gdp->bg_inode_table) +
- (offset >> EXT3_BLOCK_SIZE_BITS(sb));
-
- iloc->block_group = block_group;
- iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
- return block;
-}
-
-/*
- * ext3_get_inode_loc returns with an extra refcount against the inode's
- * underlying buffer_head on success. If 'in_mem' is true, we have all
- * data in memory that is needed to recreate the on-disk version of this
- * inode.
- */
-static int __ext3_get_inode_loc(struct inode *inode,
- struct ext3_iloc *iloc, int in_mem)
-{
- ext3_fsblk_t block;
- struct buffer_head *bh;
-
- block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
- if (!block)
- return -EIO;
-
- bh = sb_getblk(inode->i_sb, block);
- if (unlikely(!bh)) {
- ext3_error (inode->i_sb, "ext3_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block="E3FSBLK,
- inode->i_ino, block);
- return -ENOMEM;
- }
- if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
-
- /*
- * If the buffer has the write error flag, we have failed
- * to write out another inode in the same block. In this
- * case, we don't have to read the block because we may
- * read the old inode data successfully.
- */
- if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
- set_buffer_uptodate(bh);
-
- if (buffer_uptodate(bh)) {
- /* someone brought it uptodate while we waited */
- unlock_buffer(bh);
- goto has_buffer;
- }
-
- /*
- * If we have all information of the inode in memory and this
- * is the only valid inode in the block, we need not read the
- * block.
- */
- if (in_mem) {
- struct buffer_head *bitmap_bh;
- struct ext3_group_desc *desc;
- int inodes_per_buffer;
- int inode_offset, i;
- int block_group;
- int start;
-
- block_group = (inode->i_ino - 1) /
- EXT3_INODES_PER_GROUP(inode->i_sb);
- inodes_per_buffer = bh->b_size /
- EXT3_INODE_SIZE(inode->i_sb);
- inode_offset = ((inode->i_ino - 1) %
- EXT3_INODES_PER_GROUP(inode->i_sb));
- start = inode_offset & ~(inodes_per_buffer - 1);
-
- /* Is the inode bitmap in cache? */
- desc = ext3_get_group_desc(inode->i_sb,
- block_group, NULL);
- if (!desc)
- goto make_io;
-
- bitmap_bh = sb_getblk(inode->i_sb,
- le32_to_cpu(desc->bg_inode_bitmap));
- if (unlikely(!bitmap_bh))
- goto make_io;
-
- /*
- * If the inode bitmap isn't in cache then the
- * optimisation may end up performing two reads instead
- * of one, so skip it.
- */
- if (!buffer_uptodate(bitmap_bh)) {
- brelse(bitmap_bh);
- goto make_io;
- }
- for (i = start; i < start + inodes_per_buffer; i++) {
- if (i == inode_offset)
- continue;
- if (ext3_test_bit(i, bitmap_bh->b_data))
- break;
- }
- brelse(bitmap_bh);
- if (i == start + inodes_per_buffer) {
- /* all other inodes are free, so skip I/O */
- memset(bh->b_data, 0, bh->b_size);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- goto has_buffer;
- }
- }
-
-make_io:
- /*
- * There are other valid inodes in the buffer, this inode
- * has in-inode xattrs, or we don't have this inode in memory.
- * Read the block from disk.
- */
- trace_ext3_load_inode(inode);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ | REQ_META | REQ_PRIO, bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- ext3_error(inode->i_sb, "ext3_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block="E3FSBLK,
- inode->i_ino, block);
- brelse(bh);
- return -EIO;
- }
- }
-has_buffer:
- iloc->bh = bh;
- return 0;
-}
-
-int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
-{
- /* We have all inode data except xattrs in memory here. */
- return __ext3_get_inode_loc(inode, iloc,
- !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
-}
-
-void ext3_set_inode_flags(struct inode *inode)
-{
- unsigned int flags = EXT3_I(inode)->i_flags;
-
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
- if (flags & EXT3_SYNC_FL)
- inode->i_flags |= S_SYNC;
- if (flags & EXT3_APPEND_FL)
- inode->i_flags |= S_APPEND;
- if (flags & EXT3_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
- if (flags & EXT3_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
- if (flags & EXT3_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
-}
-
-/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
-void ext3_get_inode_flags(struct ext3_inode_info *ei)
-{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
- EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT3_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT3_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT3_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT3_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT3_DIRSYNC_FL;
-}
-
-struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
-{
- struct ext3_iloc iloc;
- struct ext3_inode *raw_inode;
- struct ext3_inode_info *ei;
- struct buffer_head *bh;
- struct inode *inode;
- journal_t *journal = EXT3_SB(sb)->s_journal;
- transaction_t *transaction;
- long ret;
- int block;
- uid_t i_uid;
- gid_t i_gid;
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
-
- ei = EXT3_I(inode);
- ei->i_block_alloc_info = NULL;
-
- ret = __ext3_get_inode_loc(inode, &iloc, 0);
- if (ret < 0)
- goto bad_inode;
- bh = iloc.bh;
- raw_inode = ext3_raw_inode(&iloc);
- inode->i_mode = le16_to_cpu(raw_inode->i_mode);
- i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
- i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
- if(!(test_opt (inode->i_sb, NO_UID32))) {
- i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
- i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
- }
- i_uid_write(inode, i_uid);
- i_gid_write(inode, i_gid);
- set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
- inode->i_size = le32_to_cpu(raw_inode->i_size);
- inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
- inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
- inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
-
- ei->i_state_flags = 0;
- ei->i_dir_start_lookup = 0;
- ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
- /* We now have enough fields to check if the inode was active or not.
- * This is needed because nfsd might try to access dead inodes
- * the test is that same one that e2fsck uses
- * NeilBrown 1999oct15
- */
- if (inode->i_nlink == 0) {
- if (inode->i_mode == 0 ||
- !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
- /* this inode is deleted */
- brelse (bh);
- ret = -ESTALE;
- goto bad_inode;
- }
- /* The only unlinked inodes we let through here have
- * valid i_mode and are being read by the orphan
- * recovery code: that's fine, we're about to complete
- * the process of deleting those. */
- }
- inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
- ei->i_flags = le32_to_cpu(raw_inode->i_flags);
-#ifdef EXT3_FRAGMENTS
- ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
- ei->i_frag_no = raw_inode->i_frag;
- ei->i_frag_size = raw_inode->i_fsize;
-#endif
- ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
- if (!S_ISREG(inode->i_mode)) {
- ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
- } else {
- inode->i_size |=
- ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
- }
- ei->i_disksize = inode->i_size;
- inode->i_generation = le32_to_cpu(raw_inode->i_generation);
- ei->i_block_group = iloc.block_group;
- /*
- * NOTE! The in-memory inode i_data array is in little-endian order
- * even on big-endian machines: we do NOT byteswap the block numbers!
- */
- for (block = 0; block < EXT3_N_BLOCKS; block++)
- ei->i_data[block] = raw_inode->i_block[block];
- INIT_LIST_HEAD(&ei->i_orphan);
-
- /*
- * Set transaction id's of transactions that have to be committed
- * to finish f[data]sync. We set them to currently running transaction
- * as we cannot be sure that the inode or some of its metadata isn't
- * part of the transaction - the inode could have been reclaimed and
- * now it is reread from disk.
- */
- if (journal) {
- tid_t tid;
-
- spin_lock(&journal->j_state_lock);
- if (journal->j_running_transaction)
- transaction = journal->j_running_transaction;
- else
- transaction = journal->j_committing_transaction;
- if (transaction)
- tid = transaction->t_tid;
- else
- tid = journal->j_commit_sequence;
- spin_unlock(&journal->j_state_lock);
- atomic_set(&ei->i_sync_tid, tid);
- atomic_set(&ei->i_datasync_tid, tid);
- }
-
- if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
- EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
- /*
- * When mke2fs creates big inodes it does not zero out
- * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
- * so ignore those first few inodes.
- */
- ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
- if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT3_INODE_SIZE(inode->i_sb)) {
- brelse (bh);
- ret = -EIO;
- goto bad_inode;
- }
- if (ei->i_extra_isize == 0) {
- /* The extra space is currently unused. Use it. */
- ei->i_extra_isize = sizeof(struct ext3_inode) -
- EXT3_GOOD_OLD_INODE_SIZE;
- } else {
- __le32 *magic = (void *)raw_inode +
- EXT3_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
- ext3_set_inode_state(inode, EXT3_STATE_XATTR);
- }
- } else
- ei->i_extra_isize = 0;
-
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &ext3_file_inode_operations;
- inode->i_fop = &ext3_file_operations;
- ext3_set_aops(inode);
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &ext3_dir_inode_operations;
- inode->i_fop = &ext3_dir_operations;
- } else if (S_ISLNK(inode->i_mode)) {
- if (ext3_inode_is_fast_symlink(inode)) {
- inode->i_op = &ext3_fast_symlink_inode_operations;
- nd_terminate_link(ei->i_data, inode->i_size,
- sizeof(ei->i_data) - 1);
- inode->i_link = (char *)ei->i_data;
- } else {
- inode->i_op = &ext3_symlink_inode_operations;
- ext3_set_aops(inode);
- }
- } else {
- inode->i_op = &ext3_special_inode_operations;
- if (raw_inode->i_block[0])
- init_special_inode(inode, inode->i_mode,
- old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
- else
- init_special_inode(inode, inode->i_mode,
- new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
- }
- brelse (iloc.bh);
- ext3_set_inode_flags(inode);
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(ret);
-}
-
-/*
- * Post the struct inode info into an on-disk inode location in the
- * buffer-cache. This gobbles the caller's reference to the
- * buffer_head in the inode location struct.
- *
- * The caller must have write access to iloc->bh.
- */
-static int ext3_do_update_inode(handle_t *handle,
- struct inode *inode,
- struct ext3_iloc *iloc)
-{
- struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
- struct ext3_inode_info *ei = EXT3_I(inode);
- struct buffer_head *bh = iloc->bh;
- int err = 0, rc, block;
- int need_datasync = 0;
- __le32 disksize;
- uid_t i_uid;
- gid_t i_gid;
-
-again:
- /* we can't allow multiple procs in here at once, its a bit racey */
- lock_buffer(bh);
-
- /* For fields not not tracking in the in-memory inode,
- * initialise them to zero for new inodes. */
- if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
- memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
-
- ext3_get_inode_flags(ei);
- raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- i_uid = i_uid_read(inode);
- i_gid = i_gid_read(inode);
- if(!(test_opt(inode->i_sb, NO_UID32))) {
- raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
- raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
-/*
- * Fix up interoperability with old kernels. Otherwise, old inodes get
- * re-used with the upper 16 bits of the uid/gid intact
- */
- if(!ei->i_dtime) {
- raw_inode->i_uid_high =
- cpu_to_le16(high_16_bits(i_uid));
- raw_inode->i_gid_high =
- cpu_to_le16(high_16_bits(i_gid));
- } else {
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- }
- } else {
- raw_inode->i_uid_low =
- cpu_to_le16(fs_high2lowuid(i_uid));
- raw_inode->i_gid_low =
- cpu_to_le16(fs_high2lowgid(i_gid));
- raw_inode->i_uid_high = 0;
- raw_inode->i_gid_high = 0;
- }
- raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- disksize = cpu_to_le32(ei->i_disksize);
- if (disksize != raw_inode->i_size) {
- need_datasync = 1;
- raw_inode->i_size = disksize;
- }
- raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
- raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
- raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags = cpu_to_le32(ei->i_flags);
-#ifdef EXT3_FRAGMENTS
- raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
- raw_inode->i_frag = ei->i_frag_no;
- raw_inode->i_fsize = ei->i_frag_size;
-#endif
- raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
- if (!S_ISREG(inode->i_mode)) {
- raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
- } else {
- disksize = cpu_to_le32(ei->i_disksize >> 32);
- if (disksize != raw_inode->i_size_high) {
- raw_inode->i_size_high = disksize;
- need_datasync = 1;
- }
- if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
- if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
- EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
- EXT3_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT3_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- unlock_buffer(bh);
- err = ext3_journal_get_write_access(handle,
- EXT3_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
-
- ext3_update_dynamic_rev(sb);
- EXT3_SET_RO_COMPAT_FEATURE(sb,
- EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
- handle->h_sync = 1;
- err = ext3_journal_dirty_metadata(handle,
- EXT3_SB(sb)->s_sbh);
- /* get our lock and start over */
- goto again;
- }
- }
- }
- raw_inode->i_generation = cpu_to_le32(inode->i_generation);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- raw_inode->i_block[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- raw_inode->i_block[1] = 0;
- } else {
- raw_inode->i_block[0] = 0;
- raw_inode->i_block[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- raw_inode->i_block[2] = 0;
- }
- } else for (block = 0; block < EXT3_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
-
- if (ei->i_extra_isize)
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
-
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- unlock_buffer(bh);
- rc = ext3_journal_dirty_metadata(handle, bh);
- if (!err)
- err = rc;
- ext3_clear_inode_state(inode, EXT3_STATE_NEW);
-
- atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
- if (need_datasync)
- atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
-out_brelse:
- brelse (bh);
- ext3_std_error(inode->i_sb, err);
- return err;
-}
-
-/*
- * ext3_write_inode()
- *
- * We are called from a few places:
- *
- * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
- * Here, there will be no transaction running. We wait for any running
- * transaction to commit.
- *
- * - Within flush work (for sys_sync(), kupdate and such).
- * We wait on commit, if told to.
- *
- * - Within iput_final() -> write_inode_now()
- * We wait on commit, if told to.
- *
- * In all cases it is actually safe for us to return without doing anything,
- * because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
- * writeback.
- *
- * Note that we are absolutely dependent upon all inode dirtiers doing the
- * right thing: they *must* call mark_inode_dirty() after dirtying info in
- * which we are interested.
- *
- * It would be a bug for them to not do this. The code:
- *
- * mark_inode_dirty(inode)
- * stuff();
- * inode->i_size = expr;
- *
- * is in error because write_inode() could occur while `stuff()' is running,
- * and the new i_size will be lost. Plus the inode will no longer be on the
- * superblock's dirty inode list.
- */
-int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
- return 0;
-
- if (ext3_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
- }
-
- /*
- * No need to force transaction in WB_SYNC_NONE mode. Also
- * ext3_sync_fs() will force the commit after everything is
- * written.
- */
- if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
- return 0;
-
- return ext3_force_commit(inode->i_sb);
-}
-
-/*
- * ext3_setattr()
- *
- * Called from notify_change.
- *
- * We want to trap VFS attempts to truncate the file as soon as
- * possible. In particular, we want to make sure that when the VFS
- * shrinks i_size, we put the inode on the orphan list and modify
- * i_disksize immediately, so that during the subsequent flushing of
- * dirty pages and freeing of disk blocks, we can guarantee that any
- * commit will leave the blocks being flushed in an unused state on
- * disk. (On recovery, the inode will get truncated and the blocks will
- * be freed, so we have a strong guarantee that no future commit will
- * leave these blocks visible to the user.)
- *
- * Called with inode->sem down.
- */
-int ext3_setattr(struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- int error, rc = 0;
- const unsigned int ia_valid = attr->ia_valid;
-
- error = inode_change_ok(inode, attr);
- if (error)
- return error;
-
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
- handle_t *handle;
-
- /* (user+group)*(old+new) structure, inode write (sb,
- * inode block, ? - but truncate inode update has it) */
- handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
- EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
- error = dquot_transfer(inode, attr);
- if (error) {
- ext3_journal_stop(handle);
- return error;
- }
- /* Update corresponding info in inode so that everything is in
- * one transaction */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
- error = ext3_mark_inode_dirty(handle, inode);
- ext3_journal_stop(handle);
- }
-
- if (attr->ia_valid & ATTR_SIZE)
- inode_dio_wait(inode);
-
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
- handle_t *handle;
-
- handle = ext3_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
-
- error = ext3_orphan_add(handle, inode);
- if (error) {
- ext3_journal_stop(handle);
- goto err_out;
- }
- EXT3_I(inode)->i_disksize = attr->ia_size;
- error = ext3_mark_inode_dirty(handle, inode);
- ext3_journal_stop(handle);
- if (error) {
- /* Some hard fs error must have happened. Bail out. */
- ext3_orphan_del(NULL, inode);
- goto err_out;
- }
- rc = ext3_block_truncate_page(inode, attr->ia_size);
- if (rc) {
- /* Cleanup orphan list and exit */
- handle = ext3_journal_start(inode, 3);
- if (IS_ERR(handle)) {
- ext3_orphan_del(NULL, inode);
- goto err_out;
- }
- ext3_orphan_del(handle, inode);
- ext3_journal_stop(handle);
- goto err_out;
- }
- }
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- truncate_setsize(inode, attr->ia_size);
- ext3_truncate(inode);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- if (ia_valid & ATTR_MODE)
- rc = posix_acl_chmod(inode, inode->i_mode);
-
-err_out:
- ext3_std_error(inode->i_sb, error);
- if (!error)
- error = rc;
- return error;
-}
-
-
-/*
- * How many blocks doth make a writepage()?
- *
- * With N blocks per page, it may be:
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
- *
- * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
- *
- * With ordered or writeback data it's the same, less the N data blocks.
- *
- * If the inode's direct blocks can hold an integral number of pages then a
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
- *
- * This still overestimates under most circumstances. If we were to pass the
- * start and end offsets in here as well we could do block_to_path() on each
- * block and work out the exact number of indirects which are touched. Pah.
- */
-
-static int ext3_writepage_trans_blocks(struct inode *inode)
-{
- int bpp = ext3_journal_blocks_per_page(inode);
- int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
- int ret;
-
- if (ext3_should_journal_data(inode))
- ret = 3 * (bpp + indirects) + 2;
- else
- ret = 2 * (bpp + indirects) + indirects + 2;
-
-#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during dquot_initialize so
- * we will be updating only the data blocks + inodes */
- ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-#endif
-
- return ret;
-}
-
-/*
- * The caller must have previously called ext3_reserve_inode_write().
- * Give this, we know that the caller already has write access to iloc->bh.
- */
-int ext3_mark_iloc_dirty(handle_t *handle,
- struct inode *inode, struct ext3_iloc *iloc)
-{
- int err = 0;
-
- /* the do_update_inode consumes one bh->b_count */
- get_bh(iloc->bh);
-
- /* ext3_do_update_inode() does journal_dirty_metadata */
- err = ext3_do_update_inode(handle, inode, iloc);
- put_bh(iloc->bh);
- return err;
-}
-
-/*
- * On success, We end up with an outstanding reference count against
- * iloc->bh. This _must_ be cleaned up later.
- */
-
-int
-ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
- struct ext3_iloc *iloc)
-{
- int err = 0;
- if (handle) {
- err = ext3_get_inode_loc(inode, iloc);
- if (!err) {
- BUFFER_TRACE(iloc->bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, iloc->bh);
- if (err) {
- brelse(iloc->bh);
- iloc->bh = NULL;
- }
- }
- }
- ext3_std_error(inode->i_sb, err);
- return err;
-}
-
-/*
- * What we do here is to mark the in-core inode as clean with respect to inode
- * dirtiness (it may still be data-dirty).
- * This means that the in-core inode may be reaped by prune_icache
- * without having to perform any I/O. This is a very good thing,
- * because *any* task may call prune_icache - even ones which
- * have a transaction open against a different journal.
- *
- * Is this cheating? Not really. Sure, we haven't written the
- * inode out, but prune_icache isn't a user-visible syncing function.
- * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
- * we start and wait on commits.
- */
-int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
-{
- struct ext3_iloc iloc;
- int err;
-
- might_sleep();
- trace_ext3_mark_inode_dirty(inode, _RET_IP_);
- err = ext3_reserve_inode_write(handle, inode, &iloc);
- if (!err)
- err = ext3_mark_iloc_dirty(handle, inode, &iloc);
- return err;
-}
-
-/*
- * ext3_dirty_inode() is called from __mark_inode_dirty()
- *
- * We're really interested in the case where a file is being extended.
- * i_size has been changed by generic_commit_write() and we thus need
- * to include the updated inode in the current transaction.
- *
- * Also, dquot_alloc_space() will always dirty the inode when blocks
- * are allocated to the file.
- *
- * If the inode is marked synchronous, we don't honour that here - doing
- * so would cause a commit on atime updates, which we don't bother doing.
- * We handle synchronous inodes at the highest possible level.
- */
-void ext3_dirty_inode(struct inode *inode, int flags)
-{
- handle_t *current_handle = ext3_journal_current_handle();
- handle_t *handle;
-
- handle = ext3_journal_start(inode, 2);
- if (IS_ERR(handle))
- goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __func__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext3_mark_inode_dirty(handle, inode);
- }
- ext3_journal_stop(handle);
-out:
- return;
-}
-
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early. Unlike
- * ext3_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext3_pin_inode(handle_t *handle, struct inode *inode)
-{
- struct ext3_iloc iloc;
-
- int err = 0;
- if (handle) {
- err = ext3_get_inode_loc(inode, &iloc);
- if (!err) {
- BUFFER_TRACE(iloc.bh, "get_write_access");
- err = journal_get_write_access(handle, iloc.bh);
- if (!err)
- err = ext3_journal_dirty_metadata(handle,
- iloc.bh);
- brelse(iloc.bh);
- }
- }
- ext3_std_error(inode->i_sb, err);
- return err;
-}
-#endif
-
-int ext3_change_inode_journal_flag(struct inode *inode, int val)
-{
- journal_t *journal;
- handle_t *handle;
- int err;
-
- /*
- * We have to be very careful here: changing a data block's
- * journaling status dynamically is dangerous. If we write a
- * data block to the journal, change the status and then delete
- * that block, we risk forgetting to revoke the old log record
- * from the journal and so a subsequent replay can corrupt data.
- * So, first we make sure that the journal is empty and that
- * nobody is changing anything.
- */
-
- journal = EXT3_JOURNAL(inode);
- if (is_journal_aborted(journal))
- return -EROFS;
-
- journal_lock_updates(journal);
- journal_flush(journal);
-
- /*
- * OK, there are no updates running now, and all cached data is
- * synced to disk. We are now in a completely consistent state
- * which doesn't have anything in the journal, and we know that
- * no filesystem updates are running, so it is safe to modify
- * the inode's in-core data-journaling state flag now.
- */
-
- if (val)
- EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
- else
- EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
- ext3_set_aops(inode);
-
- journal_unlock_updates(journal);
-
- /* Finally we can mark the inode as dirty. */
-
- handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- err = ext3_mark_inode_dirty(handle, inode);
- handle->h_sync = 1;
- ext3_journal_stop(handle);
- ext3_std_error(inode->i_sb, err);
-
- return err;
-}
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
deleted file mode 100644
index 4d96e9a..0000000
--- a/fs/ext3/ioctl.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * linux/fs/ext3/ioctl.c
- *
- * Copyright (C) 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#include <linux/mount.h>
-#include <linux/compat.h>
-#include <asm/uaccess.h>
-#include "ext3.h"
-
-long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
- struct inode *inode = file_inode(filp);
- struct ext3_inode_info *ei = EXT3_I(inode);
- unsigned int flags;
- unsigned short rsv_window_size;
-
- ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
-
- switch (cmd) {
- case EXT3_IOC_GETFLAGS:
- ext3_get_inode_flags(ei);
- flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case EXT3_IOC_SETFLAGS: {
- handle_t *handle = NULL;
- int err;
- struct ext3_iloc iloc;
- unsigned int oldflags;
- unsigned int jflag;
-
- if (!inode_owner_or_capable(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- flags = ext3_mask_flags(inode->i_mode, flags);
-
- mutex_lock(&inode->i_mutex);
-
- /* Is it quota file? Do not allow user to mess with it */
- err = -EPERM;
- if (IS_NOQUOTA(inode))
- goto flags_out;
-
- oldflags = ei->i_flags;
-
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT3_JOURNAL_DATA_FL;
-
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- *
- * This test looks nicer. Thanks to Pauline Middelink
- */
- if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- goto flags_out;
- }
-
- /*
- * The JOURNAL_DATA flag can only be changed by
- * the relevant capability.
- */
- if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE))
- goto flags_out;
- }
-
- handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto flags_out;
- }
- if (IS_SYNC(inode))
- handle->h_sync = 1;
- err = ext3_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto flags_err;
-
- flags = flags & EXT3_FL_USER_MODIFIABLE;
- flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
- ei->i_flags = flags;
-
- ext3_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
-
- err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-flags_err:
- ext3_journal_stop(handle);
- if (err)
- goto flags_out;
-
- if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
- err = ext3_change_inode_journal_flag(inode, jflag);
-flags_out:
- mutex_unlock(&inode->i_mutex);
- mnt_drop_write_file(filp);
- return err;
- }
- case EXT3_IOC_GETVERSION:
- case EXT3_IOC_GETVERSION_OLD:
- return put_user(inode->i_generation, (int __user *) arg);
- case EXT3_IOC_SETVERSION:
- case EXT3_IOC_SETVERSION_OLD: {
- handle_t *handle;
- struct ext3_iloc iloc;
- __u32 generation;
- int err;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
- if (get_user(generation, (int __user *) arg)) {
- err = -EFAULT;
- goto setversion_out;
- }
-
- mutex_lock(&inode->i_mutex);
- handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto unlock_out;
- }
- err = ext3_reserve_inode_write(handle, inode, &iloc);
- if (err == 0) {
- inode->i_ctime = CURRENT_TIME_SEC;
- inode->i_generation = generation;
- err = ext3_mark_iloc_dirty(handle, inode, &iloc);
- }
- ext3_journal_stop(handle);
-
-unlock_out:
- mutex_unlock(&inode->i_mutex);
-setversion_out:
- mnt_drop_write_file(filp);
- return err;
- }
- case EXT3_IOC_GETRSVSZ:
- if (test_opt(inode->i_sb, RESERVATION)
- && S_ISREG(inode->i_mode)
- && ei->i_block_alloc_info) {
- rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
- return put_user(rsv_window_size, (int __user *)arg);
- }
- return -ENOTTY;
- case EXT3_IOC_SETRSVSZ: {
- int err;
-
- if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
- return -ENOTTY;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (!inode_owner_or_capable(inode)) {
- err = -EACCES;
- goto setrsvsz_out;
- }
-
- if (get_user(rsv_window_size, (int __user *)arg)) {
- err = -EFAULT;
- goto setrsvsz_out;
- }
-
- if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
- rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
-
- /*
- * need to allocate reservation structure for this inode
- * before set the window size
- */
- mutex_lock(&ei->truncate_mutex);
- if (!ei->i_block_alloc_info)
- ext3_init_block_alloc_info(inode);
-
- if (ei->i_block_alloc_info){
- struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
- rsv->rsv_goal_size = rsv_window_size;
- }
- mutex_unlock(&ei->truncate_mutex);
-setrsvsz_out:
- mnt_drop_write_file(filp);
- return err;
- }
- case EXT3_IOC_GROUP_EXTEND: {
- ext3_fsblk_t n_blocks_count;
- struct super_block *sb = inode->i_sb;
- int err, err2;
-
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (get_user(n_blocks_count, (__u32 __user *)arg)) {
- err = -EFAULT;
- goto group_extend_out;
- }
- err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
- journal_lock_updates(EXT3_SB(sb)->s_journal);
- err2 = journal_flush(EXT3_SB(sb)->s_journal);
- journal_unlock_updates(EXT3_SB(sb)->s_journal);
- if (err == 0)
- err = err2;
-group_extend_out:
- mnt_drop_write_file(filp);
- return err;
- }
- case EXT3_IOC_GROUP_ADD: {
- struct ext3_new_group_data input;
- struct super_block *sb = inode->i_sb;
- int err, err2;
-
- if (!capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
- sizeof(input))) {
- err = -EFAULT;
- goto group_add_out;
- }
-
- err = ext3_group_add(sb, &input);
- journal_lock_updates(EXT3_SB(sb)->s_journal);
- err2 = journal_flush(EXT3_SB(sb)->s_journal);
- journal_unlock_updates(EXT3_SB(sb)->s_journal);
- if (err == 0)
- err = err2;
-group_add_out:
- mnt_drop_write_file(filp);
- return err;
- }
- case FITRIM: {
-
- struct super_block *sb = inode->i_sb;
- struct fstrim_range range;
- int ret = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&range, (struct fstrim_range __user *)arg,
- sizeof(range)))
- return -EFAULT;
-
- ret = ext3_trim_fs(sb, &range);
- if (ret < 0)
- return ret;
-
- if (copy_to_user((struct fstrim_range __user *)arg, &range,
- sizeof(range)))
- return -EFAULT;
-
- return 0;
- }
-
- default:
- return -ENOTTY;
- }
-}
-
-#ifdef CONFIG_COMPAT
-long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- /* These are just misnamed, they actually get/put from/to user an int */
- switch (cmd) {
- case EXT3_IOC32_GETFLAGS:
- cmd = EXT3_IOC_GETFLAGS;
- break;
- case EXT3_IOC32_SETFLAGS:
- cmd = EXT3_IOC_SETFLAGS;
- break;
- case EXT3_IOC32_GETVERSION:
- cmd = EXT3_IOC_GETVERSION;
- break;
- case EXT3_IOC32_SETVERSION:
- cmd = EXT3_IOC_SETVERSION;
- break;
- case EXT3_IOC32_GROUP_EXTEND:
- cmd = EXT3_IOC_GROUP_EXTEND;
- break;
- case EXT3_IOC32_GETVERSION_OLD:
- cmd = EXT3_IOC_GETVERSION_OLD;
- break;
- case EXT3_IOC32_SETVERSION_OLD:
- cmd = EXT3_IOC_SETVERSION_OLD;
- break;
-#ifdef CONFIG_JBD_DEBUG
- case EXT3_IOC32_WAIT_FOR_READONLY:
- cmd = EXT3_IOC_WAIT_FOR_READONLY;
- break;
-#endif
- case EXT3_IOC32_GETRSVSZ:
- cmd = EXT3_IOC_GETRSVSZ;
- break;
- case EXT3_IOC32_SETRSVSZ:
- cmd = EXT3_IOC_SETRSVSZ;
- break;
- case EXT3_IOC_GROUP_ADD:
- break;
- default:
- return -ENOIOCTLCMD;
- }
- return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
deleted file mode 100644
index c9e767c..0000000
--- a/fs/ext3/namei.c
+++ /dev/null
@@ -1,2586 +0,0 @@
-/*
- * linux/fs/ext3/namei.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/namei.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- * Directory entry file type support and forward compatibility hooks
- * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
- * Hash Tree Directory indexing (c)
- * Daniel Phillips, 2001
- * Hash Tree Directory indexing porting
- * Christopher Li, 2002
- * Hash Tree Directory indexing cleanup
- * Theodore Ts'o, 2002
- */
-
-#include <linux/quotaops.h>
-#include "ext3.h"
-#include "namei.h"
-#include "xattr.h"
-#include "acl.h"
-
-/*
- * define how far ahead to read directories while searching them.
- */
-#define NAMEI_RA_CHUNKS 2
-#define NAMEI_RA_BLOCKS 4
-#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-
-static struct buffer_head *ext3_append(handle_t *handle,
- struct inode *inode,
- u32 *block, int *err)
-{
- struct buffer_head *bh;
-
- *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
-
- if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
- inode->i_size += inode->i_sb->s_blocksize;
- EXT3_I(inode)->i_disksize = inode->i_size;
- *err = ext3_journal_get_write_access(handle, bh);
- if (*err) {
- brelse(bh);
- bh = NULL;
- }
- }
- return bh;
-}
-
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
-#ifdef DX_DEBUG
-#define dxtrace(command) command
-#else
-#define dxtrace(command)
-#endif
-
-struct fake_dirent
-{
- __le32 inode;
- __le16 rec_len;
- u8 name_len;
- u8 file_type;
-};
-
-struct dx_countlimit
-{
- __le16 limit;
- __le16 count;
-};
-
-struct dx_entry
-{
- __le32 hash;
- __le32 block;
-};
-
-/*
- * dx_root_info is laid out so that if it should somehow get overlaid by a
- * dirent the two low bits of the hash version will be zero. Therefore, the
- * hash version mod 4 should never be 0. Sincerely, the paranoia department.
- */
-
-struct dx_root
-{
- struct fake_dirent dot;
- char dot_name[4];
- struct fake_dirent dotdot;
- char dotdot_name[4];
- struct dx_root_info
- {
- __le32 reserved_zero;
- u8 hash_version;
- u8 info_length; /* 8 */
- u8 indirect_levels;
- u8 unused_flags;
- }
- info;
- struct dx_entry entries[0];
-};
-
-struct dx_node
-{
- struct fake_dirent fake;
- struct dx_entry entries[0];
-};
-
-
-struct dx_frame
-{
- struct buffer_head *bh;
- struct dx_entry *entries;
- struct dx_entry *at;
-};
-
-struct dx_map_entry
-{
- u32 hash;
- u16 offs;
- u16 size;
-};
-
-static inline unsigned dx_get_block (struct dx_entry *entry);
-static void dx_set_block (struct dx_entry *entry, unsigned value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct qstr *entry,
- struct inode *dir,
- struct dx_hash_info *hinfo,
- struct dx_frame *frame,
- int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo, struct dx_map_entry map[]);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
- struct dx_map_entry *offsets, int count);
-static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
-static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
- __u32 *start_hash);
-static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
- struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
- int *err);
-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode);
-
-/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext3_dir_entry_2 *
-ext3_next_entry(struct ext3_dir_entry_2 *p)
-{
- return (struct ext3_dir_entry_2 *)((char *)p +
- ext3_rec_len_from_disk(p->rec_len));
-}
-
-/*
- * Future: use high four bits of block for coalesce-on-delete flags
- * Mask them off for now.
- */
-
-static inline unsigned dx_get_block (struct dx_entry *entry)
-{
- return le32_to_cpu(entry->block) & 0x00ffffff;
-}
-
-static inline void dx_set_block (struct dx_entry *entry, unsigned value)
-{
- entry->block = cpu_to_le32(value);
-}
-
-static inline unsigned dx_get_hash (struct dx_entry *entry)
-{
- return le32_to_cpu(entry->hash);
-}
-
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
-{
- entry->hash = cpu_to_le32(value);
-}
-
-static inline unsigned dx_get_count (struct dx_entry *entries)
-{
- return le16_to_cpu(((struct dx_countlimit *) entries)->count);
-}
-
-static inline unsigned dx_get_limit (struct dx_entry *entries)
-{
- return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
-}
-
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
-{
- ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
-}
-
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
-{
- ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
-}
-
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
-{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
- EXT3_DIR_REC_LEN(2) - infosize;
- return entry_space / sizeof(struct dx_entry);
-}
-
-static inline unsigned dx_node_limit (struct inode *dir)
-{
- unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
- return entry_space / sizeof(struct dx_entry);
-}
-
-/*
- * Debug
- */
-#ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
-{
- int i, n = dx_get_count (entries);
- printk("%s index ", label);
- for (i = 0; i < n; i++)
- {
- printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
- }
- printk("\n");
-}
-
-struct stats
-{
- unsigned names;
- unsigned space;
- unsigned bcount;
-};
-
-static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
- int size, int show_names)
-{
- unsigned names = 0, space = 0;
- char *base = (char *) de;
- struct dx_hash_info h = *hinfo;
-
- printk("names: ");
- while ((char *) de < base + size)
- {
- if (de->inode)
- {
- if (show_names)
- {
- int len = de->name_len;
- char *name = de->name;
- while (len--) printk("%c", *name++);
- ext3fs_dirhash(de->name, de->name_len, &h);
- printk(":%x.%u ", h.hash,
- (unsigned) ((char *) de - base));
- }
- space += EXT3_DIR_REC_LEN(de->name_len);
- names++;
- }
- de = ext3_next_entry(de);
- }
- printk("(%i)\n", names);
- return (struct stats) { names, space, 1 };
-}
-
-struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
- struct dx_entry *entries, int levels)
-{
- unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count = dx_get_count (entries), names = 0, space = 0, i;
- unsigned bcount = 0;
- struct buffer_head *bh;
- int err;
- printk("%i indexed blocks...\n", count);
- for (i = 0; i < count; i++, entries++)
- {
- u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
- u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
- struct stats stats;
- printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
- if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
- stats = levels?
- dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
- dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
- names += stats.names;
- space += stats.space;
- bcount += stats.bcount;
- brelse (bh);
- }
- if (bcount)
- printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
- names, space/bcount,(space/bcount)*100/blocksize);
- return (struct stats) { names, space, bcount};
-}
-#endif /* DX_DEBUG */
-
-/*
- * Probe for a directory leaf block to search.
- *
- * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
- * error in the directory index, and the caller should fall back to
- * searching the directory normally. The callers of dx_probe **MUST**
- * check for this error code, and make sure it never gets reflected
- * back to userspace.
- */
-static struct dx_frame *
-dx_probe(struct qstr *entry, struct inode *dir,
- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
-{
- unsigned count, indirect;
- struct dx_entry *at, *entries, *p, *q, *m;
- struct dx_root *root;
- struct buffer_head *bh;
- struct dx_frame *frame = frame_in;
- u32 hash;
-
- frame->bh = NULL;
- if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
- root = (struct dx_root *) bh->b_data;
- if (root->info.hash_version != DX_HASH_TEA &&
- root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY) {
- ext3_warning(dir->i_sb, __func__,
- "Unrecognised inode hash code %d",
- root->info.hash_version);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
- hinfo->hash_version = root->info.hash_version;
- if (hinfo->hash_version <= DX_HASH_TEA)
- hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
- hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
- if (entry)
- ext3fs_dirhash(entry->name, entry->len, hinfo);
- hash = hinfo->hash;
-
- if (root->info.unused_flags & 1) {
- ext3_warning(dir->i_sb, __func__,
- "Unimplemented inode hash flags: %#06x",
- root->info.unused_flags);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
-
- if ((indirect = root->info.indirect_levels) > 1) {
- ext3_warning(dir->i_sb, __func__,
- "Unimplemented inode hash depth: %#06x",
- root->info.indirect_levels);
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
-
- entries = (struct dx_entry *) (((char *)&root->info) +
- root->info.info_length);
-
- if (dx_get_limit(entries) != dx_root_limit(dir,
- root->info.info_length)) {
- ext3_warning(dir->i_sb, __func__,
- "dx entry: limit != root limit");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail;
- }
-
- dxtrace (printk("Look up %x", hash));
- while (1)
- {
- count = dx_get_count(entries);
- if (!count || count > dx_get_limit(entries)) {
- ext3_warning(dir->i_sb, __func__,
- "dx entry: no count or count > limit");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail2;
- }
-
- p = entries + 1;
- q = entries + count - 1;
- while (p <= q)
- {
- m = p + (q - p)/2;
- dxtrace(printk("."));
- if (dx_get_hash(m) > hash)
- q = m - 1;
- else
- p = m + 1;
- }
-
- if (0) // linear search cross check
- {
- unsigned n = count - 1;
- at = entries;
- while (n--)
- {
- dxtrace(printk(","));
- if (dx_get_hash(++at) > hash)
- {
- at--;
- break;
- }
- }
- assert (at == p - 1);
- }
-
- at = p - 1;
- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
- frame->bh = bh;
- frame->entries = entries;
- frame->at = at;
- if (!indirect--) return frame;
- if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
- *err = ERR_BAD_DX_DIR;
- goto fail2;
- }
- at = entries = ((struct dx_node *) bh->b_data)->entries;
- if (dx_get_limit(entries) != dx_node_limit (dir)) {
- ext3_warning(dir->i_sb, __func__,
- "dx entry: limit != node limit");
- brelse(bh);
- *err = ERR_BAD_DX_DIR;
- goto fail2;
- }
- frame++;
- frame->bh = NULL;
- }
-fail2:
- while (frame >= frame_in) {
- brelse(frame->bh);
- frame--;
- }
-fail:
- if (*err == ERR_BAD_DX_DIR)
- ext3_warning(dir->i_sb, __func__,
- "Corrupt dir inode %ld, running e2fsck is "
- "recommended.", dir->i_ino);
- return NULL;
-}
-
-static void dx_release (struct dx_frame *frames)
-{
- if (frames[0].bh == NULL)
- return;
-
- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
- brelse(frames[1].bh);
- brelse(frames[0].bh);
-}
-
-/*
- * This function increments the frame pointer to search the next leaf
- * block, and reads in the necessary intervening nodes if the search
- * should be necessary. Whether or not the search is necessary is
- * controlled by the hash parameter. If the hash value is even, then
- * the search is only continued if the next block starts with that
- * hash value. This is used if we are searching for a specific file.
- *
- * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
- *
- * This function returns 1 if the caller should continue to search,
- * or 0 if it should not. If there is an error reading one of the
- * index blocks, it will a negative error code.
- *
- * If start_hash is non-null, it will be filled in with the starting
- * hash of the next page.
- */
-static int ext3_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
- __u32 *start_hash)
-{
- struct dx_frame *p;
- struct buffer_head *bh;
- int err, num_frames = 0;
- __u32 bhash;
-
- p = frame;
- /*
- * Find the next leaf page by incrementing the frame pointer.
- * If we run out of entries in the interior node, loop around and
- * increment pointer in the parent node. When we break out of
- * this loop, num_frames indicates the number of interior
- * nodes need to be read.
- */
- while (1) {
- if (++(p->at) < p->entries + dx_get_count(p->entries))
- break;
- if (p == frames)
- return 0;
- num_frames++;
- p--;
- }
-
- /*
- * If the hash is 1, then continue only if the next page has a
- * continuation hash of any value. This is used for readdir
- * handling. Otherwise, check to see if the hash matches the
- * desired contiuation hash. If it doesn't, return since
- * there's no point to read in the successive index pages.
- */
- bhash = dx_get_hash(p->at);
- if (start_hash)
- *start_hash = bhash;
- if ((hash & 1) == 0) {
- if ((bhash & ~1) != hash)
- return 0;
- }
- /*
- * If the hash is HASH_NB_ALWAYS, we always go to the next
- * block so no check is necessary
- */
- while (num_frames--) {
- if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
- 0, &err)))
- return err; /* Failure */
- p++;
- brelse (p->bh);
- p->bh = bh;
- p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
- }
- return 1;
-}
-
-
-/*
- * This function fills a red-black tree with information from a
- * directory block. It returns the number directory entries loaded
- * into the tree. If there is an error it is returned in err.
- */
-static int htree_dirblock_to_tree(struct file *dir_file,
- struct inode *dir, int block,
- struct dx_hash_info *hinfo,
- __u32 start_hash, __u32 start_minor_hash)
-{
- struct buffer_head *bh;
- struct ext3_dir_entry_2 *de, *top;
- int err = 0, count = 0;
-
- dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
-
- if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
- return err;
-
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- top = (struct ext3_dir_entry_2 *) ((char *) de +
- dir->i_sb->s_blocksize -
- EXT3_DIR_REC_LEN(0));
- for (; de < top; de = ext3_next_entry(de)) {
- if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
- (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
- +((char *)de - bh->b_data))) {
- /* silently ignore the rest of the block */
- break;
- }
- ext3fs_dirhash(de->name, de->name_len, hinfo);
- if ((hinfo->hash < start_hash) ||
- ((hinfo->hash == start_hash) &&
- (hinfo->minor_hash < start_minor_hash)))
- continue;
- if (de->inode == 0)
- continue;
- if ((err = ext3_htree_store_dirent(dir_file,
- hinfo->hash, hinfo->minor_hash, de)) != 0) {
- brelse(bh);
- return err;
- }
- count++;
- }
- brelse(bh);
- return count;
-}
-
-
-/*
- * This function fills a red-black tree with information from a
- * directory. We start scanning the directory in hash order, starting
- * at start_hash and start_minor_hash.
- *
- * This function returns the number of entries inserted into the tree,
- * or a negative error code.
- */
-int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
- __u32 start_minor_hash, __u32 *next_hash)
-{
- struct dx_hash_info hinfo;
- struct ext3_dir_entry_2 *de;
- struct dx_frame frames[2], *frame;
- struct inode *dir;
- int block, err;
- int count = 0;
- int ret;
- __u32 hashval;
-
- dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
- start_minor_hash));
- dir = file_inode(dir_file);
- if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
- hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version +=
- EXT3_SB(dir->i_sb)->s_hash_unsigned;
- hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
- count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
- start_hash, start_minor_hash);
- *next_hash = ~0;
- return count;
- }
- hinfo.hash = start_hash;
- hinfo.minor_hash = 0;
- frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
- if (!frame)
- return err;
-
- /* Add '.' and '..' from the htree header */
- if (!start_hash && !start_minor_hash) {
- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
- if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
- goto errout;
- count++;
- }
- if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
- de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
- de = ext3_next_entry(de);
- if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
- goto errout;
- count++;
- }
-
- while (1) {
- block = dx_get_block(frame->at);
- ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
- start_hash, start_minor_hash);
- if (ret < 0) {
- err = ret;
- goto errout;
- }
- count += ret;
- hashval = ~0;
- ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
- frame, frames, &hashval);
- *next_hash = hashval;
- if (ret < 0) {
- err = ret;
- goto errout;
- }
- /*
- * Stop if: (a) there are no more entries, or
- * (b) we have inserted at least one entry and the
- * next hash value is not a continuation
- */
- if ((ret == 0) ||
- (count && ((hashval & 1) == 0)))
- break;
- }
- dx_release(frames);
- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
- count, *next_hash));
- return count;
-errout:
- dx_release(frames);
- return (err);
-}
-
-
-/*
- * Directory block splitting, compacting
- */
-
-/*
- * Create map of hash values, offsets, and sizes, stored at end of block.
- * Returns number of entries mapped.
- */
-static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
-{
- int count = 0;
- char *base = (char *) de;
- struct dx_hash_info h = *hinfo;
-
- while ((char *) de < base + blocksize)
- {
- if (de->name_len && de->inode) {
- ext3fs_dirhash(de->name, de->name_len, &h);
- map_tail--;
- map_tail->hash = h.hash;
- map_tail->offs = (u16) ((char *) de - base);
- map_tail->size = le16_to_cpu(de->rec_len);
- count++;
- cond_resched();
- }
- /* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext3_next_entry(de);
- }
- return count;
-}
-
-/* Sort map by hash value */
-static void dx_sort_map (struct dx_map_entry *map, unsigned count)
-{
- struct dx_map_entry *p, *q, *top = map + count - 1;
- int more;
- /* Combsort until bubble sort doesn't suck */
- while (count > 2)
- {
- count = count*10/13;
- if (count - 9 < 2) /* 9, 10 -> 11 */
- count = 11;
- for (p = top, q = p - count; q >= map; p--, q--)
- if (p->hash < q->hash)
- swap(*p, *q);
- }
- /* Garden variety bubble sort */
- do {
- more = 0;
- q = top;
- while (q-- > map)
- {
- if (q[1].hash >= q[0].hash)
- continue;
- swap(*(q+1), *q);
- more = 1;
- }
- } while(more);
-}
-
-static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
-{
- struct dx_entry *entries = frame->entries;
- struct dx_entry *old = frame->at, *new = old + 1;
- int count = dx_get_count(entries);
-
- assert(count < dx_get_limit(entries));
- assert(old < entries + count);
- memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
- dx_set_hash(new, hash);
- dx_set_block(new, block);
- dx_set_count(entries, count + 1);
-}
-
-static void ext3_update_dx_flag(struct inode *inode)
-{
- if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT3_FEATURE_COMPAT_DIR_INDEX))
- EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
-}
-
-/*
- * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
- *
- * `len <= EXT3_NAME_LEN' is guaranteed by caller.
- * `de != NULL' is guaranteed by caller.
- */
-static inline int ext3_match (int len, const char * const name,
- struct ext3_dir_entry_2 * de)
-{
- if (len != de->name_len)
- return 0;
- if (!de->inode)
- return 0;
- return !memcmp(name, de->name, len);
-}
-
-/*
- * Returns 0 if not found, -1 on failure, and 1 on success
- */
-static inline int search_dirblock(struct buffer_head * bh,
- struct inode *dir,
- struct qstr *child,
- unsigned long offset,
- struct ext3_dir_entry_2 ** res_dir)
-{
- struct ext3_dir_entry_2 * de;
- char * dlimit;
- int de_len;
- const char *name = child->name;
- int namelen = child->len;
-
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- dlimit = bh->b_data + dir->i_sb->s_blocksize;
- while ((char *) de < dlimit) {
- /* this code is executed quadratically often */
- /* do minimal checking `by hand' */
-
- if ((char *) de + namelen <= dlimit &&
- ext3_match (namelen, name, de)) {
- /* found a match - just to be sure, do a full check */
- if (!ext3_check_dir_entry("ext3_find_entry",
- dir, de, bh, offset))
- return -1;
- *res_dir = de;
- return 1;
- }
- /* prevent looping on a bad block */
- de_len = ext3_rec_len_from_disk(de->rec_len);
- if (de_len <= 0)
- return -1;
- offset += de_len;
- de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
- }
- return 0;
-}
-
-
-/*
- * ext3_find_entry()
- *
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_dir). It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- *
- * The returned buffer_head has ->b_count elevated. The caller is expected
- * to brelse() it when appropriate.
- */
-static struct buffer_head *ext3_find_entry(struct inode *dir,
- struct qstr *entry,
- struct ext3_dir_entry_2 **res_dir)
-{
- struct super_block * sb;
- struct buffer_head * bh_use[NAMEI_RA_SIZE];
- struct buffer_head * bh, *ret = NULL;
- unsigned long start, block, b;
- const u8 *name = entry->name;
- int ra_max = 0; /* Number of bh's in the readahead
- buffer, bh_use[] */
- int ra_ptr = 0; /* Current index into readahead
- buffer */
- int num = 0;
- int nblocks, i, err;
- int namelen;
-
- *res_dir = NULL;
- sb = dir->i_sb;
- namelen = entry->len;
- if (namelen > EXT3_NAME_LEN)
- return NULL;
- if ((namelen <= 2) && (name[0] == '.') &&
- (name[1] == '.' || name[1] == 0)) {
- /*
- * "." or ".." will only be in the first block
- * NFS may look up ".."; "." should be handled by the VFS
- */
- block = start = 0;
- nblocks = 1;
- goto restart;
- }
- if (is_dx(dir)) {
- bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
- /*
- * On success, or if the error was file not found,
- * return. Otherwise, fall back to doing a search the
- * old fashioned way.
- */
- if (bh || (err != ERR_BAD_DX_DIR))
- return bh;
- dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
- }
- nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
- start = EXT3_I(dir)->i_dir_start_lookup;
- if (start >= nblocks)
- start = 0;
- block = start;
-restart:
- do {
- /*
- * We deal with the read-ahead logic here.
- */
- if (ra_ptr >= ra_max) {
- /* Refill the readahead buffer */
- ra_ptr = 0;
- b = block;
- for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
- /*
- * Terminate if we reach the end of the
- * directory and must wrap, or if our
- * search has finished at this block.
- */
- if (b >= nblocks || (num && block == start)) {
- bh_use[ra_max] = NULL;
- break;
- }
- num++;
- bh = ext3_getblk(NULL, dir, b++, 0, &err);
- bh_use[ra_max] = bh;
- if (bh && !bh_uptodate_or_lock(bh)) {
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(READ | REQ_META | REQ_PRIO,
- bh);
- }
- }
- }
- if ((bh = bh_use[ra_ptr++]) == NULL)
- goto next;
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- /* read error, skip block & hope for the best */
- ext3_error(sb, __func__, "reading directory #%lu "
- "offset %lu", dir->i_ino, block);
- brelse(bh);
- goto next;
- }
- i = search_dirblock(bh, dir, entry,
- block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
- if (i == 1) {
- EXT3_I(dir)->i_dir_start_lookup = block;
- ret = bh;
- goto cleanup_and_exit;
- } else {
- brelse(bh);
- if (i < 0)
- goto cleanup_and_exit;
- }
- next:
- if (++block >= nblocks)
- block = 0;
- } while (block != start);
-
- /*
- * If the directory has grown while we were searching, then
- * search the last part of the directory before giving up.
- */
- block = nblocks;
- nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
- if (block < nblocks) {
- start = 0;
- goto restart;
- }
-
-cleanup_and_exit:
- /* Clean up the read-ahead blocks */
- for (; ra_ptr < ra_max; ra_ptr++)
- brelse (bh_use[ra_ptr]);
- return ret;
-}
-
-static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
- struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
- int *err)
-{
- struct super_block *sb = dir->i_sb;
- struct dx_hash_info hinfo;
- struct dx_frame frames[2], *frame;
- struct buffer_head *bh;
- unsigned long block;
- int retval;
-
- if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
- return NULL;
- do {
- block = dx_get_block(frame->at);
- if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
- goto errout;
-
- retval = search_dirblock(bh, dir, entry,
- block << EXT3_BLOCK_SIZE_BITS(sb),
- res_dir);
- if (retval == 1) {
- dx_release(frames);
- return bh;
- }
- brelse(bh);
- if (retval == -1) {
- *err = ERR_BAD_DX_DIR;
- goto errout;
- }
-
- /* Check to see if we should continue to search */
- retval = ext3_htree_next_block(dir, hinfo.hash, frame,
- frames, NULL);
- if (retval < 0) {
- ext3_warning(sb, __func__,
- "error reading index page in directory #%lu",
- dir->i_ino);
- *err = retval;
- goto errout;
- }
- } while (retval == 1);
-
- *err = -ENOENT;
-errout:
- dxtrace(printk("%s not found\n", entry->name));
- dx_release (frames);
- return NULL;
-}
-
-static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
-{
- struct inode * inode;
- struct ext3_dir_entry_2 * de;
- struct buffer_head * bh;
-
- if (dentry->d_name.len > EXT3_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
- bh = ext3_find_entry(dir, &dentry->d_name, &de);
- inode = NULL;
- if (bh) {
- unsigned long ino = le32_to_cpu(de->inode);
- brelse (bh);
- if (!ext3_valid_inum(dir->i_sb, ino)) {
- ext3_error(dir->i_sb, "ext3_lookup",
- "bad inode number: %lu", ino);
- return ERR_PTR(-EIO);
- }
- inode = ext3_iget(dir->i_sb, ino);
- if (inode == ERR_PTR(-ESTALE)) {
- ext3_error(dir->i_sb, __func__,
- "deleted inode referenced: %lu",
- ino);
- return ERR_PTR(-EIO);
- }
- }
- return d_splice_alias(inode, dentry);
-}
-
-
-struct dentry *ext3_get_parent(struct dentry *child)
-{
- unsigned long ino;
- struct qstr dotdot = QSTR_INIT("..", 2);
- struct ext3_dir_entry_2 * de;
- struct buffer_head *bh;
-
- bh = ext3_find_entry(d_inode(child), &dotdot, &de);
- if (!bh)
- return ERR_PTR(-ENOENT);
- ino = le32_to_cpu(de->inode);
- brelse(bh);
-
- if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) {
- ext3_error(d_inode(child)->i_sb, "ext3_get_parent",
- "bad inode number: %lu", ino);
- return ERR_PTR(-EIO);
- }
-
- return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino));
-}
-
-#define S_SHIFT 12
-static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR,
- [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK,
-};
-
-static inline void ext3_set_de_type(struct super_block *sb,
- struct ext3_dir_entry_2 *de,
- umode_t mode) {
- if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
- de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
-/*
- * Move count entries from end of map between two memory locations.
- * Returns pointer to last entry moved.
- */
-static struct ext3_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
-{
- unsigned rec_len = 0;
-
- while (count--) {
- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
- rec_len = EXT3_DIR_REC_LEN(de->name_len);
- memcpy (to, de, rec_len);
- ((struct ext3_dir_entry_2 *) to)->rec_len =
- ext3_rec_len_to_disk(rec_len);
- de->inode = 0;
- map++;
- to += rec_len;
- }
- return (struct ext3_dir_entry_2 *) (to - rec_len);
-}
-
-/*
- * Compact each dir entry in the range to the minimal rec_len.
- * Returns pointer to last entry in range.
- */
-static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
-{
- struct ext3_dir_entry_2 *next, *to, *prev;
- struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
- unsigned rec_len = 0;
-
- prev = to = de;
- while ((char *)de < base + blocksize) {
- next = ext3_next_entry(de);
- if (de->inode && de->name_len) {
- rec_len = EXT3_DIR_REC_LEN(de->name_len);
- if (de > to)
- memmove(to, de, rec_len);
- to->rec_len = ext3_rec_len_to_disk(rec_len);
- prev = to;
- to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
- }
- de = next;
- }
- return prev;
-}
-
-/*
- * Split a full leaf block to make room for a new dir entry.
- * Allocate a new block, and move entries so that they are approx. equally full.
- * Returns pointer to de in block into which the new entry will be inserted.
- */
-static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
- struct buffer_head **bh,struct dx_frame *frame,
- struct dx_hash_info *hinfo, int *error)
-{
- unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
- struct buffer_head *bh2;
- u32 newblock;
- u32 hash2;
- struct dx_map_entry *map;
- char *data1 = (*bh)->b_data, *data2;
- unsigned split, move, size;
- struct ext3_dir_entry_2 *de = NULL, *de2;
- int err = 0, i;
-
- bh2 = ext3_append (handle, dir, &newblock, &err);
- if (!(bh2)) {
- brelse(*bh);
- *bh = NULL;
- goto errout;
- }
-
- BUFFER_TRACE(*bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, *bh);
- if (err)
- goto journal_error;
-
- BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, frame->bh);
- if (err)
- goto journal_error;
-
- data2 = bh2->b_data;
-
- /* create map in the end of data2 block */
- map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
- blocksize, hinfo, map);
- map -= count;
- dx_sort_map (map, count);
- /* Split the existing block in the middle, size-wise */
- size = 0;
- move = 0;
- for (i = count-1; i >= 0; i--) {
- /* is more than half of this entry in 2nd half of the block? */
- if (size + map[i].size/2 > blocksize/2)
- break;
- size += map[i].size;
- move++;
- }
- /* map index at which we will split */
- split = count - move;
- hash2 = map[split].hash;
- continued = hash2 == map[split - 1].hash;
- dxtrace(printk("Split block %i at %x, %i/%i\n",
- dx_get_block(frame->at), hash2, split, count-split));
-
- /* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split);
- de = dx_pack_dirents(data1,blocksize);
- de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
- de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
- dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
-
- /* Which block gets the new entry? */
- if (hinfo->hash >= hash2)
- {
- swap(*bh, bh2);
- de = de2;
- }
- dx_insert_block (frame, hash2 + continued, newblock);
- err = ext3_journal_dirty_metadata (handle, bh2);
- if (err)
- goto journal_error;
- err = ext3_journal_dirty_metadata (handle, frame->bh);
- if (err)
- goto journal_error;
- brelse (bh2);
- dxtrace(dx_show_index ("frame", frame->entries));
- return de;
-
-journal_error:
- brelse(*bh);
- brelse(bh2);
- *bh = NULL;
- ext3_std_error(dir->i_sb, err);
-errout:
- *error = err;
- return NULL;
-}
-
-
-/*
- * Add a new entry into a directory (leaf) block. If de is non-NULL,
- * it points to a directory entry which is guaranteed to be large
- * enough for new directory entry. If de is NULL, then
- * add_dirent_to_buf will attempt search the directory block for
- * space. It will return -ENOSPC if no space is available, and -EIO
- * and -EEXIST if directory entry already exists.
- *
- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
- * all other cases bh is released.
- */
-static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
- struct inode *inode, struct ext3_dir_entry_2 *de,
- struct buffer_head * bh)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- unsigned long offset = 0;
- unsigned short reclen;
- int nlen, rlen, err;
- char *top;
-
- reclen = EXT3_DIR_REC_LEN(namelen);
- if (!de) {
- de = (struct ext3_dir_entry_2 *)bh->b_data;
- top = bh->b_data + dir->i_sb->s_blocksize - reclen;
- while ((char *) de <= top) {
- if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
- bh, offset)) {
- brelse (bh);
- return -EIO;
- }
- if (ext3_match (namelen, name, de)) {
- brelse (bh);
- return -EEXIST;
- }
- nlen = EXT3_DIR_REC_LEN(de->name_len);
- rlen = ext3_rec_len_from_disk(de->rec_len);
- if ((de->inode? rlen - nlen: rlen) >= reclen)
- break;
- de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
- offset += rlen;
- }
- if ((char *) de > top)
- return -ENOSPC;
- }
- BUFFER_TRACE(bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, bh);
- if (err) {
- ext3_std_error(dir->i_sb, err);
- brelse(bh);
- return err;
- }
-
- /* By now the buffer is marked for journaling */
- nlen = EXT3_DIR_REC_LEN(de->name_len);
- rlen = ext3_rec_len_from_disk(de->rec_len);
- if (de->inode) {
- struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
- de->rec_len = ext3_rec_len_to_disk(nlen);
- de = de1;
- }
- de->file_type = EXT3_FT_UNKNOWN;
- if (inode) {
- de->inode = cpu_to_le32(inode->i_ino);
- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
- } else
- de->inode = 0;
- de->name_len = namelen;
- memcpy (de->name, name, namelen);
- /*
- * XXX shouldn't update any times until successful
- * completion of syscall, but too many callers depend
- * on this.
- *
- * XXX similarly, too many callers depend on
- * ext3_new_inode() setting the times, but error
- * recovery deletes the inode, so the worst that can
- * happen is that the times are slightly out of date
- * and/or different from the directory change time.
- */
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
- ext3_update_dx_flag(dir);
- dir->i_version++;
- ext3_mark_inode_dirty(handle, dir);
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh);
- if (err)
- ext3_std_error(dir->i_sb, err);
- brelse(bh);
- return 0;
-}
-
-/*
- * This converts a one block unindexed directory to a 3 block indexed
- * directory, and adds the dentry to the indexed directory.
- */
-static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
- struct inode *inode, struct buffer_head *bh)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct buffer_head *bh2;
- struct dx_root *root;
- struct dx_frame frames[2], *frame;
- struct dx_entry *entries;
- struct ext3_dir_entry_2 *de, *de2;
- char *data1, *top;
- unsigned len;
- int retval;
- unsigned blocksize;
- struct dx_hash_info hinfo;
- u32 block;
- struct fake_dirent *fde;
-
- blocksize = dir->i_sb->s_blocksize;
- dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
- retval = ext3_journal_get_write_access(handle, bh);
- if (retval) {
- ext3_std_error(dir->i_sb, retval);
- brelse(bh);
- return retval;
- }
- root = (struct dx_root *) bh->b_data;
-
- /* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext3_dir_entry_2 *)((char *)fde +
- ext3_rec_len_from_disk(fde->rec_len));
- if ((char *) de >= (((char *) root) + blocksize)) {
- ext3_error(dir->i_sb, __func__,
- "invalid rec_len for '..' in inode %lu",
- dir->i_ino);
- brelse(bh);
- return -EIO;
- }
- len = ((char *) root) + blocksize - (char *) de;
-
- bh2 = ext3_append (handle, dir, &block, &retval);
- if (!(bh2)) {
- brelse(bh);
- return retval;
- }
- EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
- data1 = bh2->b_data;
-
- memcpy (data1, de, len);
- de = (struct ext3_dir_entry_2 *) data1;
- top = data1 + len;
- while ((char *)(de2 = ext3_next_entry(de)) < top)
- de = de2;
- de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
- /* Initialize the root; the dot dirents already exist */
- de = (struct ext3_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
- memset (&root->info, 0, sizeof(root->info));
- root->info.info_length = sizeof(root->info);
- root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
- entries = root->entries;
- dx_set_block (entries, 1);
- dx_set_count (entries, 1);
- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
-
- /* Initialize as for dx_probe */
- hinfo.hash_version = root->info.hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
- hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
- ext3fs_dirhash(name, namelen, &hinfo);
- frame = frames;
- frame->entries = entries;
- frame->at = entries;
- frame->bh = bh;
- bh = bh2;
- /*
- * Mark buffers dirty here so that if do_split() fails we write a
- * consistent set of buffers to disk.
- */
- ext3_journal_dirty_metadata(handle, frame->bh);
- ext3_journal_dirty_metadata(handle, bh);
- de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- if (!de) {
- ext3_mark_inode_dirty(handle, dir);
- dx_release(frames);
- return retval;
- }
- dx_release(frames);
-
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
-}
-
-/*
- * ext3_add_entry()
- *
- * adds a file entry to the specified directory, using the same
- * semantics as ext3_find_entry(). It returns NULL if it failed.
- *
- * NOTE!! The inode part of 'de' is left at 0 - which means you
- * may not sleep between calling this and putting something into
- * the entry, as someone else might have used it while you slept.
- */
-static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
- struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- struct buffer_head * bh;
- struct ext3_dir_entry_2 *de;
- struct super_block * sb;
- int retval;
- int dx_fallback=0;
- unsigned blocksize;
- u32 block, blocks;
-
- sb = dir->i_sb;
- blocksize = sb->s_blocksize;
- if (!dentry->d_name.len)
- return -EINVAL;
- if (is_dx(dir)) {
- retval = ext3_dx_add_entry(handle, dentry, inode);
- if (!retval || (retval != ERR_BAD_DX_DIR))
- return retval;
- EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
- dx_fallback++;
- ext3_mark_inode_dirty(handle, dir);
- }
- blocks = dir->i_size >> sb->s_blocksize_bits;
- for (block = 0; block < blocks; block++) {
- if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
- return retval;
-
- retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC)
- return retval;
-
- if (blocks == 1 && !dx_fallback &&
- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
- return make_indexed_dir(handle, dentry, inode, bh);
- brelse(bh);
- }
- bh = ext3_append(handle, dir, &block, &retval);
- if (!bh)
- return retval;
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- de->inode = 0;
- de->rec_len = ext3_rec_len_to_disk(blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
-}
-
-/*
- * Returns 0 for success, or a negative error value
- */
-static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
-{
- struct dx_frame frames[2], *frame;
- struct dx_entry *entries, *at;
- struct dx_hash_info hinfo;
- struct buffer_head * bh;
- struct inode *dir = d_inode(dentry->d_parent);
- struct super_block * sb = dir->i_sb;
- struct ext3_dir_entry_2 *de;
- int err;
-
- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
- if (!frame)
- return err;
- entries = frame->entries;
- at = frame->at;
-
- if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
- goto cleanup;
-
- BUFFER_TRACE(bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, bh);
- if (err)
- goto journal_error;
-
- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC) {
- bh = NULL;
- goto cleanup;
- }
-
- /* Block full, should compress but for now just split */
- dxtrace(printk("using %u of %u node entries\n",
- dx_get_count(entries), dx_get_limit(entries)));
- /* Need to split index? */
- if (dx_get_count(entries) == dx_get_limit(entries)) {
- u32 newblock;
- unsigned icount = dx_get_count(entries);
- int levels = frame - frames;
- struct dx_entry *entries2;
- struct dx_node *node2;
- struct buffer_head *bh2;
-
- if (levels && (dx_get_count(frames->entries) ==
- dx_get_limit(frames->entries))) {
- ext3_warning(sb, __func__,
- "Directory index full!");
- err = -ENOSPC;
- goto cleanup;
- }
- bh2 = ext3_append (handle, dir, &newblock, &err);
- if (!(bh2))
- goto cleanup;
- node2 = (struct dx_node *)(bh2->b_data);
- entries2 = node2->entries;
- memset(&node2->fake, 0, sizeof(struct fake_dirent));
- node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
- BUFFER_TRACE(frame->bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, frame->bh);
- if (err)
- goto journal_error;
- if (levels) {
- unsigned icount1 = icount/2, icount2 = icount - icount1;
- unsigned hash2 = dx_get_hash(entries + icount1);
- dxtrace(printk("Split index %i/%i\n", icount1, icount2));
-
- BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
- err = ext3_journal_get_write_access(handle,
- frames[0].bh);
- if (err)
- goto journal_error;
-
- memcpy ((char *) entries2, (char *) (entries + icount1),
- icount2 * sizeof(struct dx_entry));
- dx_set_count (entries, icount1);
- dx_set_count (entries2, icount2);
- dx_set_limit (entries2, dx_node_limit(dir));
-
- /* Which index block gets the new entry? */
- if (at - entries >= icount1) {
- frame->at = at = at - entries - icount1 + entries2;
- frame->entries = entries = entries2;
- swap(frame->bh, bh2);
- }
- dx_insert_block (frames + 0, hash2, newblock);
- dxtrace(dx_show_index ("node", frames[1].entries));
- dxtrace(dx_show_index ("node",
- ((struct dx_node *) bh2->b_data)->entries));
- err = ext3_journal_dirty_metadata(handle, bh2);
- if (err)
- goto journal_error;
- brelse (bh2);
- } else {
- dxtrace(printk("Creating second level index...\n"));
- memcpy((char *) entries2, (char *) entries,
- icount * sizeof(struct dx_entry));
- dx_set_limit(entries2, dx_node_limit(dir));
-
- /* Set up root */
- dx_set_count(entries, 1);
- dx_set_block(entries + 0, newblock);
- ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
-
- /* Add new access path frame */
- frame = frames + 1;
- frame->at = at = at - entries + entries2;
- frame->entries = entries = entries2;
- frame->bh = bh2;
- err = ext3_journal_get_write_access(handle,
- frame->bh);
- if (err)
- goto journal_error;
- }
- err = ext3_journal_dirty_metadata(handle, frames[0].bh);
- if (err)
- goto journal_error;
- }
- de = do_split(handle, dir, &bh, frame, &hinfo, &err);
- if (!de)
- goto cleanup;
- err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
- goto cleanup;
-
-journal_error:
- ext3_std_error(dir->i_sb, err);
-cleanup:
- if (bh)
- brelse(bh);
- dx_release(frames);
- return err;
-}
-
-/*
- * ext3_delete_entry deletes a directory entry by merging it with the
- * previous entry
- */
-static int ext3_delete_entry (handle_t *handle,
- struct inode * dir,
- struct ext3_dir_entry_2 * de_del,
- struct buffer_head * bh)
-{
- struct ext3_dir_entry_2 * de, * pde;
- int i;
-
- i = 0;
- pde = NULL;
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- while (i < bh->b_size) {
- if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
- return -EIO;
- if (de == de_del) {
- int err;
-
- BUFFER_TRACE(bh, "get_write_access");
- err = ext3_journal_get_write_access(handle, bh);
- if (err)
- goto journal_error;
-
- if (pde)
- pde->rec_len = ext3_rec_len_to_disk(
- ext3_rec_len_from_disk(pde->rec_len) +
- ext3_rec_len_from_disk(de->rec_len));
- else
- de->inode = 0;
- dir->i_version++;
- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, bh);
- if (err) {
-journal_error:
- ext3_std_error(dir->i_sb, err);
- return err;
- }
- return 0;
- }
- i += ext3_rec_len_from_disk(de->rec_len);
- pde = de;
- de = ext3_next_entry(de);
- }
- return -ENOENT;
-}
-
-static int ext3_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
-{
- int err = ext3_add_entry(handle, dentry, inode);
- if (!err) {
- ext3_mark_inode_dirty(handle, inode);
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
- return 0;
- }
- drop_nlink(inode);
- unlock_new_inode(inode);
- iput(inode);
- return err;
-}
-
-/*
- * By the time this is called, we already have created
- * the directory cache entry for the new file, but it
- * is so far negative - it has no inode.
- *
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
- */
-static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
- bool excl)
-{
- handle_t *handle;
- struct inode * inode;
- int err, retries = 0;
-
- dquot_initialize(dir);
-
-retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ext3_file_inode_operations;
- inode->i_fop = &ext3_file_operations;
- ext3_set_aops(inode);
- err = ext3_add_nondir(handle, dentry, inode);
- }
- ext3_journal_stop(handle);
- if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
-}
-
-static int ext3_mknod (struct inode * dir, struct dentry *dentry,
- umode_t mode, dev_t rdev)
-{
- handle_t *handle;
- struct inode *inode;
- int err, retries = 0;
-
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- dquot_initialize(dir);
-
-retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT3_FS_XATTR
- inode->i_op = &ext3_special_inode_operations;
-#endif
- err = ext3_add_nondir(handle, dentry, inode);
- }
- ext3_journal_stop(handle);
- if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
-}
-
-static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- handle_t *handle;
- struct inode *inode;
- int err, retries = 0;
-
- dquot_initialize(dir);
-
-retry:
- handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
- 4 + EXT3_XATTR_TRANS_BLOCKS);
-
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- inode = ext3_new_inode (handle, dir, NULL, mode);
- err = PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ext3_file_inode_operations;
- inode->i_fop = &ext3_file_operations;
- ext3_set_aops(inode);
- d_tmpfile(dentry, inode);
- err = ext3_orphan_add(handle, inode);
- if (err)
- goto err_unlock_inode;
- mark_inode_dirty(inode);
- unlock_new_inode(inode);
- }
- ext3_journal_stop(handle);
- if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
-err_unlock_inode:
- ext3_journal_stop(handle);
- unlock_new_inode(inode);
- return err;
-}
-
-static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
-{
- handle_t *handle;
- struct inode * inode;
- struct buffer_head * dir_block = NULL;
- struct ext3_dir_entry_2 * de;
- int err, retries = 0;
-
- if (dir->i_nlink >= EXT3_LINK_MAX)
- return -EMLINK;
-
- dquot_initialize(dir);
-
-retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
-
- inode->i_op = &ext3_dir_inode_operations;
- inode->i_fop = &ext3_dir_operations;
- inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
- goto out_clear_inode;
-
- BUFFER_TRACE(dir_block, "get_write_access");
- err = ext3_journal_get_write_access(handle, dir_block);
- if (err)
- goto out_clear_inode;
-
- de = (struct ext3_dir_entry_2 *) dir_block->b_data;
- de->inode = cpu_to_le32(inode->i_ino);
- de->name_len = 1;
- de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
- strcpy (de->name, ".");
- ext3_set_de_type(dir->i_sb, de, S_IFDIR);
- de = ext3_next_entry(de);
- de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
- EXT3_DIR_REC_LEN(1));
- de->name_len = 2;
- strcpy (de->name, "..");
- ext3_set_de_type(dir->i_sb, de, S_IFDIR);
- set_nlink(inode, 2);
- BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
- err = ext3_journal_dirty_metadata(handle, dir_block);
- if (err)
- goto out_clear_inode;
-
- err = ext3_mark_inode_dirty(handle, inode);
- if (!err)
- err = ext3_add_entry (handle, dentry, inode);
-
- if (err) {
-out_clear_inode:
- clear_nlink(inode);
- unlock_new_inode(inode);
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
- }
- inc_nlink(dir);
- ext3_update_dx_flag(dir);
- err = ext3_mark_inode_dirty(handle, dir);
- if (err)
- goto out_clear_inode;
-
- unlock_new_inode(inode);
- d_instantiate(dentry, inode);
-out_stop:
- brelse(dir_block);
- ext3_journal_stop(handle);
- if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
-}
-
-/*
- * routine to check that the specified directory is empty (for rmdir)
- */
-static int empty_dir (struct inode * inode)
-{
- unsigned long offset;
- struct buffer_head * bh;
- struct ext3_dir_entry_2 * de, * de1;
- struct super_block * sb;
- int err = 0;
-
- sb = inode->i_sb;
- if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
- !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
- if (err)
- ext3_error(inode->i_sb, __func__,
- "error %d reading directory #%lu offset 0",
- err, inode->i_ino);
- else
- ext3_warning(inode->i_sb, __func__,
- "bad directory (dir #%lu) - no data block",
- inode->i_ino);
- return 1;
- }
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- de1 = ext3_next_entry(de);
- if (le32_to_cpu(de->inode) != inode->i_ino ||
- !le32_to_cpu(de1->inode) ||
- strcmp (".", de->name) ||
- strcmp ("..", de1->name)) {
- ext3_warning (inode->i_sb, "empty_dir",
- "bad directory (dir #%lu) - no `.' or `..'",
- inode->i_ino);
- brelse (bh);
- return 1;
- }
- offset = ext3_rec_len_from_disk(de->rec_len) +
- ext3_rec_len_from_disk(de1->rec_len);
- de = ext3_next_entry(de1);
- while (offset < inode->i_size ) {
- if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
- err = 0;
- brelse (bh);
- if (!(bh = ext3_dir_bread (NULL, inode,
- offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
- if (err)
- ext3_error(sb, __func__,
- "error %d reading directory"
- " #%lu offset %lu",
- err, inode->i_ino, offset);
- offset += sb->s_blocksize;
- continue;
- }
- de = (struct ext3_dir_entry_2 *) bh->b_data;
- }
- if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
- de = (struct ext3_dir_entry_2 *)(bh->b_data +
- sb->s_blocksize);
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
- brelse (bh);
- return 0;
- }
- offset += ext3_rec_len_from_disk(de->rec_len);
- de = ext3_next_entry(de);
- }
- brelse (bh);
- return 1;
-}
-
-/* ext3_orphan_add() links an unlinked or truncated inode into a list of
- * such inodes, starting at the superblock, in case we crash before the
- * file is closed/deleted, or in case the inode truncate spans multiple
- * transactions and the last transaction is not recovered after a crash.
- *
- * At filesystem recovery time, we walk this list deleting unlinked
- * inodes and truncating linked inodes in ext3_orphan_cleanup().
- */
-int ext3_orphan_add(handle_t *handle, struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct ext3_iloc iloc;
- int err = 0, rc;
-
- mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
- if (!list_empty(&EXT3_I(inode)->i_orphan))
- goto out_unlock;
-
- /* Orphan handling is only valid for files with data blocks
- * being truncated, or files being unlinked. */
-
- /* @@@ FIXME: Observation from aviro:
- * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
- * here (on s_orphan_lock), so race with ext3_link() which might bump
- * ->i_nlink. For, say it, character device. Not a regular file,
- * not a directory, not a symlink and ->i_nlink > 0.
- *
- * tytso, 4/25/2009: I'm not sure how that could happen;
- * shouldn't the fs core protect us from these sort of
- * unlink()/link() races?
- */
- J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
-
- BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
- err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
- if (err)
- goto out_unlock;
-
- err = ext3_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out_unlock;
-
- /* Insert this inode at the head of the on-disk orphan list... */
- NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
- EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
-
- /* Only add to the head of the in-memory list if all the
- * previous operations succeeded. If the orphan_add is going to
- * fail (possibly taking the journal offline), we can't risk
- * leaving the inode on the orphan list: stray orphan-list
- * entries can cause panics at unmount time.
- *
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
- if (!err)
- list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-
- jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
- jbd_debug(4, "orphan inode %lu will point to %d\n",
- inode->i_ino, NEXT_ORPHAN(inode));
-out_unlock:
- mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
- ext3_std_error(inode->i_sb, err);
- return err;
-}
-
-/*
- * ext3_orphan_del() removes an unlinked or truncated inode from the list
- * of such inodes stored on disk, because it is finally being cleaned up.
- */
-int ext3_orphan_del(handle_t *handle, struct inode *inode)
-{
- struct list_head *prev;
- struct ext3_inode_info *ei = EXT3_I(inode);
- struct ext3_sb_info *sbi;
- unsigned long ino_next;
- struct ext3_iloc iloc;
- int err = 0;
-
- mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
- if (list_empty(&ei->i_orphan))
- goto out;
-
- ino_next = NEXT_ORPHAN(inode);
- prev = ei->i_orphan.prev;
- sbi = EXT3_SB(inode->i_sb);
-
- jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
-
- list_del_init(&ei->i_orphan);
-
- /* If we're on an error path, we may not have a valid
- * transaction handle with which to update the orphan list on
- * disk, but we still need to remove the inode from the linked
- * list in memory. */
- if (!handle)
- goto out;
-
- err = ext3_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto out_err;
-
- if (prev == &sbi->s_orphan) {
- jbd_debug(4, "superblock will point to %lu\n", ino_next);
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext3_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto out_brelse;
- sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
- } else {
- struct ext3_iloc iloc2;
- struct inode *i_prev =
- &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
-
- jbd_debug(4, "orphan inode %lu will point to %lu\n",
- i_prev->i_ino, ino_next);
- err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
- if (err)
- goto out_brelse;
- NEXT_ORPHAN(i_prev) = ino_next;
- err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
- }
- if (err)
- goto out_brelse;
- NEXT_ORPHAN(inode) = 0;
- err = ext3_mark_iloc_dirty(handle, inode, &iloc);
-
-out_err:
- ext3_std_error(inode->i_sb, err);
-out:
- mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
- return err;
-
-out_brelse:
- brelse(iloc.bh);
- goto out_err;
-}
-
-static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
-{
- int retval;
- struct inode * inode;
- struct buffer_head * bh;
- struct ext3_dir_entry_2 * de;
- handle_t *handle;
-
- /* Initialize quotas before so that eventual writes go in
- * separate transaction */
- dquot_initialize(dir);
- dquot_initialize(d_inode(dentry));
-
- handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- retval = -ENOENT;
- bh = ext3_find_entry(dir, &dentry->d_name, &de);
- if (!bh)
- goto end_rmdir;
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- inode = d_inode(dentry);
-
- retval = -EIO;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_rmdir;
-
- retval = -ENOTEMPTY;
- if (!empty_dir (inode))
- goto end_rmdir;
-
- retval = ext3_delete_entry(handle, dir, de, bh);
- if (retval)
- goto end_rmdir;
- if (inode->i_nlink != 2)
- ext3_warning (inode->i_sb, "ext3_rmdir",
- "empty directory has nlink!=2 (%d)",
- inode->i_nlink);
- inode->i_version++;
- clear_nlink(inode);
- /* There's no need to set i_disksize: the fact that i_nlink is
- * zero will ensure that the right thing happens during any
- * recovery. */
- inode->i_size = 0;
- ext3_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, inode);
- drop_nlink(dir);
- ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
-
-end_rmdir:
- ext3_journal_stop(handle);
- brelse (bh);
- return retval;
-}
-
-static int ext3_unlink(struct inode * dir, struct dentry *dentry)
-{
- int retval;
- struct inode * inode;
- struct buffer_head * bh;
- struct ext3_dir_entry_2 * de;
- handle_t *handle;
-
- trace_ext3_unlink_enter(dir, dentry);
- /* Initialize quotas before so that eventual writes go
- * in separate transaction */
- dquot_initialize(dir);
- dquot_initialize(d_inode(dentry));
-
- handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- retval = -ENOENT;
- bh = ext3_find_entry(dir, &dentry->d_name, &de);
- if (!bh)
- goto end_unlink;
-
- inode = d_inode(dentry);
-
- retval = -EIO;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_unlink;
-
- if (!inode->i_nlink) {
- ext3_warning (inode->i_sb, "ext3_unlink",
- "Deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
- set_nlink(inode, 1);
- }
- retval = ext3_delete_entry(handle, dir, de, bh);
- if (retval)
- goto end_unlink;
- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
- ext3_update_dx_flag(dir);
- ext3_mark_inode_dirty(handle, dir);
- drop_nlink(inode);
- if (!inode->i_nlink)
- ext3_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime;
- ext3_mark_inode_dirty(handle, inode);
- retval = 0;
-
-end_unlink:
- ext3_journal_stop(handle);
- brelse (bh);
- trace_ext3_unlink_exit(dentry, retval);
- return retval;
-}
-
-static int ext3_symlink (struct inode * dir,
- struct dentry *dentry, const char * symname)
-{
- handle_t *handle;
- struct inode * inode;
- int l, err, retries = 0;
- int credits;
-
- l = strlen(symname)+1;
- if (l > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
-
- dquot_initialize(dir);
-
- if (l > EXT3_N_BLOCKS * 4) {
- /*
- * For non-fast symlinks, we just allocate inode and put it on
- * orphan list in the first transaction => we need bitmap,
- * group descriptor, sb, inode block, quota blocks, and
- * possibly selinux xattr blocks.
- */
- credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
- EXT3_XATTR_TRANS_BLOCKS;
- } else {
- /*
- * Fast symlink. We have to add entry to directory
- * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
- * allocate new inode (bitmap, group descriptor, inode block,
- * quota blocks, sb is already counted in previous macros).
- */
- credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
- }
-retry:
- handle = ext3_journal_start(dir, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
-
- if (l > EXT3_N_BLOCKS * 4) {
- inode->i_op = &ext3_symlink_inode_operations;
- ext3_set_aops(inode);
- /*
- * We cannot call page_symlink() with transaction started
- * because it calls into ext3_write_begin() which acquires page
- * lock which ranks below transaction start (and it can also
- * wait for journal commit if we are running out of space). So
- * we have to stop transaction now and restart it when symlink
- * contents is written.
- *
- * To keep fs consistent in case of crash, we have to put inode
- * to orphan list in the mean time.
- */
- drop_nlink(inode);
- err = ext3_orphan_add(handle, inode);
- ext3_journal_stop(handle);
- if (err)
- goto err_drop_inode;
- err = __page_symlink(inode, symname, l, 1);
- if (err)
- goto err_drop_inode;
- /*
- * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
- * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
- */
- handle = ext3_journal_start(dir,
- EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto err_drop_inode;
- }
- set_nlink(inode, 1);
- err = ext3_orphan_del(handle, inode);
- if (err) {
- ext3_journal_stop(handle);
- drop_nlink(inode);
- goto err_drop_inode;
- }
- } else {
- inode->i_op = &ext3_fast_symlink_inode_operations;
- inode->i_link = (char*)&EXT3_I(inode)->i_data;
- memcpy(inode->i_link, symname, l);
- inode->i_size = l-1;
- }
- EXT3_I(inode)->i_disksize = inode->i_size;
- err = ext3_add_nondir(handle, dentry, inode);
-out_stop:
- ext3_journal_stop(handle);
- if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
-err_drop_inode:
- unlock_new_inode(inode);
- iput(inode);
- return err;
-}
-
-static int ext3_link (struct dentry * old_dentry,
- struct inode * dir, struct dentry *dentry)
-{
- handle_t *handle;
- struct inode *inode = d_inode(old_dentry);
- int err, retries = 0;
-
- if (inode->i_nlink >= EXT3_LINK_MAX)
- return -EMLINK;
-
- dquot_initialize(dir);
-
-retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- handle->h_sync = 1;
-
- inode->i_ctime = CURRENT_TIME_SEC;
- inc_nlink(inode);
- ihold(inode);
-
- err = ext3_add_entry(handle, dentry, inode);
- if (!err) {
- ext3_mark_inode_dirty(handle, inode);
- /* this can happen only for tmpfile being
- * linked the first time
- */
- if (inode->i_nlink == 1)
- ext3_orphan_del(handle, inode);
- d_instantiate(dentry, inode);
- } else {
- drop_nlink(inode);
- iput(inode);
- }
- ext3_journal_stop(handle);
- if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
-}
-
-#define PARENT_INO(buffer) \
- (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
-
-/*
- * Anybody can rename anything with this: the permission checks are left to the
- * higher-level routines.
- */
-static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
- struct inode * new_dir,struct dentry *new_dentry)
-{
- handle_t *handle;
- struct inode * old_inode, * new_inode;
- struct buffer_head * old_bh, * new_bh, * dir_bh;
- struct ext3_dir_entry_2 * old_de, * new_de;
- int retval, flush_file = 0;
-
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
-
- old_bh = new_bh = dir_bh = NULL;
-
- /* Initialize quotas before so that eventual writes go
- * in separate transaction */
- if (d_really_is_positive(new_dentry))
- dquot_initialize(d_inode(new_dentry));
- handle = ext3_journal_start(old_dir, 2 *
- EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- handle->h_sync = 1;
-
- old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
- /*
- * Check for inode number is _not_ due to possible IO errors.
- * We might rmdir the source, keep it as pwd of some process
- * and merrily kill the link to whatever was created under the
- * same name. Goodbye sticky bit ;-<
- */
- old_inode = d_inode(old_dentry);
- retval = -ENOENT;
- if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
- goto end_rename;
-
- new_inode = d_inode(new_dentry);
- new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
- if (new_bh) {
- if (!new_inode) {
- brelse (new_bh);
- new_bh = NULL;
- }
- }
- if (S_ISDIR(old_inode->i_mode)) {
- if (new_inode) {
- retval = -ENOTEMPTY;
- if (!empty_dir (new_inode))
- goto end_rename;
- }
- retval = -EIO;
- dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
- if (!dir_bh)
- goto end_rename;
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
- goto end_rename;
- retval = -EMLINK;
- if (!new_inode && new_dir!=old_dir &&
- new_dir->i_nlink >= EXT3_LINK_MAX)
- goto end_rename;
- }
- if (!new_bh) {
- retval = ext3_add_entry (handle, new_dentry, old_inode);
- if (retval)
- goto end_rename;
- } else {
- BUFFER_TRACE(new_bh, "get write access");
- retval = ext3_journal_get_write_access(handle, new_bh);
- if (retval)
- goto journal_error;
- new_de->inode = cpu_to_le32(old_inode->i_ino);
- if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
- EXT3_FEATURE_INCOMPAT_FILETYPE))
- new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, new_dir);
- BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
- retval = ext3_journal_dirty_metadata(handle, new_bh);
- if (retval)
- goto journal_error;
- brelse(new_bh);
- new_bh = NULL;
- }
-
- /*
- * Like most other Unix systems, set the ctime for inodes on a
- * rename.
- */
- old_inode->i_ctime = CURRENT_TIME_SEC;
- ext3_mark_inode_dirty(handle, old_inode);
-
- /*
- * ok, that's it
- */
- if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
- old_de->name_len != old_dentry->d_name.len ||
- strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
- (retval = ext3_delete_entry(handle, old_dir,
- old_de, old_bh)) == -ENOENT) {
- /* old_de could have moved from under us during htree split, so
- * make sure that we are deleting the right entry. We might
- * also be pointing to a stale entry in the unused part of
- * old_bh so just checking inum and the name isn't enough. */
- struct buffer_head *old_bh2;
- struct ext3_dir_entry_2 *old_de2;
-
- old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
- &old_de2);
- if (old_bh2) {
- retval = ext3_delete_entry(handle, old_dir,
- old_de2, old_bh2);
- brelse(old_bh2);
- }
- }
- if (retval) {
- ext3_warning(old_dir->i_sb, "ext3_rename",
- "Deleting old file (%lu), %d, error=%d",
- old_dir->i_ino, old_dir->i_nlink, retval);
- }
-
- if (new_inode) {
- drop_nlink(new_inode);
- new_inode->i_ctime = CURRENT_TIME_SEC;
- }
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
- ext3_update_dx_flag(old_dir);
- if (dir_bh) {
- BUFFER_TRACE(dir_bh, "get_write_access");
- retval = ext3_journal_get_write_access(handle, dir_bh);
- if (retval)
- goto journal_error;
- PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
- retval = ext3_journal_dirty_metadata(handle, dir_bh);
- if (retval) {
-journal_error:
- ext3_std_error(new_dir->i_sb, retval);
- goto end_rename;
- }
- drop_nlink(old_dir);
- if (new_inode) {
- drop_nlink(new_inode);
- } else {
- inc_nlink(new_dir);
- ext3_update_dx_flag(new_dir);
- ext3_mark_inode_dirty(handle, new_dir);
- }
- }
- ext3_mark_inode_dirty(handle, old_dir);
- if (new_inode) {
- ext3_mark_inode_dirty(handle, new_inode);
- if (!new_inode->i_nlink)
- ext3_orphan_add(handle, new_inode);
- if (ext3_should_writeback_data(new_inode))
- flush_file = 1;
- }
- retval = 0;
-
-end_rename:
- brelse (dir_bh);
- brelse (old_bh);
- brelse (new_bh);
- ext3_journal_stop(handle);
- if (retval == 0 && flush_file)
- filemap_flush(old_inode->i_mapping);
- return retval;
-}
-
-/*
- * directories can handle most operations...
- */
-const struct inode_operations ext3_dir_inode_operations = {
- .create = ext3_create,
- .lookup = ext3_lookup,
- .link = ext3_link,
- .unlink = ext3_unlink,
- .symlink = ext3_symlink,
- .mkdir = ext3_mkdir,
- .rmdir = ext3_rmdir,
- .mknod = ext3_mknod,
- .tmpfile = ext3_tmpfile,
- .rename = ext3_rename,
- .setattr = ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ext3_listxattr,
- .removexattr = generic_removexattr,
-#endif
- .get_acl = ext3_get_acl,
- .set_acl = ext3_set_acl,
-};
-
-const struct inode_operations ext3_special_inode_operations = {
- .setattr = ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ext3_listxattr,
- .removexattr = generic_removexattr,
-#endif
- .get_acl = ext3_get_acl,
- .set_acl = ext3_set_acl,
-};
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
deleted file mode 100644
index 46304d8..0000000
--- a/fs/ext3/namei.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* linux/fs/ext3/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- * Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext3_get_parent(struct dentry *child);
-
-static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
- struct inode *inode,
- int block, int create,
- int *err)
-{
- struct buffer_head *bh;
-
- bh = ext3_bread(handle, inode, block, create, err);
-
- if (!bh && !(*err)) {
- *err = -EIO;
- ext3_error(inode->i_sb, __func__,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- return NULL;
- }
- return bh;
-}
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
deleted file mode 100644
index 2710565..0000000
--- a/fs/ext3/resize.c
+++ /dev/null
@@ -1,1117 +0,0 @@
-/*
- * linux/fs/ext3/resize.c
- *
- * Support for resizing an ext3 filesystem while it is mounted.
- *
- * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
- *
- * This could probably be made into a module, because it is not often in use.
- */
-
-
-#define EXT3FS_DEBUG
-
-#include "ext3.h"
-
-
-#define outside(b, first, last) ((b) < (first) || (b) >= (last))
-#define inside(b, first, last) ((b) >= (first) && (b) < (last))
-
-static int verify_group_input(struct super_block *sb,
- struct ext3_new_group_data *input)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
- ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
- ext3_fsblk_t end = start + input->blocks_count;
- unsigned group = input->group;
- ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
- unsigned overhead = ext3_bg_has_super(sb, group) ?
- (1 + ext3_bg_num_gdb(sb, group) +
- le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
- ext3_fsblk_t metaend = start + overhead;
- struct buffer_head *bh = NULL;
- ext3_grpblk_t free_blocks_count;
- int err = -EINVAL;
-
- input->free_blocks_count = free_blocks_count =
- input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
-
- if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
- "(%d free, %u reserved)\n",
- ext3_bg_has_super(sb, input->group) ? "normal" :
- "no-super", input->group, input->blocks_count,
- free_blocks_count, input->reserved_blocks);
-
- if (group != sbi->s_groups_count)
- ext3_warning(sb, __func__,
- "Cannot add at group %u (only %lu groups)",
- input->group, sbi->s_groups_count);
- else if ((start - le32_to_cpu(es->s_first_data_block)) %
- EXT3_BLOCKS_PER_GROUP(sb))
- ext3_warning(sb, __func__, "Last group not full");
- else if (input->reserved_blocks > input->blocks_count / 5)
- ext3_warning(sb, __func__, "Reserved blocks too high (%u)",
- input->reserved_blocks);
- else if (free_blocks_count < 0)
- ext3_warning(sb, __func__, "Bad blocks count %u",
- input->blocks_count);
- else if (!(bh = sb_bread(sb, end - 1)))
- ext3_warning(sb, __func__,
- "Cannot read last block ("E3FSBLK")",
- end - 1);
- else if (outside(input->block_bitmap, start, end))
- ext3_warning(sb, __func__,
- "Block bitmap not in group (block %u)",
- input->block_bitmap);
- else if (outside(input->inode_bitmap, start, end))
- ext3_warning(sb, __func__,
- "Inode bitmap not in group (block %u)",
- input->inode_bitmap);
- else if (outside(input->inode_table, start, end) ||
- outside(itend - 1, start, end))
- ext3_warning(sb, __func__,
- "Inode table not in group (blocks %u-"E3FSBLK")",
- input->inode_table, itend - 1);
- else if (input->inode_bitmap == input->block_bitmap)
- ext3_warning(sb, __func__,
- "Block bitmap same as inode bitmap (%u)",
- input->block_bitmap);
- else if (inside(input->block_bitmap, input->inode_table, itend))
- ext3_warning(sb, __func__,
- "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
- input->block_bitmap, input->inode_table, itend-1);
- else if (inside(input->inode_bitmap, input->inode_table, itend))
- ext3_warning(sb, __func__,
- "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
- input->inode_bitmap, input->inode_table, itend-1);
- else if (inside(input->block_bitmap, start, metaend))
- ext3_warning(sb, __func__,
- "Block bitmap (%u) in GDT table"
- " ("E3FSBLK"-"E3FSBLK")",
- input->block_bitmap, start, metaend - 1);
- else if (inside(input->inode_bitmap, start, metaend))
- ext3_warning(sb, __func__,
- "Inode bitmap (%u) in GDT table"
- " ("E3FSBLK"-"E3FSBLK")",
- input->inode_bitmap, start, metaend - 1);
- else if (inside(input->inode_table, start, metaend) ||
- inside(itend - 1, start, metaend))
- ext3_warning(sb, __func__,
- "Inode table (%u-"E3FSBLK") overlaps"
- "GDT table ("E3FSBLK"-"E3FSBLK")",
- input->inode_table, itend - 1, start, metaend - 1);
- else
- err = 0;
- brelse(bh);
-
- return err;
-}
-
-static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
- ext3_fsblk_t blk)
-{
- struct buffer_head *bh;
- int err;
-
- bh = sb_getblk(sb, blk);
- if (unlikely(!bh))
- return ERR_PTR(-ENOMEM);
- if ((err = ext3_journal_get_write_access(handle, bh))) {
- brelse(bh);
- bh = ERR_PTR(err);
- } else {
- lock_buffer(bh);
- memset(bh->b_data, 0, sb->s_blocksize);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- }
-
- return bh;
-}
-
-/*
- * To avoid calling the atomic setbit hundreds or thousands of times, we only
- * need to use it within a single byte (to ensure we get endianness right).
- * We can use memset for the rest of the bitmap as there are no other users.
- */
-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
-{
- int i;
-
- if (start_bit >= end_bit)
- return;
-
- ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
- for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
- ext3_set_bit(i, bitmap);
- if (i < end_bit)
- memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
-}
-
-/*
- * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh,
- struct buffer_head *bh)
-{
- int err;
-
- if (handle->h_buffer_credits >= thresh)
- return 0;
-
- err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA);
- if (err < 0)
- return err;
- if (err) {
- err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA);
- if (err)
- return err;
- err = ext3_journal_get_write_access(handle, bh);
- if (err)
- return err;
- }
-
- return 0;
-}
-
-/*
- * Set up the block and inode bitmaps, and the inode table for the new group.
- * This doesn't need to be part of the main transaction, since we are only
- * changing blocks outside the actual filesystem. We still do journaling to
- * ensure the recovery is correct in case of a failure just after resize.
- * If any part of this fails, we simply abort the resize.
- */
-static int setup_new_group_blocks(struct super_block *sb,
- struct ext3_new_group_data *input)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
- int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
- unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
- struct buffer_head *bh;
- handle_t *handle;
- ext3_fsblk_t block;
- ext3_grpblk_t bit;
- int i;
- int err = 0, err2;
-
- /* This transaction may be extended/restarted along the way */
- handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
-
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- mutex_lock(&sbi->s_resize_lock);
- if (input->group != sbi->s_groups_count) {
- err = -EBUSY;
- goto exit_journal;
- }
-
- if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
- }
-
- if (ext3_bg_has_super(sb, input->group)) {
- ext3_debug("mark backup superblock %#04lx (+0)\n", start);
- ext3_set_bit(0, bh->b_data);
- }
-
- /* Copy all of the GDT blocks into the backup in this group */
- for (i = 0, bit = 1, block = start + 1;
- i < gdblocks; i++, block++, bit++) {
- struct buffer_head *gdb;
-
- ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
-
- err = extend_or_restart_transaction(handle, 1, bh);
- if (err)
- goto exit_bh;
-
- gdb = sb_getblk(sb, block);
- if (unlikely(!gdb)) {
- err = -ENOMEM;
- goto exit_bh;
- }
- if ((err = ext3_journal_get_write_access(handle, gdb))) {
- brelse(gdb);
- goto exit_bh;
- }
- lock_buffer(gdb);
- memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
- set_buffer_uptodate(gdb);
- unlock_buffer(gdb);
- err = ext3_journal_dirty_metadata(handle, gdb);
- if (err) {
- brelse(gdb);
- goto exit_bh;
- }
- ext3_set_bit(bit, bh->b_data);
- brelse(gdb);
- }
-
- /* Zero out all of the reserved backup group descriptor table blocks */
- for (i = 0, bit = gdblocks + 1, block = start + bit;
- i < reserved_gdb; i++, block++, bit++) {
- struct buffer_head *gdb;
-
- ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
-
- err = extend_or_restart_transaction(handle, 1, bh);
- if (err)
- goto exit_bh;
-
- if (IS_ERR(gdb = bclean(handle, sb, block))) {
- err = PTR_ERR(gdb);
- goto exit_bh;
- }
- err = ext3_journal_dirty_metadata(handle, gdb);
- if (err) {
- brelse(gdb);
- goto exit_bh;
- }
- ext3_set_bit(bit, bh->b_data);
- brelse(gdb);
- }
- ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
- input->block_bitmap - start);
- ext3_set_bit(input->block_bitmap - start, bh->b_data);
- ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
- input->inode_bitmap - start);
- ext3_set_bit(input->inode_bitmap - start, bh->b_data);
-
- /* Zero out all of the inode table blocks */
- for (i = 0, block = input->inode_table, bit = block - start;
- i < sbi->s_itb_per_group; i++, bit++, block++) {
- struct buffer_head *it;
-
- ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
-
- err = extend_or_restart_transaction(handle, 1, bh);
- if (err)
- goto exit_bh;
-
- if (IS_ERR(it = bclean(handle, sb, block))) {
- err = PTR_ERR(it);
- goto exit_bh;
- }
- err = ext3_journal_dirty_metadata(handle, it);
- if (err) {
- brelse(it);
- goto exit_bh;
- }
- brelse(it);
- ext3_set_bit(bit, bh->b_data);
- }
-
- err = extend_or_restart_transaction(handle, 2, bh);
- if (err)
- goto exit_bh;
-
- mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
- bh->b_data);
- err = ext3_journal_dirty_metadata(handle, bh);
- if (err)
- goto exit_bh;
- brelse(bh);
-
- /* Mark unused entries in inode bitmap used */
- ext3_debug("clear inode bitmap %#04x (+%ld)\n",
- input->inode_bitmap, input->inode_bitmap - start);
- if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
- err = PTR_ERR(bh);
- goto exit_journal;
- }
-
- mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
- bh->b_data);
- err = ext3_journal_dirty_metadata(handle, bh);
-exit_bh:
- brelse(bh);
-
-exit_journal:
- mutex_unlock(&sbi->s_resize_lock);
- if ((err2 = ext3_journal_stop(handle)) && !err)
- err = err2;
-
- return err;
-}
-
-/*
- * Iterate through the groups which hold BACKUP superblock/GDT copies in an
- * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before
- * calling this for the first time. In a sparse filesystem it will be the
- * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
- * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
- */
-static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
- unsigned *five, unsigned *seven)
-{
- unsigned *min = three;
- int mult = 3;
- unsigned ret;
-
- if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
- EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
- ret = *min;
- *min += 1;
- return ret;
- }
-
- if (*five < *min) {
- min = five;
- mult = 5;
- }
- if (*seven < *min) {
- min = seven;
- mult = 7;
- }
-
- ret = *min;
- *min *= mult;
-
- return ret;
-}
-
-/*
- * Check that all of the backup GDT blocks are held in the primary GDT block.
- * It is assumed that they are stored in group order. Returns the number of
- * groups in current filesystem that have BACKUPS, or -ve error code.
- */
-static int verify_reserved_gdb(struct super_block *sb,
- struct buffer_head *primary)
-{
- const ext3_fsblk_t blk = primary->b_blocknr;
- const unsigned long end = EXT3_SB(sb)->s_groups_count;
- unsigned three = 1;
- unsigned five = 5;
- unsigned seven = 7;
- unsigned grp;
- __le32 *p = (__le32 *)primary->b_data;
- int gdbackups = 0;
-
- while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
- if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
- ext3_warning(sb, __func__,
- "reserved GDT "E3FSBLK
- " missing grp %d ("E3FSBLK")",
- blk, grp,
- grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
- return -EINVAL;
- }
- if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
- return -EFBIG;
- }
-
- return gdbackups;
-}
-
-/*
- * Called when we need to bring a reserved group descriptor table block into
- * use from the resize inode. The primary copy of the new GDT block currently
- * is an indirect block (under the double indirect block in the resize inode).
- * The new backup GDT blocks will be stored as leaf blocks in this indirect
- * block, in group order. Even though we know all the block numbers we need,
- * we check to ensure that the resize inode has actually reserved these blocks.
- *
- * Don't need to update the block bitmaps because the blocks are still in use.
- *
- * We get all of the error cases out of the way, so that we are sure to not
- * fail once we start modifying the data on disk, because JBD has no rollback.
- */
-static int add_new_gdb(handle_t *handle, struct inode *inode,
- struct ext3_new_group_data *input,
- struct buffer_head **primary)
-{
- struct super_block *sb = inode->i_sb;
- struct ext3_super_block *es = EXT3_SB(sb)->s_es;
- unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
- ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
- struct buffer_head **o_group_desc, **n_group_desc;
- struct buffer_head *dind;
- int gdbackups;
- struct ext3_iloc iloc;
- __le32 *data;
- int err;
-
- if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG
- "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
- gdb_num);
-
- /*
- * If we are not using the primary superblock/GDT copy don't resize,
- * because the user tools have no way of handling this. Probably a
- * bad time to do it anyways.
- */
- if (EXT3_SB(sb)->s_sbh->b_blocknr !=
- le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
- ext3_warning(sb, __func__,
- "won't resize using backup superblock at %llu",
- (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
- return -EPERM;
- }
-
- *primary = sb_bread(sb, gdblock);
- if (!*primary)
- return -EIO;
-
- if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
- err = gdbackups;
- goto exit_bh;
- }
-
- data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
- dind = sb_bread(sb, le32_to_cpu(*data));
- if (!dind) {
- err = -EIO;
- goto exit_bh;
- }
-
- data = (__le32 *)dind->b_data;
- if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
- ext3_warning(sb, __func__,
- "new group %u GDT block "E3FSBLK" not reserved",
- input->group, gdblock);
- err = -EINVAL;
- goto exit_dind;
- }
-
- if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
- goto exit_dind;
-
- if ((err = ext3_journal_get_write_access(handle, *primary)))
- goto exit_sbh;
-
- if ((err = ext3_journal_get_write_access(handle, dind)))
- goto exit_primary;
-
- /* ext3_reserve_inode_write() gets a reference on the iloc */
- if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
- goto exit_dindj;
-
- n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
- GFP_NOFS);
- if (!n_group_desc) {
- err = -ENOMEM;
- ext3_warning (sb, __func__,
- "not enough memory for %lu groups", gdb_num + 1);
- goto exit_inode;
- }
-
- /*
- * Finally, we have all of the possible failures behind us...
- *
- * Remove new GDT block from inode double-indirect block and clear out
- * the new GDT block for use (which also "frees" the backup GDT blocks
- * from the reserved inode). We don't need to change the bitmaps for
- * these blocks, because they are marked as in-use from being in the
- * reserved inode, and will become GDT blocks (primary and backup).
- */
- data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
- err = ext3_journal_dirty_metadata(handle, dind);
- if (err)
- goto exit_group_desc;
- brelse(dind);
- dind = NULL;
- inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
- err = ext3_mark_iloc_dirty(handle, inode, &iloc);
- if (err)
- goto exit_group_desc;
- memset((*primary)->b_data, 0, sb->s_blocksize);
- err = ext3_journal_dirty_metadata(handle, *primary);
- if (err)
- goto exit_group_desc;
-
- o_group_desc = EXT3_SB(sb)->s_group_desc;
- memcpy(n_group_desc, o_group_desc,
- EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
- n_group_desc[gdb_num] = *primary;
- EXT3_SB(sb)->s_group_desc = n_group_desc;
- EXT3_SB(sb)->s_gdb_count++;
- kfree(o_group_desc);
-
- le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
- err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- if (err)
- goto exit_inode;
-
- return 0;
-
-exit_group_desc:
- kfree(n_group_desc);
-exit_inode:
- //ext3_journal_release_buffer(handle, iloc.bh);
- brelse(iloc.bh);
-exit_dindj:
- //ext3_journal_release_buffer(handle, dind);
-exit_primary:
- //ext3_journal_release_buffer(handle, *primary);
-exit_sbh:
- //ext3_journal_release_buffer(handle, *primary);
-exit_dind:
- brelse(dind);
-exit_bh:
- brelse(*primary);
-
- ext3_debug("leaving with error %d\n", err);
- return err;
-}
-
-/*
- * Called when we are adding a new group which has a backup copy of each of
- * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
- * We need to add these reserved backup GDT blocks to the resize inode, so
- * that they are kept for future resizing and not allocated to files.
- *
- * Each reserved backup GDT block will go into a different indirect block.
- * The indirect blocks are actually the primary reserved GDT blocks,
- * so we know in advance what their block numbers are. We only get the
- * double-indirect block to verify it is pointing to the primary reserved
- * GDT blocks so we don't overwrite a data block by accident. The reserved
- * backup GDT blocks are stored in their reserved primary GDT block.
- */
-static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
- struct ext3_new_group_data *input)
-{
- struct super_block *sb = inode->i_sb;
- int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
- struct buffer_head **primary;
- struct buffer_head *dind;
- struct ext3_iloc iloc;
- ext3_fsblk_t blk;
- __le32 *data, *end;
- int gdbackups = 0;
- int res, i;
- int err;
-
- primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
- if (!primary)
- return -ENOMEM;
-
- data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
- dind = sb_bread(sb, le32_to_cpu(*data));
- if (!dind) {
- err = -EIO;
- goto exit_free;
- }
-
- blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
- data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
- EXT3_ADDR_PER_BLOCK(sb));
- end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
-
- /* Get each reserved primary GDT block and verify it holds backups */
- for (res = 0; res < reserved_gdb; res++, blk++) {
- if (le32_to_cpu(*data) != blk) {
- ext3_warning(sb, __func__,
- "reserved block "E3FSBLK
- " not at offset %ld",
- blk,
- (long)(data - (__le32 *)dind->b_data));
- err = -EINVAL;
- goto exit_bh;
- }
- primary[res] = sb_bread(sb, blk);
- if (!primary[res]) {
- err = -EIO;
- goto exit_bh;
- }
- if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
- brelse(primary[res]);
- err = gdbackups;
- goto exit_bh;
- }
- if (++data >= end)
- data = (__le32 *)dind->b_data;
- }
-
- for (i = 0; i < reserved_gdb; i++) {
- if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
- /*
- int j;
- for (j = 0; j < i; j++)
- ext3_journal_release_buffer(handle, primary[j]);
- */
- goto exit_bh;
- }
- }
-
- if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
- goto exit_bh;
-
- /*
- * Finally we can add each of the reserved backup GDT blocks from
- * the new group to its reserved primary GDT block.
- */
- blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
- for (i = 0; i < reserved_gdb; i++) {
- int err2;
- data = (__le32 *)primary[i]->b_data;
- /* printk("reserving backup %lu[%u] = %lu\n",
- primary[i]->b_blocknr, gdbackups,
- blk + primary[i]->b_blocknr); */
- data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
- err2 = ext3_journal_dirty_metadata(handle, primary[i]);
- if (!err)
- err = err2;
- }
- inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
- ext3_mark_iloc_dirty(handle, inode, &iloc);
-
-exit_bh:
- while (--res >= 0)
- brelse(primary[res]);
- brelse(dind);
-
-exit_free:
- kfree(primary);
-
- return err;
-}
-
-/*
- * Update the backup copies of the ext3 metadata. These don't need to be part
- * of the main resize transaction, because e2fsck will re-write them if there
- * is a problem (basically only OOM will cause a problem). However, we
- * _should_ update the backups if possible, in case the primary gets trashed
- * for some reason and we need to run e2fsck from a backup superblock. The
- * important part is that the new block and inode counts are in the backup
- * superblocks, and the location of the new group metadata in the GDT backups.
- *
- * We do not need take the s_resize_lock for this, because these
- * blocks are not otherwise touched by the filesystem code when it is
- * mounted. We don't need to worry about last changing from
- * sbi->s_groups_count, because the worst that can happen is that we
- * do not copy the full number of backups at this time. The resize
- * which changed s_groups_count will backup again.
- */
-static void update_backups(struct super_block *sb,
- int blk_off, char *data, int size)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- const unsigned long last = sbi->s_groups_count;
- const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
- unsigned three = 1;
- unsigned five = 5;
- unsigned seven = 7;
- unsigned group;
- int rest = sb->s_blocksize - size;
- handle_t *handle;
- int err = 0, err2;
-
- handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
- if (IS_ERR(handle)) {
- group = 1;
- err = PTR_ERR(handle);
- goto exit_err;
- }
-
- while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
- struct buffer_head *bh;
-
- /* Out of journal space, and can't get more - abort - so sad */
- if (handle->h_buffer_credits == 0 &&
- ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
- (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
- break;
-
- bh = sb_getblk(sb, group * bpg + blk_off);
- if (unlikely(!bh)) {
- err = -ENOMEM;
- break;
- }
- ext3_debug("update metadata backup %#04lx\n",
- (unsigned long)bh->b_blocknr);
- if ((err = ext3_journal_get_write_access(handle, bh))) {
- brelse(bh);
- break;
- }
- lock_buffer(bh);
- memcpy(bh->b_data, data, size);
- if (rest)
- memset(bh->b_data + size, 0, rest);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- err = ext3_journal_dirty_metadata(handle, bh);
- brelse(bh);
- if (err)
- break;
- }
- if ((err2 = ext3_journal_stop(handle)) && !err)
- err = err2;
-
- /*
- * Ugh! Need to have e2fsck write the backup copies. It is too
- * late to revert the resize, we shouldn't fail just because of
- * the backup copies (they are only needed in case of corruption).
- *
- * However, if we got here we have a journal problem too, so we
- * can't really start a transaction to mark the superblock.
- * Chicken out and just set the flag on the hope it will be written
- * to disk, and if not - we will simply wait until next fsck.
- */
-exit_err:
- if (err) {
- ext3_warning(sb, __func__,
- "can't update backup for group %d (err %d), "
- "forcing fsck on next reboot", group, err);
- sbi->s_mount_state &= ~EXT3_VALID_FS;
- sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
- mark_buffer_dirty(sbi->s_sbh);
- }
-}
-
-/* Add group descriptor data to an existing or new group descriptor block.
- * Ensure we handle all possible error conditions _before_ we start modifying
- * the filesystem, because we cannot abort the transaction and not have it
- * write the data to disk.
- *
- * If we are on a GDT block boundary, we need to get the reserved GDT block.
- * Otherwise, we may need to add backup GDT blocks for a sparse group.
- *
- * We only need to hold the superblock lock while we are actually adding
- * in the new group's counts to the superblock. Prior to that we have
- * not really "added" the group at all. We re-check that we are still
- * adding in the last group in case things have changed since verifying.
- */
-int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
- int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
- le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
- struct buffer_head *primary = NULL;
- struct ext3_group_desc *gdp;
- struct inode *inode = NULL;
- handle_t *handle;
- int gdb_off, gdb_num;
- int err, err2;
-
- gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
- gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
-
- if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
- EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
- ext3_warning(sb, __func__,
- "Can't resize non-sparse filesystem further");
- return -EPERM;
- }
-
- if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
- le32_to_cpu(es->s_blocks_count)) {
- ext3_warning(sb, __func__, "blocks_count overflow\n");
- return -EINVAL;
- }
-
- if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
- le32_to_cpu(es->s_inodes_count)) {
- ext3_warning(sb, __func__, "inodes_count overflow\n");
- return -EINVAL;
- }
-
- if (reserved_gdb || gdb_off == 0) {
- if (!EXT3_HAS_COMPAT_FEATURE(sb,
- EXT3_FEATURE_COMPAT_RESIZE_INODE)
- || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
- ext3_warning(sb, __func__,
- "No reserved GDT blocks, can't resize");
- return -EPERM;
- }
- inode = ext3_iget(sb, EXT3_RESIZE_INO);
- if (IS_ERR(inode)) {
- ext3_warning(sb, __func__,
- "Error opening resize inode");
- return PTR_ERR(inode);
- }
- }
-
- if ((err = verify_group_input(sb, input)))
- goto exit_put;
-
- if ((err = setup_new_group_blocks(sb, input)))
- goto exit_put;
-
- /*
- * We will always be modifying at least the superblock and a GDT
- * block. If we are adding a group past the last current GDT block,
- * we will also modify the inode and the dindirect block. If we
- * are adding a group with superblock/GDT backups we will also
- * modify each of the reserved GDT dindirect blocks.
- */
- handle = ext3_journal_start_sb(sb,
- ext3_bg_has_super(sb, input->group) ?
- 3 + reserved_gdb : 4);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto exit_put;
- }
-
- mutex_lock(&sbi->s_resize_lock);
- if (input->group != sbi->s_groups_count) {
- ext3_warning(sb, __func__,
- "multiple resizers run on filesystem!");
- err = -EBUSY;
- goto exit_journal;
- }
-
- if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
- goto exit_journal;
-
- /*
- * We will only either add reserved group blocks to a backup group
- * or remove reserved blocks for the first group in a new group block.
- * Doing both would be mean more complex code, and sane people don't
- * use non-sparse filesystems anymore. This is already checked above.
- */
- if (gdb_off) {
- primary = sbi->s_group_desc[gdb_num];
- if ((err = ext3_journal_get_write_access(handle, primary)))
- goto exit_journal;
-
- if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
- (err = reserve_backup_gdb(handle, inode, input)))
- goto exit_journal;
- } else if ((err = add_new_gdb(handle, inode, input, &primary)))
- goto exit_journal;
-
- /*
- * OK, now we've set up the new group. Time to make it active.
- *
- * We do not lock all allocations via s_resize_lock
- * so we have to be safe wrt. concurrent accesses the group
- * data. So we need to be careful to set all of the relevant
- * group descriptor data etc. *before* we enable the group.
- *
- * The key field here is sbi->s_groups_count: as long as
- * that retains its old value, nobody is going to access the new
- * group.
- *
- * So first we update all the descriptor metadata for the new
- * group; then we update the total disk blocks count; then we
- * update the groups count to enable the group; then finally we
- * update the free space counts so that the system can start
- * using the new disk blocks.
- */
-
- /* Update group descriptor block for new group */
- gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
-
- gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
- gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
- gdp->bg_inode_table = cpu_to_le32(input->inode_table);
- gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
- gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
-
- /*
- * Make the new blocks and inodes valid next. We do this before
- * increasing the group count so that once the group is enabled,
- * all of its blocks and inodes are already valid.
- *
- * We always allocate group-by-group, then block-by-block or
- * inode-by-inode within a group, so enabling these
- * blocks/inodes before the group is live won't actually let us
- * allocate the new space yet.
- */
- le32_add_cpu(&es->s_blocks_count, input->blocks_count);
- le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb));
-
- /*
- * We need to protect s_groups_count against other CPUs seeing
- * inconsistent state in the superblock.
- *
- * The precise rules we use are:
- *
- * * Writers of s_groups_count *must* hold s_resize_lock
- * AND
- * * Writers must perform a smp_wmb() after updating all dependent
- * data and before modifying the groups count
- *
- * * Readers must hold s_resize_lock over the access
- * OR
- * * Readers must perform an smp_rmb() after reading the groups count
- * and before reading any dependent data.
- *
- * NB. These rules can be relaxed when checking the group count
- * while freeing data, as we can only allocate from a block
- * group after serialising against the group count, and we can
- * only then free after serialising in turn against that
- * allocation.
- */
- smp_wmb();
-
- /* Update the global fs size fields */
- sbi->s_groups_count++;
-
- err = ext3_journal_dirty_metadata(handle, primary);
- if (err)
- goto exit_journal;
-
- /* Update the reserved block counts only once the new group is
- * active. */
- le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
-
- /* Update the free space counts */
- percpu_counter_add(&sbi->s_freeblocks_counter,
- input->free_blocks_count);
- percpu_counter_add(&sbi->s_freeinodes_counter,
- EXT3_INODES_PER_GROUP(sb));
-
- err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-
-exit_journal:
- mutex_unlock(&sbi->s_resize_lock);
- if ((err2 = ext3_journal_stop(handle)) && !err)
- err = err2;
- if (!err) {
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext3_super_block));
- update_backups(sb, primary->b_blocknr, primary->b_data,
- primary->b_size);
- }
-exit_put:
- iput(inode);
- return err;
-} /* ext3_group_add */
-
-/* Extend the filesystem to the new number of blocks specified. This entry
- * point is only used to extend the current filesystem to the end of the last
- * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
- * for emergencies (because it has no dependencies on reserved blocks).
- *
- * If we _really_ wanted, we could use default values to call ext3_group_add()
- * allow the "remount" trick to work for arbitrary resizing, assuming enough
- * GDT blocks are reserved to grow to the desired size.
- */
-int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
- ext3_fsblk_t n_blocks_count)
-{
- ext3_fsblk_t o_blocks_count;
- ext3_grpblk_t last;
- ext3_grpblk_t add;
- struct buffer_head * bh;
- handle_t *handle;
- int err;
- unsigned long freed_blocks;
-
- /* We don't need to worry about locking wrt other resizers just
- * yet: we're going to revalidate es->s_blocks_count after
- * taking the s_resize_lock below. */
- o_blocks_count = le32_to_cpu(es->s_blocks_count);
-
- if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
- " up to "E3FSBLK" blocks\n",
- o_blocks_count, n_blocks_count);
-
- if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
- return 0;
-
- if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT3-fs: filesystem on %s:"
- " too large to resize to "E3FSBLK" blocks safely\n",
- sb->s_id, n_blocks_count);
- if (sizeof(sector_t) < 8)
- ext3_warning(sb, __func__,
- "CONFIG_LBDAF not enabled\n");
- return -EINVAL;
- }
-
- if (n_blocks_count < o_blocks_count) {
- ext3_warning(sb, __func__,
- "can't shrink FS - resize aborted");
- return -EBUSY;
- }
-
- /* Handle the remaining blocks in the last group only. */
- last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
- EXT3_BLOCKS_PER_GROUP(sb);
-
- if (last == 0) {
- ext3_warning(sb, __func__,
- "need to use ext2online to resize further");
- return -EPERM;
- }
-
- add = EXT3_BLOCKS_PER_GROUP(sb) - last;
-
- if (o_blocks_count + add < o_blocks_count) {
- ext3_warning(sb, __func__, "blocks_count overflow");
- return -EINVAL;
- }
-
- if (o_blocks_count + add > n_blocks_count)
- add = n_blocks_count - o_blocks_count;
-
- if (o_blocks_count + add < n_blocks_count)
- ext3_warning(sb, __func__,
- "will only finish group ("E3FSBLK
- " blocks, %u new)",
- o_blocks_count + add, add);
-
- /* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add -1);
- if (!bh) {
- ext3_warning(sb, __func__,
- "can't read last block, resize aborted");
- return -ENOSPC;
- }
- brelse(bh);
-
- /* We will update the superblock, one block bitmap, and
- * one group descriptor via ext3_free_blocks().
- */
- handle = ext3_journal_start_sb(sb, 3);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- ext3_warning(sb, __func__, "error %d on journal start",err);
- goto exit_put;
- }
-
- mutex_lock(&EXT3_SB(sb)->s_resize_lock);
- if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
- ext3_warning(sb, __func__,
- "multiple resizers run on filesystem!");
- mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
- ext3_journal_stop(handle);
- err = -EBUSY;
- goto exit_put;
- }
-
- if ((err = ext3_journal_get_write_access(handle,
- EXT3_SB(sb)->s_sbh))) {
- ext3_warning(sb, __func__,
- "error %d on journal write access", err);
- mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
- ext3_journal_stop(handle);
- goto exit_put;
- }
- es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
- err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
- if (err) {
- ext3_warning(sb, __func__,
- "error %d on journal dirty metadata", err);
- ext3_journal_stop(handle);
- goto exit_put;
- }
- ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
- o_blocks_count, o_blocks_count + add);
- ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
- ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
- o_blocks_count, o_blocks_count + add);
- if ((err = ext3_journal_stop(handle)))
- goto exit_put;
- if (test_opt(sb, DEBUG))
- printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
- le32_to_cpu(es->s_blocks_count));
- update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext3_super_block));
-exit_put:
- return err;
-} /* ext3_group_extend */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
deleted file mode 100644
index 5ed0044..0000000
--- a/fs/ext3/super.c
+++ /dev/null
@@ -1,3165 +0,0 @@
-/*
- * linux/fs/ext3/super.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/inode.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/parser.h>
-#include <linux/exportfs.h>
-#include <linux/statfs.h>
-#include <linux/random.h>
-#include <linux/mount.h>
-#include <linux/quotaops.h>
-#include <linux/seq_file.h>
-#include <linux/log2.h>
-#include <linux/cleancache.h>
-#include <linux/namei.h>
-
-#include <asm/uaccess.h>
-
-#define CREATE_TRACE_POINTS
-
-#include "ext3.h"
-#include "xattr.h"
-#include "acl.h"
-#include "namei.h"
-
-#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
- #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
-#else
- #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
-#endif
-
-static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
- unsigned long journal_devnum);
-static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
- unsigned int);
-static int ext3_commit_super(struct super_block *sb,
- struct ext3_super_block *es,
- int sync);
-static void ext3_mark_recovery_complete(struct super_block * sb,
- struct ext3_super_block * es);
-static void ext3_clear_journal_err(struct super_block * sb,
- struct ext3_super_block * es);
-static int ext3_sync_fs(struct super_block *sb, int wait);
-static const char *ext3_decode_error(struct super_block * sb, int errno,
- char nbuf[16]);
-static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
-static int ext3_unfreeze(struct super_block *sb);
-static int ext3_freeze(struct super_block *sb);
-
-/*
- * Wrappers for journal_start/end.
- */
-handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
-{
- journal_t *journal;
-
- if (sb->s_flags & MS_RDONLY)
- return ERR_PTR(-EROFS);
-
- /* Special case here: if the journal has aborted behind our
- * backs (eg. EIO in the commit thread), then we still need to
- * take the FS itself readonly cleanly. */
- journal = EXT3_SB(sb)->s_journal;
- if (is_journal_aborted(journal)) {
- ext3_abort(sb, __func__,
- "Detected aborted journal");
- return ERR_PTR(-EROFS);
- }
-
- return journal_start(journal, nblocks);
-}
-
-int __ext3_journal_stop(const char *where, handle_t *handle)
-{
- struct super_block *sb;
- int err;
- int rc;
-
- sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
- rc = journal_stop(handle);
-
- if (!err)
- err = rc;
- if (err)
- __ext3_std_error(sb, where, err);
- return err;
-}
-
-void ext3_journal_abort_handle(const char *caller, const char *err_fn,
- struct buffer_head *bh, handle_t *handle, int err)
-{
- char nbuf[16];
- const char *errstr = ext3_decode_error(NULL, err, nbuf);
-
- if (bh)
- BUFFER_TRACE(bh, "abort");
-
- if (!handle->h_err)
- handle->h_err = err;
-
- if (is_handle_aborted(handle))
- return;
-
- printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
- caller, errstr, err_fn);
-
- journal_abort_handle(handle);
-}
-
-void ext3_msg(struct super_block *sb, const char *prefix,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
-
- va_end(args);
-}
-
-/* Deal with the reporting of failure conditions on a filesystem such as
- * inconsistencies detected or read IO failures.
- *
- * On ext2, we can store the error state of the filesystem in the
- * superblock. That is not possible on ext3, because we may have other
- * write ordering constraints on the superblock which prevent us from
- * writing it out straight away; and given that the journal is about to
- * be aborted, we can't rely on the current, or future, transactions to
- * write out the superblock safely.
- *
- * We'll just use the journal_abort() error code to record an error in
- * the journal instead. On recovery, the journal will complain about
- * that error until we've noted it down and cleared it.
- */
-
-static void ext3_handle_error(struct super_block *sb)
-{
- struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-
- EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
-
- if (sb->s_flags & MS_RDONLY)
- return;
-
- if (!test_opt (sb, ERRORS_CONT)) {
- journal_t *journal = EXT3_SB(sb)->s_journal;
-
- set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
- if (journal)
- journal_abort(journal, -EIO);
- }
- if (test_opt (sb, ERRORS_RO)) {
- ext3_msg(sb, KERN_CRIT,
- "error: remounting filesystem read-only");
- /*
- * Make sure updated value of ->s_mount_state will be visible
- * before ->s_flags update.
- */
- smp_wmb();
- sb->s_flags |= MS_RDONLY;
- }
- ext3_commit_super(sb, es, 1);
- if (test_opt(sb, ERRORS_PANIC))
- panic("EXT3-fs (%s): panic forced after error\n",
- sb->s_id);
-}
-
-void ext3_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
- sb->s_id, function, &vaf);
-
- va_end(args);
-
- ext3_handle_error(sb);
-}
-
-static const char *ext3_decode_error(struct super_block * sb, int errno,
- char nbuf[16])
-{
- char *errstr = NULL;
-
- switch (errno) {
- case -EIO:
- errstr = "IO failure";
- break;
- case -ENOMEM:
- errstr = "Out of memory";
- break;
- case -EROFS:
- if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
- errstr = "Journal has aborted";
- else
- errstr = "Readonly filesystem";
- break;
- default:
- /* If the caller passed in an extra buffer for unknown
- * errors, textualise them now. Else we just return
- * NULL. */
- if (nbuf) {
- /* Check for truncated error codes... */
- if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
- errstr = nbuf;
- }
- break;
- }
-
- return errstr;
-}
-
-/* __ext3_std_error decodes expected errors from journaling functions
- * automatically and invokes the appropriate error response. */
-
-void __ext3_std_error (struct super_block * sb, const char * function,
- int errno)
-{
- char nbuf[16];
- const char *errstr;
-
- /* Special case: if the error is EROFS, and we're not already
- * inside a transaction, then there's really no point in logging
- * an error. */
- if (errno == -EROFS && journal_current_handle() == NULL &&
- (sb->s_flags & MS_RDONLY))
- return;
-
- errstr = ext3_decode_error(sb, errno, nbuf);
- ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
-
- ext3_handle_error(sb);
-}
-
-/*
- * ext3_abort is a much stronger failure handler than ext3_error. The
- * abort function may be used to deal with unrecoverable failures such
- * as journal IO errors or ENOMEM at a critical moment in log management.
- *
- * We unconditionally force the filesystem into an ABORT|READONLY state,
- * unless the error response on the fs has been set to panic in which
- * case we take the easy way out and panic immediately.
- */
-
-void ext3_abort(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
- sb->s_id, function, &vaf);
-
- va_end(args);
-
- if (test_opt(sb, ERRORS_PANIC))
- panic("EXT3-fs: panic from previous error\n");
-
- if (sb->s_flags & MS_RDONLY)
- return;
-
- ext3_msg(sb, KERN_CRIT,
- "error: remounting filesystem read-only");
- EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
- /*
- * Make sure updated value of ->s_mount_state will be visible
- * before ->s_flags update.
- */
- smp_wmb();
- sb->s_flags |= MS_RDONLY;
-
- if (EXT3_SB(sb)->s_journal)
- journal_abort(EXT3_SB(sb)->s_journal, -EIO);
-}
-
-void ext3_warning(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
- sb->s_id, function, &vaf);
-
- va_end(args);
-}
-
-void ext3_update_dynamic_rev(struct super_block *sb)
-{
- struct ext3_super_block *es = EXT3_SB(sb)->s_es;
-
- if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
- return;
-
- ext3_msg(sb, KERN_WARNING,
- "warning: updating to rev %d because of "
- "new feature flag, running e2fsck is recommended",
- EXT3_DYNAMIC_REV);
-
- es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
- es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
- es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
- /* leave es->s_feature_*compat flags alone */
- /* es->s_uuid will be set by e2fsck if empty */
-
- /*
- * The rest of the superblock fields should be zero, and if not it
- * means they are likely already in use, so leave them alone. We
- * can leave it up to e2fsck to clean up any inconsistencies there.
- */
-}
-
-/*
- * Open the external journal device
- */
-static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
-{
- struct block_device *bdev;
- char b[BDEVNAME_SIZE];
-
- bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
- if (IS_ERR(bdev))
- goto fail;
- return bdev;
-
-fail:
- ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
- __bdevname(dev, b), PTR_ERR(bdev));
-
- return NULL;
-}
-
-/*
- * Release the journal device
- */
-static void ext3_blkdev_put(struct block_device *bdev)
-{
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
-static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
-{
- struct block_device *bdev;
- bdev = sbi->journal_bdev;
- if (bdev) {
- ext3_blkdev_put(bdev);
- sbi->journal_bdev = NULL;
- }
-}
-
-static inline struct inode *orphan_list_entry(struct list_head *l)
-{
- return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
-}
-
-static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
-{
- struct list_head *l;
-
- ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
- le32_to_cpu(sbi->s_es->s_last_orphan));
-
- ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
- list_for_each(l, &sbi->s_orphan) {
- struct inode *inode = orphan_list_entry(l);
- ext3_msg(sb, KERN_ERR, " "
- "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
- inode->i_sb->s_id, inode->i_ino, inode,
- inode->i_mode, inode->i_nlink,
- NEXT_ORPHAN(inode));
- }
-}
-
-static void ext3_put_super (struct super_block * sb)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
- int i, err;
-
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- ext3_xattr_put_super(sb);
- err = journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
- if (err < 0)
- ext3_abort(sb, __func__, "Couldn't clean up the journal");
-
- if (!(sb->s_flags & MS_RDONLY)) {
- EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- es->s_state = cpu_to_le16(sbi->s_mount_state);
- BUFFER_TRACE(sbi->s_sbh, "marking dirty");
- mark_buffer_dirty(sbi->s_sbh);
- ext3_commit_super(sb, es, 1);
- }
-
- for (i = 0; i < sbi->s_gdb_count; i++)
- brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- brelse(sbi->s_sbh);
-#ifdef CONFIG_QUOTA
- for (i = 0; i < EXT3_MAXQUOTAS; i++)
- kfree(sbi->s_qf_names[i]);
-#endif
-
- /* Debugging code just in case the in-memory inode orphan list
- * isn't empty. The on-disk one can be non-empty if we've
- * detected an error and taken the fs readonly, but the
- * in-memory list had better be clean by this point. */
- if (!list_empty(&sbi->s_orphan))
- dump_orphan_list(sb, sbi);
- J_ASSERT(list_empty(&sbi->s_orphan));
-
- invalidate_bdev(sb->s_bdev);
- if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
- sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev);
- ext3_blkdev_remove(sbi);
- }
- sb->s_fs_info = NULL;
- kfree(sbi->s_blockgroup_lock);
- mutex_destroy(&sbi->s_orphan_lock);
- mutex_destroy(&sbi->s_resize_lock);
- kfree(sbi);
-}
-
-static struct kmem_cache *ext3_inode_cachep;
-
-/*
- * Called inside transaction, so use GFP_NOFS
- */
-static struct inode *ext3_alloc_inode(struct super_block *sb)
-{
- struct ext3_inode_info *ei;
-
- ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
- if (!ei)
- return NULL;
- ei->i_block_alloc_info = NULL;
- ei->vfs_inode.i_version = 1;
- atomic_set(&ei->i_datasync_tid, 0);
- atomic_set(&ei->i_sync_tid, 0);
-#ifdef CONFIG_QUOTA
- memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
-#endif
-
- return &ei->vfs_inode;
-}
-
-static int ext3_drop_inode(struct inode *inode)
-{
- int drop = generic_drop_inode(inode);
-
- trace_ext3_drop_inode(inode, drop);
- return drop;
-}
-
-static void ext3_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
-}
-
-static void ext3_destroy_inode(struct inode *inode)
-{
- if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
- printk("EXT3 Inode %p: orphan list check failed!\n",
- EXT3_I(inode));
- print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
- EXT3_I(inode), sizeof(struct ext3_inode_info),
- false);
- dump_stack();
- }
- call_rcu(&inode->i_rcu, ext3_i_callback);
-}
-
-static void init_once(void *foo)
-{
- struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
-
- INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT3_FS_XATTR
- init_rwsem(&ei->xattr_sem);
-#endif
- mutex_init(&ei->truncate_mutex);
- inode_init_once(&ei->vfs_inode);
-}
-
-static int __init init_inodecache(void)
-{
- ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
- sizeof(struct ext3_inode_info),
- 0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
- init_once);
- if (ext3_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
-}
-
-static void destroy_inodecache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(ext3_inode_cachep);
-}
-
-static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
-{
-#if defined(CONFIG_QUOTA)
- struct ext3_sb_info *sbi = EXT3_SB(sb);
-
- if (sbi->s_jquota_fmt) {
- char *fmtname = "";
-
- switch (sbi->s_jquota_fmt) {
- case QFMT_VFS_OLD:
- fmtname = "vfsold";
- break;
- case QFMT_VFS_V0:
- fmtname = "vfsv0";
- break;
- case QFMT_VFS_V1:
- fmtname = "vfsv1";
- break;
- }
- seq_printf(seq, ",jqfmt=%s", fmtname);
- }
-
- if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
-
- if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
- if (test_opt(sb, USRQUOTA))
- seq_puts(seq, ",usrquota");
-
- if (test_opt(sb, GRPQUOTA))
- seq_puts(seq, ",grpquota");
-#endif
-}
-
-static char *data_mode_string(unsigned long mode)
-{
- switch (mode) {
- case EXT3_MOUNT_JOURNAL_DATA:
- return "journal";
- case EXT3_MOUNT_ORDERED_DATA:
- return "ordered";
- case EXT3_MOUNT_WRITEBACK_DATA:
- return "writeback";
- }
- return "unknown";
-}
-
-/*
- * Show an option if
- * - it's set to a non-default value OR
- * - if the per-sb default is different from the global default
- */
-static int ext3_show_options(struct seq_file *seq, struct dentry *root)
-{
- struct super_block *sb = root->d_sb;
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
- unsigned long def_mount_opts;
-
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-
- if (sbi->s_sb_block != 1)
- seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
- if (test_opt(sb, MINIX_DF))
- seq_puts(seq, ",minixdf");
- if (test_opt(sb, GRPID))
- seq_puts(seq, ",grpid");
- if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
- seq_puts(seq, ",nogrpid");
- if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
- le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
- seq_printf(seq, ",resuid=%u",
- from_kuid_munged(&init_user_ns, sbi->s_resuid));
- }
- if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
- le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
- seq_printf(seq, ",resgid=%u",
- from_kgid_munged(&init_user_ns, sbi->s_resgid));
- }
- if (test_opt(sb, ERRORS_RO)) {
- int def_errors = le16_to_cpu(es->s_errors);
-
- if (def_errors == EXT3_ERRORS_PANIC ||
- def_errors == EXT3_ERRORS_CONTINUE) {
- seq_puts(seq, ",errors=remount-ro");
- }
- }
- if (test_opt(sb, ERRORS_CONT))
- seq_puts(seq, ",errors=continue");
- if (test_opt(sb, ERRORS_PANIC))
- seq_puts(seq, ",errors=panic");
- if (test_opt(sb, NO_UID32))
- seq_puts(seq, ",nouid32");
- if (test_opt(sb, DEBUG))
- seq_puts(seq, ",debug");
-#ifdef CONFIG_EXT3_FS_XATTR
- if (test_opt(sb, XATTR_USER))
- seq_puts(seq, ",user_xattr");
- if (!test_opt(sb, XATTR_USER) &&
- (def_mount_opts & EXT3_DEFM_XATTR_USER)) {
- seq_puts(seq, ",nouser_xattr");
- }
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- if (test_opt(sb, POSIX_ACL))
- seq_puts(seq, ",acl");
- if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL))
- seq_puts(seq, ",noacl");
-#endif
- if (!test_opt(sb, RESERVATION))
- seq_puts(seq, ",noreservation");
- if (sbi->s_commit_interval) {
- seq_printf(seq, ",commit=%u",
- (unsigned) (sbi->s_commit_interval / HZ));
- }
-
- /*
- * Always display barrier state so it's clear what the status is.
- */
- seq_puts(seq, ",barrier=");
- seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
- seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
- if (test_opt(sb, DATA_ERR_ABORT))
- seq_puts(seq, ",data_err=abort");
-
- if (test_opt(sb, NOLOAD))
- seq_puts(seq, ",norecovery");
-
- ext3_show_quota_options(seq, sb);
-
- return 0;
-}
-
-
-static struct inode *ext3_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
-{
- struct inode *inode;
-
- if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
- return ERR_PTR(-ESTALE);
- if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
- return ERR_PTR(-ESTALE);
-
- /* iget isn't really right if the inode is currently unallocated!!
- *
- * ext3_read_inode will return a bad_inode if the inode had been
- * deleted, so we should be safe.
- *
- * Currently we don't know the generation for parent directory, so
- * a generation of 0 means "accept any"
- */
- inode = ext3_iget(sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
-
- return inode;
-}
-
-static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- ext3_nfs_get_inode);
-}
-
-static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- ext3_nfs_get_inode);
-}
-
-/*
- * Try to release metadata pages (indirect blocks, directories) which are
- * mapped via the block device. Since these pages could have journal heads
- * which would prevent try_to_free_buffers() from freeing them, we must use
- * jbd layer's try_to_free_buffers() function to release them.
- */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
- gfp_t wait)
-{
- journal_t *journal = EXT3_SB(sb)->s_journal;
-
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
- return 0;
- if (journal)
- return journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_WAIT);
- return try_to_free_buffers(page);
-}
-
-#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
-
-static int ext3_write_dquot(struct dquot *dquot);
-static int ext3_acquire_dquot(struct dquot *dquot);
-static int ext3_release_dquot(struct dquot *dquot);
-static int ext3_mark_dquot_dirty(struct dquot *dquot);
-static int ext3_write_info(struct super_block *sb, int type);
-static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- struct path *path);
-static int ext3_quota_on_mount(struct super_block *sb, int type);
-static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
- size_t len, loff_t off);
-static ssize_t ext3_quota_write(struct super_block *sb, int type,
- const char *data, size_t len, loff_t off);
-static struct dquot **ext3_get_dquots(struct inode *inode)
-{
- return EXT3_I(inode)->i_dquot;
-}
-
-static const struct dquot_operations ext3_quota_operations = {
- .write_dquot = ext3_write_dquot,
- .acquire_dquot = ext3_acquire_dquot,
- .release_dquot = ext3_release_dquot,
- .mark_dirty = ext3_mark_dquot_dirty,
- .write_info = ext3_write_info,
- .alloc_dquot = dquot_alloc,
- .destroy_dquot = dquot_destroy,
-};
-
-static const struct quotactl_ops ext3_qctl_operations = {
- .quota_on = ext3_quota_on,
- .quota_off = dquot_quota_off,
- .quota_sync = dquot_quota_sync,
- .get_state = dquot_get_state,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk
-};
-#endif
-
-static const struct super_operations ext3_sops = {
- .alloc_inode = ext3_alloc_inode,
- .destroy_inode = ext3_destroy_inode,
- .write_inode = ext3_write_inode,
- .dirty_inode = ext3_dirty_inode,
- .drop_inode = ext3_drop_inode,
- .evict_inode = ext3_evict_inode,
- .put_super = ext3_put_super,
- .sync_fs = ext3_sync_fs,
- .freeze_fs = ext3_freeze,
- .unfreeze_fs = ext3_unfreeze,
- .statfs = ext3_statfs,
- .remount_fs = ext3_remount,
- .show_options = ext3_show_options,
-#ifdef CONFIG_QUOTA
- .quota_read = ext3_quota_read,
- .quota_write = ext3_quota_write,
- .get_dquots = ext3_get_dquots,
-#endif
- .bdev_try_to_free_page = bdev_try_to_free_page,
-};
-
-static const struct export_operations ext3_export_ops = {
- .fh_to_dentry = ext3_fh_to_dentry,
- .fh_to_parent = ext3_fh_to_parent,
- .get_parent = ext3_get_parent,
-};
-
-enum {
- Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
- Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
- Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
- Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
- Opt_journal_path,
- Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
- Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_resize, Opt_usrquota, Opt_grpquota
-};
-
-static const match_table_t tokens = {
- {Opt_bsd_df, "bsddf"},
- {Opt_minix_df, "minixdf"},
- {Opt_grpid, "grpid"},
- {Opt_grpid, "bsdgroups"},
- {Opt_nogrpid, "nogrpid"},
- {Opt_nogrpid, "sysvgroups"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_sb, "sb=%u"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_nouid32, "nouid32"},
- {Opt_nocheck, "nocheck"},
- {Opt_nocheck, "check=none"},
- {Opt_debug, "debug"},
- {Opt_oldalloc, "oldalloc"},
- {Opt_orlov, "orlov"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_reservation, "reservation"},
- {Opt_noreservation, "noreservation"},
- {Opt_noload, "noload"},
- {Opt_noload, "norecovery"},
- {Opt_nobh, "nobh"},
- {Opt_bh, "bh"},
- {Opt_commit, "commit=%u"},
- {Opt_journal_update, "journal=update"},
- {Opt_journal_inum, "journal=%u"},
- {Opt_journal_dev, "journal_dev=%u"},
- {Opt_journal_path, "journal_path=%s"},
- {Opt_abort, "abort"},
- {Opt_data_journal, "data=journal"},
- {Opt_data_ordered, "data=ordered"},
- {Opt_data_writeback, "data=writeback"},
- {Opt_data_err_abort, "data_err=abort"},
- {Opt_data_err_ignore, "data_err=ignore"},
- {Opt_offusrjquota, "usrjquota="},
- {Opt_usrjquota, "usrjquota=%s"},
- {Opt_offgrpjquota, "grpjquota="},
- {Opt_grpjquota, "grpjquota=%s"},
- {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
- {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
- {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_grpquota, "grpquota"},
- {Opt_noquota, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_barrier, "barrier=%u"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_resize, "resize"},
- {Opt_err, NULL},
-};
-
-static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
-{
- ext3_fsblk_t sb_block;
- char *options = (char *) *data;
-
- if (!options || strncmp(options, "sb=", 3) != 0)
- return 1; /* Default location */
- options += 3;
- /*todo: use simple_strtoll with >32bit ext3 */
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
- ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
- (char *) *data);
- return 1;
- }
- if (*options == ',')
- options++;
- *data = (void *) options;
- return sb_block;
-}
-
-#ifdef CONFIG_QUOTA
-static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- char *qname;
-
- if (sb_any_quota_loaded(sb) &&
- !sbi->s_qf_names[qtype]) {
- ext3_msg(sb, KERN_ERR,
- "Cannot change journaled "
- "quota options when quota turned on");
- return 0;
- }
- qname = match_strdup(args);
- if (!qname) {
- ext3_msg(sb, KERN_ERR,
- "Not enough memory for storing quotafile name");
- return 0;
- }
- if (sbi->s_qf_names[qtype]) {
- int same = !strcmp(sbi->s_qf_names[qtype], qname);
-
- kfree(qname);
- if (!same) {
- ext3_msg(sb, KERN_ERR,
- "%s quota file already specified",
- QTYPE2NAME(qtype));
- }
- return same;
- }
- if (strchr(qname, '/')) {
- ext3_msg(sb, KERN_ERR,
- "quotafile must be on filesystem root");
- kfree(qname);
- return 0;
- }
- sbi->s_qf_names[qtype] = qname;
- set_opt(sbi->s_mount_opt, QUOTA);
- return 1;
-}
-
-static int clear_qf_name(struct super_block *sb, int qtype) {
-
- struct ext3_sb_info *sbi = EXT3_SB(sb);
-
- if (sb_any_quota_loaded(sb) &&
- sbi->s_qf_names[qtype]) {
- ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
- " when quota turned on");
- return 0;
- }
- if (sbi->s_qf_names[qtype]) {
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
- }
- return 1;
-}
-#endif
-
-static int parse_options (char *options, struct super_block *sb,
- unsigned int *inum, unsigned long *journal_devnum,
- ext3_fsblk_t *n_blocks_count, int is_remount)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- char * p;
- substring_t args[MAX_OPT_ARGS];
- int data_opt = 0;
- int option;
- kuid_t uid;
- kgid_t gid;
- char *journal_path;
- struct inode *journal_inode;
- struct path path;
- int error;
-
-#ifdef CONFIG_QUOTA
- int qfmt;
-#endif
-
- if (!options)
- return 1;
-
- while ((p = strsep (&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bsd_df:
- clear_opt (sbi->s_mount_opt, MINIX_DF);
- break;
- case Opt_minix_df:
- set_opt (sbi->s_mount_opt, MINIX_DF);
- break;
- case Opt_grpid:
- set_opt (sbi->s_mount_opt, GRPID);
- break;
- case Opt_nogrpid:
- clear_opt (sbi->s_mount_opt, GRPID);
- break;
- case Opt_resuid:
- if (match_int(&args[0], &option))
- return 0;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid)) {
- ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
- return 0;
-
- }
- sbi->s_resuid = uid;
- break;
- case Opt_resgid:
- if (match_int(&args[0], &option))
- return 0;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid)) {
- ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
- return 0;
- }
- sbi->s_resgid = gid;
- break;
- case Opt_sb:
- /* handled by get_sb_block() instead of here */
- /* *sb_block = match_int(&args[0]); */
- break;
- case Opt_err_panic:
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
- set_opt (sbi->s_mount_opt, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
- set_opt (sbi->s_mount_opt, ERRORS_RO);
- break;
- case Opt_err_cont:
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
- set_opt (sbi->s_mount_opt, ERRORS_CONT);
- break;
- case Opt_nouid32:
- set_opt (sbi->s_mount_opt, NO_UID32);
- break;
- case Opt_nocheck:
- clear_opt (sbi->s_mount_opt, CHECK);
- break;
- case Opt_debug:
- set_opt (sbi->s_mount_opt, DEBUG);
- break;
- case Opt_oldalloc:
- ext3_msg(sb, KERN_WARNING,
- "Ignoring deprecated oldalloc option");
- break;
- case Opt_orlov:
- ext3_msg(sb, KERN_WARNING,
- "Ignoring deprecated orlov option");
- break;
-#ifdef CONFIG_EXT3_FS_XATTR
- case Opt_user_xattr:
- set_opt (sbi->s_mount_opt, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt (sbi->s_mount_opt, XATTR_USER);
- break;
-#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- ext3_msg(sb, KERN_INFO,
- "(no)user_xattr options not supported");
- break;
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- case Opt_acl:
- set_opt(sbi->s_mount_opt, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(sbi->s_mount_opt, POSIX_ACL);
- break;
-#else
- case Opt_acl:
- case Opt_noacl:
- ext3_msg(sb, KERN_INFO,
- "(no)acl options not supported");
- break;
-#endif
- case Opt_reservation:
- set_opt(sbi->s_mount_opt, RESERVATION);
- break;
- case Opt_noreservation:
- clear_opt(sbi->s_mount_opt, RESERVATION);
- break;
- case Opt_journal_update:
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
- a journal file here. For now, only allow the
- user to specify an existing inode to be the
- journal file. */
- if (is_remount) {
- ext3_msg(sb, KERN_ERR, "error: cannot specify "
- "journal on remount");
- return 0;
- }
- set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
- break;
- case Opt_journal_inum:
- if (is_remount) {
- ext3_msg(sb, KERN_ERR, "error: cannot specify "
- "journal on remount");
- return 0;
- }
- if (match_int(&args[0], &option))
- return 0;
- *inum = option;
- break;
- case Opt_journal_dev:
- if (is_remount) {
- ext3_msg(sb, KERN_ERR, "error: cannot specify "
- "journal on remount");
- return 0;
- }
- if (match_int(&args[0], &option))
- return 0;
- *journal_devnum = option;
- break;
- case Opt_journal_path:
- if (is_remount) {
- ext3_msg(sb, KERN_ERR, "error: cannot specify "
- "journal on remount");
- return 0;
- }
-
- journal_path = match_strdup(&args[0]);
- if (!journal_path) {
- ext3_msg(sb, KERN_ERR, "error: could not dup "
- "journal device string");
- return 0;
- }
-
- error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
- if (error) {
- ext3_msg(sb, KERN_ERR, "error: could not find "
- "journal device path: error %d", error);
- kfree(journal_path);
- return 0;
- }
-
- journal_inode = d_inode(path.dentry);
- if (!S_ISBLK(journal_inode->i_mode)) {
- ext3_msg(sb, KERN_ERR, "error: journal path %s "
- "is not a block device", journal_path);
- path_put(&path);
- kfree(journal_path);
- return 0;
- }
-
- *journal_devnum = new_encode_dev(journal_inode->i_rdev);
- path_put(&path);
- kfree(journal_path);
- break;
- case Opt_noload:
- set_opt (sbi->s_mount_opt, NOLOAD);
- break;
- case Opt_commit:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- if (option == 0)
- option = JBD_DEFAULT_MAX_COMMIT_AGE;
- sbi->s_commit_interval = HZ * option;
- break;
- case Opt_data_journal:
- data_opt = EXT3_MOUNT_JOURNAL_DATA;
- goto datacheck;
- case Opt_data_ordered:
- data_opt = EXT3_MOUNT_ORDERED_DATA;
- goto datacheck;
- case Opt_data_writeback:
- data_opt = EXT3_MOUNT_WRITEBACK_DATA;
- datacheck:
- if (is_remount) {
- if (test_opt(sb, DATA_FLAGS) == data_opt)
- break;
- ext3_msg(sb, KERN_ERR,
- "error: cannot change "
- "data mode on remount. The filesystem "
- "is mounted in data=%s mode and you "
- "try to remount it in data=%s mode.",
- data_mode_string(test_opt(sb,
- DATA_FLAGS)),
- data_mode_string(data_opt));
- return 0;
- } else {
- clear_opt(sbi->s_mount_opt, DATA_FLAGS);
- sbi->s_mount_opt |= data_opt;
- }
- break;
- case Opt_data_err_abort:
- set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
- break;
- case Opt_data_err_ignore:
- clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
- break;
-#ifdef CONFIG_QUOTA
- case Opt_usrjquota:
- if (!set_qf_name(sb, USRQUOTA, &args[0]))
- return 0;
- break;
- case Opt_grpjquota:
- if (!set_qf_name(sb, GRPQUOTA, &args[0]))
- return 0;
- break;
- case Opt_offusrjquota:
- if (!clear_qf_name(sb, USRQUOTA))
- return 0;
- break;
- case Opt_offgrpjquota:
- if (!clear_qf_name(sb, GRPQUOTA))
- return 0;
- break;
- case Opt_jqfmt_vfsold:
- qfmt = QFMT_VFS_OLD;
- goto set_qf_format;
- case Opt_jqfmt_vfsv0:
- qfmt = QFMT_VFS_V0;
- goto set_qf_format;
- case Opt_jqfmt_vfsv1:
- qfmt = QFMT_VFS_V1;
-set_qf_format:
- if (sb_any_quota_loaded(sb) &&
- sbi->s_jquota_fmt != qfmt) {
- ext3_msg(sb, KERN_ERR, "error: cannot change "
- "journaled quota options when "
- "quota turned on.");
- return 0;
- }
- sbi->s_jquota_fmt = qfmt;
- break;
- case Opt_quota:
- case Opt_usrquota:
- set_opt(sbi->s_mount_opt, QUOTA);
- set_opt(sbi->s_mount_opt, USRQUOTA);
- break;
- case Opt_grpquota:
- set_opt(sbi->s_mount_opt, QUOTA);
- set_opt(sbi->s_mount_opt, GRPQUOTA);
- break;
- case Opt_noquota:
- if (sb_any_quota_loaded(sb)) {
- ext3_msg(sb, KERN_ERR, "error: cannot change "
- "quota options when quota turned on.");
- return 0;
- }
- clear_opt(sbi->s_mount_opt, QUOTA);
- clear_opt(sbi->s_mount_opt, USRQUOTA);
- clear_opt(sbi->s_mount_opt, GRPQUOTA);
- break;
-#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- ext3_msg(sb, KERN_ERR,
- "error: quota options not supported.");
- break;
- case Opt_usrjquota:
- case Opt_grpjquota:
- case Opt_offusrjquota:
- case Opt_offgrpjquota:
- case Opt_jqfmt_vfsold:
- case Opt_jqfmt_vfsv0:
- case Opt_jqfmt_vfsv1:
- ext3_msg(sb, KERN_ERR,
- "error: journaled quota options not "
- "supported.");
- break;
- case Opt_noquota:
- break;
-#endif
- case Opt_abort:
- set_opt(sbi->s_mount_opt, ABORT);
- break;
- case Opt_nobarrier:
- clear_opt(sbi->s_mount_opt, BARRIER);
- break;
- case Opt_barrier:
- if (args[0].from) {
- if (match_int(&args[0], &option))
- return 0;
- } else
- option = 1; /* No argument, default to 1 */
- if (option)
- set_opt(sbi->s_mount_opt, BARRIER);
- else
- clear_opt(sbi->s_mount_opt, BARRIER);
- break;
- case Opt_ignore:
- break;
- case Opt_resize:
- if (!is_remount) {
- ext3_msg(sb, KERN_ERR,
- "error: resize option only available "
- "for remount");
- return 0;
- }
- if (match_int(&args[0], &option) != 0)
- return 0;
- *n_blocks_count = option;
- break;
- case Opt_nobh:
- ext3_msg(sb, KERN_WARNING,
- "warning: ignoring deprecated nobh option");
- break;
- case Opt_bh:
- ext3_msg(sb, KERN_WARNING,
- "warning: ignoring deprecated bh option");
- break;
- default:
- ext3_msg(sb, KERN_ERR,
- "error: unrecognized mount option \"%s\" "
- "or missing value", p);
- return 0;
- }
- }
-#ifdef CONFIG_QUOTA
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
- if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
- clear_opt(sbi->s_mount_opt, USRQUOTA);
- if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
- clear_opt(sbi->s_mount_opt, GRPQUOTA);
-
- if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
- ext3_msg(sb, KERN_ERR, "error: old and new quota "
- "format mixing.");
- return 0;
- }
-
- if (!sbi->s_jquota_fmt) {
- ext3_msg(sb, KERN_ERR, "error: journaled quota format "
- "not specified.");
- return 0;
- }
- }
-#endif
- return 1;
-}
-
-static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
- int read_only)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- int res = 0;
-
- if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
- ext3_msg(sb, KERN_ERR,
- "error: revision level too high, "
- "forcing read-only mode");
- res = MS_RDONLY;
- }
- if (read_only)
- return res;
- if (!(sbi->s_mount_state & EXT3_VALID_FS))
- ext3_msg(sb, KERN_WARNING,
- "warning: mounting unchecked fs, "
- "running e2fsck is recommended");
- else if ((sbi->s_mount_state & EXT3_ERROR_FS))
- ext3_msg(sb, KERN_WARNING,
- "warning: mounting fs with errors, "
- "running e2fsck is recommended");
- else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
- le16_to_cpu(es->s_mnt_count) >=
- le16_to_cpu(es->s_max_mnt_count))
- ext3_msg(sb, KERN_WARNING,
- "warning: maximal mount count reached, "
- "running e2fsck is recommended");
- else if (le32_to_cpu(es->s_checkinterval) &&
- (le32_to_cpu(es->s_lastcheck) +
- le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- ext3_msg(sb, KERN_WARNING,
- "warning: checktime reached, "
- "running e2fsck is recommended");
-#if 0
- /* @@@ We _will_ want to clear the valid bit if we find
- inconsistencies, to force a fsck at reboot. But for
- a plain journaled filesystem we can keep it set as
- valid forever! :) */
- es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
-#endif
- if (!le16_to_cpu(es->s_max_mnt_count))
- es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
- le16_add_cpu(&es->s_mnt_count, 1);
- es->s_mtime = cpu_to_le32(get_seconds());
- ext3_update_dynamic_rev(sb);
- EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-
- ext3_commit_super(sb, es, 1);
- if (test_opt(sb, DEBUG))
- ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
- "bpg=%lu, ipg=%lu, mo=%04lx]",
- sb->s_blocksize,
- sbi->s_groups_count,
- EXT3_BLOCKS_PER_GROUP(sb),
- EXT3_INODES_PER_GROUP(sb),
- sbi->s_mount_opt);
-
- if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
- char b[BDEVNAME_SIZE];
- ext3_msg(sb, KERN_INFO, "using external journal on %s",
- bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
- } else {
- ext3_msg(sb, KERN_INFO, "using internal journal");
- }
- cleancache_init_fs(sb);
- return res;
-}
-
-/* Called at mount-time, super-block is locked */
-static int ext3_check_descriptors(struct super_block *sb)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- int i;
-
- ext3_debug ("Checking group descriptors");
-
- for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
- ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i);
- ext3_fsblk_t last_block;
-
- if (i == sbi->s_groups_count - 1)
- last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
- else
- last_block = first_block +
- (EXT3_BLOCKS_PER_GROUP(sb) - 1);
-
- if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
- le32_to_cpu(gdp->bg_block_bitmap) > last_block)
- {
- ext3_error (sb, "ext3_check_descriptors",
- "Block bitmap for group %d"
- " not in group (block %lu)!",
- i, (unsigned long)
- le32_to_cpu(gdp->bg_block_bitmap));
- return 0;
- }
- if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
- le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
- {
- ext3_error (sb, "ext3_check_descriptors",
- "Inode bitmap for group %d"
- " not in group (block %lu)!",
- i, (unsigned long)
- le32_to_cpu(gdp->bg_inode_bitmap));
- return 0;
- }
- if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
- le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
- last_block)
- {
- ext3_error (sb, "ext3_check_descriptors",
- "Inode table for group %d"
- " not in group (block %lu)!",
- i, (unsigned long)
- le32_to_cpu(gdp->bg_inode_table));
- return 0;
- }
- }
-
- sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
- sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb));
- return 1;
-}
-
-
-/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
- * the superblock) which were deleted from all directories, but held open by
- * a process at the time of a crash. We walk the list and try to delete these
- * inodes at recovery time (only with a read-write filesystem).
- *
- * In order to keep the orphan inode chain consistent during traversal (in
- * case of crash during recovery), we link each inode into the superblock
- * orphan list_head and handle it the same way as an inode deletion during
- * normal operation (which journals the operations for us).
- *
- * We only do an iget() and an iput() on each inode, which is very safe if we
- * accidentally point at an in-use or already deleted inode. The worst that
- * can happen in this case is that we get a "bit already cleared" message from
- * ext3_free_inode(). The only reason we would point at a wrong inode is if
- * e2fsck was run on this filesystem, and it must have already done the orphan
- * inode cleanup for us, so we can safely abort without any further action.
- */
-static void ext3_orphan_cleanup (struct super_block * sb,
- struct ext3_super_block * es)
-{
- unsigned int s_flags = sb->s_flags;
- int nr_orphans = 0, nr_truncates = 0;
-#ifdef CONFIG_QUOTA
- int i;
-#endif
- if (!es->s_last_orphan) {
- jbd_debug(4, "no orphan inodes to clean up\n");
- return;
- }
-
- if (bdev_read_only(sb->s_bdev)) {
- ext3_msg(sb, KERN_ERR, "error: write access "
- "unavailable, skipping orphan cleanup.");
- return;
- }
-
- /* Check if feature set allows readwrite operations */
- if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
- ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
- "unknown ROCOMPAT features");
- return;
- }
-
- if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
- /* don't clear list on RO mount w/ errors */
- if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
- jbd_debug(1, "Errors on filesystem, "
- "clearing orphan list.\n");
- es->s_last_orphan = 0;
- }
- jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
- return;
- }
-
- if (s_flags & MS_RDONLY) {
- ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
- sb->s_flags &= ~MS_RDONLY;
- }
-#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- sb->s_flags |= MS_ACTIVE;
- /* Turn on quotas so that they are updated correctly */
- for (i = 0; i < EXT3_MAXQUOTAS; i++) {
- if (EXT3_SB(sb)->s_qf_names[i]) {
- int ret = ext3_quota_on_mount(sb, i);
- if (ret < 0)
- ext3_msg(sb, KERN_ERR,
- "error: cannot turn on journaled "
- "quota: %d", ret);
- }
- }
-#endif
-
- while (es->s_last_orphan) {
- struct inode *inode;
-
- inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
- if (IS_ERR(inode)) {
- es->s_last_orphan = 0;
- break;
- }
-
- list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
- dquot_initialize(inode);
- if (inode->i_nlink) {
- printk(KERN_DEBUG
- "%s: truncating inode %lu to %Ld bytes\n",
- __func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
- inode->i_ino, inode->i_size);
- ext3_truncate(inode);
- nr_truncates++;
- } else {
- printk(KERN_DEBUG
- "%s: deleting unreferenced inode %lu\n",
- __func__, inode->i_ino);
- jbd_debug(2, "deleting unreferenced inode %lu\n",
- inode->i_ino);
- nr_orphans++;
- }
- iput(inode); /* The delete magic happens here! */
- }
-
-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
-
- if (nr_orphans)
- ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
- PLURAL(nr_orphans));
- if (nr_truncates)
- ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
- PLURAL(nr_truncates));
-#ifdef CONFIG_QUOTA
- /* Turn quotas off */
- for (i = 0; i < EXT3_MAXQUOTAS; i++) {
- if (sb_dqopt(sb)->files[i])
- dquot_quota_off(sb, i);
- }
-#endif
- sb->s_flags = s_flags; /* Restore MS_RDONLY status */
-}
-
-/*
- * Maximal file size. There is a direct, and {,double-,triple-}indirect
- * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
- * We need to be 1 filesystem block less than the 2^32 sector limit.
- */
-static loff_t ext3_max_size(int bits)
-{
- loff_t res = EXT3_NDIR_BLOCKS;
- int meta_blocks;
- loff_t upper_limit;
-
- /* This is calculated to be the largest file size for a
- * dense, file such that the total number of
- * sectors in the file, including data and all indirect blocks,
- * does not exceed 2^32 -1
- * __u32 i_blocks representing the total number of
- * 512 bytes blocks of the file
- */
- upper_limit = (1LL << 32) - 1;
-
- /* total blocks in file system block size */
- upper_limit >>= (bits - 9);
-
-
- /* indirect blocks */
- meta_blocks = 1;
- /* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
- res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
- if (res > MAX_LFS_FILESIZE)
- res = MAX_LFS_FILESIZE;
-
- return res;
-}
-
-static ext3_fsblk_t descriptor_loc(struct super_block *sb,
- ext3_fsblk_t logic_sb_block,
- int nr)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- unsigned long bg, first_meta_bg;
- int has_super = 0;
-
- first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
-
- if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
- nr < first_meta_bg)
- return (logic_sb_block + nr + 1);
- bg = sbi->s_desc_per_block * nr;
- if (ext3_bg_has_super(sb, bg))
- has_super = 1;
- return (has_super + ext3_group_first_block_no(sb, bg));
-}
-
-
-static int ext3_fill_super (struct super_block *sb, void *data, int silent)
-{
- struct buffer_head * bh;
- struct ext3_super_block *es = NULL;
- struct ext3_sb_info *sbi;
- ext3_fsblk_t block;
- ext3_fsblk_t sb_block = get_sb_block(&data, sb);
- ext3_fsblk_t logic_sb_block;
- unsigned long offset = 0;
- unsigned int journal_inum = 0;
- unsigned long journal_devnum = 0;
- unsigned long def_mount_opts;
- struct inode *root;
- int blocksize;
- int hblock;
- int db_count;
- int i;
- int needs_recovery;
- int ret = -EINVAL;
- __le32 features;
- int err;
-
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_blockgroup_lock =
- kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- if (!sbi->s_blockgroup_lock) {
- kfree(sbi);
- return -ENOMEM;
- }
- sb->s_fs_info = sbi;
- sbi->s_sb_block = sb_block;
-
- blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
- if (!blocksize) {
- ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
- goto out_fail;
- }
-
- /*
- * The ext3 superblock will not be buffer aligned for other than 1kB
- * block sizes. We need to calculate the offset from buffer start.
- */
- if (blocksize != EXT3_MIN_BLOCK_SIZE) {
- logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
- offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
- } else {
- logic_sb_block = sb_block;
- }
-
- if (!(bh = sb_bread(sb, logic_sb_block))) {
- ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
- goto out_fail;
- }
- /*
- * Note: s_es must be initialized as soon as possible because
- * some ext3 macro-instructions depend on its value
- */
- es = (struct ext3_super_block *) (bh->b_data + offset);
- sbi->s_es = es;
- sb->s_magic = le16_to_cpu(es->s_magic);
- if (sb->s_magic != EXT3_SUPER_MAGIC)
- goto cantfind_ext3;
-
- /* Set defaults before we parse the mount options */
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- if (def_mount_opts & EXT3_DEFM_DEBUG)
- set_opt(sbi->s_mount_opt, DEBUG);
- if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
- set_opt(sbi->s_mount_opt, GRPID);
- if (def_mount_opts & EXT3_DEFM_UID16)
- set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT3_FS_XATTR
- if (def_mount_opts & EXT3_DEFM_XATTR_USER)
- set_opt(sbi->s_mount_opt, XATTR_USER);
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- if (def_mount_opts & EXT3_DEFM_ACL)
- set_opt(sbi->s_mount_opt, POSIX_ACL);
-#endif
- if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
- set_opt(sbi->s_mount_opt, JOURNAL_DATA);
- else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
- set_opt(sbi->s_mount_opt, ORDERED_DATA);
- else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
- set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
-
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
- set_opt(sbi->s_mount_opt, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE)
- set_opt(sbi->s_mount_opt, ERRORS_CONT);
- else
- set_opt(sbi->s_mount_opt, ERRORS_RO);
-
- sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
-
- /* enable barriers by default */
- set_opt(sbi->s_mount_opt, BARRIER);
- set_opt(sbi->s_mount_opt, RESERVATION);
-
- if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
- NULL, 0))
- goto failed_mount;
-
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
-
- if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
- (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
- EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
- EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
- ext3_msg(sb, KERN_WARNING,
- "warning: feature flags set on rev 0 fs, "
- "running e2fsck is recommended");
- /*
- * Check feature flags regardless of the revision level, since we
- * previously didn't change the revision level when setting the flags,
- * so there is a chance incompat flags are set on a rev 0 filesystem.
- */
- features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
- if (features) {
- ext3_msg(sb, KERN_ERR,
- "error: couldn't mount because of unsupported "
- "optional features (%x)", le32_to_cpu(features));
- goto failed_mount;
- }
- features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- ext3_msg(sb, KERN_ERR,
- "error: couldn't mount RDWR because of unsupported "
- "optional features (%x)", le32_to_cpu(features));
- goto failed_mount;
- }
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-
- if (blocksize < EXT3_MIN_BLOCK_SIZE ||
- blocksize > EXT3_MAX_BLOCK_SIZE) {
- ext3_msg(sb, KERN_ERR,
- "error: couldn't mount because of unsupported "
- "filesystem blocksize %d", blocksize);
- goto failed_mount;
- }
-
- hblock = bdev_logical_block_size(sb->s_bdev);
- if (sb->s_blocksize != blocksize) {
- /*
- * Make sure the blocksize for the filesystem is larger
- * than the hardware sectorsize for the machine.
- */
- if (blocksize < hblock) {
- ext3_msg(sb, KERN_ERR,
- "error: fsblocksize %d too small for "
- "hardware sectorsize %d", blocksize, hblock);
- goto failed_mount;
- }
-
- brelse (bh);
- if (!sb_set_blocksize(sb, blocksize)) {
- ext3_msg(sb, KERN_ERR,
- "error: bad blocksize %d", blocksize);
- goto out_fail;
- }
- logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
- offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
- bh = sb_bread(sb, logic_sb_block);
- if (!bh) {
- ext3_msg(sb, KERN_ERR,
- "error: can't read superblock on 2nd try");
- goto failed_mount;
- }
- es = (struct ext3_super_block *)(bh->b_data + offset);
- sbi->s_es = es;
- if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
- ext3_msg(sb, KERN_ERR,
- "error: magic mismatch");
- goto failed_mount;
- }
- }
-
- sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
-
- if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
- sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
- sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
- } else {
- sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
- sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
- if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
- (!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
- ext3_msg(sb, KERN_ERR,
- "error: unsupported inode size: %d",
- sbi->s_inode_size);
- goto failed_mount;
- }
- }
- sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
- le32_to_cpu(es->s_log_frag_size);
- if (blocksize != sbi->s_frag_size) {
- ext3_msg(sb, KERN_ERR,
- "error: fragsize %lu != blocksize %u (unsupported)",
- sbi->s_frag_size, blocksize);
- goto failed_mount;
- }
- sbi->s_frags_per_block = 1;
- sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
- sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
- sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
- goto cantfind_ext3;
- sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
- if (sbi->s_inodes_per_block == 0)
- goto cantfind_ext3;
- sbi->s_itb_per_group = sbi->s_inodes_per_group /
- sbi->s_inodes_per_block;
- sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
- sbi->s_sbh = bh;
- sbi->s_mount_state = le16_to_cpu(es->s_state);
- sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
- sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
- for (i = 0; i < 4; i++)
- sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
- sbi->s_def_hash_version = es->s_def_hash_version;
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
-#ifdef __CHAR_UNSIGNED__
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
-#else
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
-#endif
- }
-
- if (sbi->s_blocks_per_group > blocksize * 8) {
- ext3_msg(sb, KERN_ERR,
- "#blocks per group too big: %lu",
- sbi->s_blocks_per_group);
- goto failed_mount;
- }
- if (sbi->s_frags_per_group > blocksize * 8) {
- ext3_msg(sb, KERN_ERR,
- "error: #fragments per group too big: %lu",
- sbi->s_frags_per_group);
- goto failed_mount;
- }
- if (sbi->s_inodes_per_group > blocksize * 8) {
- ext3_msg(sb, KERN_ERR,
- "error: #inodes per group too big: %lu",
- sbi->s_inodes_per_group);
- goto failed_mount;
- }
-
- err = generic_check_addressable(sb->s_blocksize_bits,
- le32_to_cpu(es->s_blocks_count));
- if (err) {
- ext3_msg(sb, KERN_ERR,
- "error: filesystem is too large to mount safely");
- if (sizeof(sector_t) < 8)
- ext3_msg(sb, KERN_ERR,
- "error: CONFIG_LBDAF not enabled");
- ret = err;
- goto failed_mount;
- }
-
- if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
- goto cantfind_ext3;
- sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_first_data_block) - 1)
- / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
- db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
- sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
- GFP_KERNEL);
- if (sbi->s_group_desc == NULL) {
- ext3_msg(sb, KERN_ERR,
- "error: not enough memory");
- ret = -ENOMEM;
- goto failed_mount;
- }
-
- bgl_lock_init(sbi->s_blockgroup_lock);
-
- for (i = 0; i < db_count; i++) {
- block = descriptor_loc(sb, logic_sb_block, i);
- sbi->s_group_desc[i] = sb_bread(sb, block);
- if (!sbi->s_group_desc[i]) {
- ext3_msg(sb, KERN_ERR,
- "error: can't read group descriptor %d", i);
- db_count = i;
- goto failed_mount2;
- }
- }
- if (!ext3_check_descriptors (sb)) {
- ext3_msg(sb, KERN_ERR,
- "error: group descriptors corrupted");
- goto failed_mount2;
- }
- sbi->s_gdb_count = db_count;
- get_random_bytes(&sbi->s_next_generation, sizeof(u32));
- spin_lock_init(&sbi->s_next_gen_lock);
-
- /* per fileystem reservation list head & lock */
- spin_lock_init(&sbi->s_rsv_window_lock);
- sbi->s_rsv_window_root = RB_ROOT;
- /* Add a single, static dummy reservation to the start of the
- * reservation window list --- it gives us a placeholder for
- * append-at-start-of-list which makes the allocation logic
- * _much_ simpler. */
- sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
- sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
- sbi->s_rsv_window_head.rsv_alloc_hit = 0;
- sbi->s_rsv_window_head.rsv_goal_size = 0;
- ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
- /*
- * set up enough so that it can read an inode
- */
- sb->s_op = &ext3_sops;
- sb->s_export_op = &ext3_export_ops;
- sb->s_xattr = ext3_xattr_handlers;
-#ifdef CONFIG_QUOTA
- sb->s_qcop = &ext3_qctl_operations;
- sb->dq_op = &ext3_quota_operations;
- sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
-#endif
- memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
- INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
- mutex_init(&sbi->s_orphan_lock);
- mutex_init(&sbi->s_resize_lock);
-
- sb->s_root = NULL;
-
- needs_recovery = (es->s_last_orphan != 0 ||
- EXT3_HAS_INCOMPAT_FEATURE(sb,
- EXT3_FEATURE_INCOMPAT_RECOVER));
-
- /*
- * The first inode we look at is the journal inode. Don't try
- * root first: it may be modified in the journal!
- */
- if (!test_opt(sb, NOLOAD) &&
- EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
- if (ext3_load_journal(sb, es, journal_devnum))
- goto failed_mount2;
- } else if (journal_inum) {
- if (ext3_create_journal(sb, es, journal_inum))
- goto failed_mount2;
- } else {
- if (!silent)
- ext3_msg(sb, KERN_ERR,
- "error: no journal found. "
- "mounting ext3 over ext2?");
- goto failed_mount2;
- }
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext3_count_free_blocks(sb), GFP_KERNEL);
- if (!err) {
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext3_count_free_inodes(sb), GFP_KERNEL);
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext3_count_dirs(sb), GFP_KERNEL);
- }
- if (err) {
- ext3_msg(sb, KERN_ERR, "error: insufficient memory");
- ret = err;
- goto failed_mount3;
- }
-
- /* We have now updated the journal if required, so we can
- * validate the data journaling mode. */
- switch (test_opt(sb, DATA_FLAGS)) {
- case 0:
- /* No mode set, assume a default based on the journal
- capabilities: ORDERED_DATA if the journal can
- cope, else JOURNAL_DATA */
- if (journal_check_available_features
- (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
- set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
- else
- set_opt(sbi->s_mount_opt, JOURNAL_DATA);
- break;
-
- case EXT3_MOUNT_ORDERED_DATA:
- case EXT3_MOUNT_WRITEBACK_DATA:
- if (!journal_check_available_features
- (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
- ext3_msg(sb, KERN_ERR,
- "error: journal does not support "
- "requested data journaling mode");
- goto failed_mount3;
- }
- default:
- break;
- }
-
- /*
- * The journal_load will have done any necessary log recovery,
- * so we can safely mount the rest of the filesystem now.
- */
-
- root = ext3_iget(sb, EXT3_ROOT_INO);
- if (IS_ERR(root)) {
- ext3_msg(sb, KERN_ERR, "error: get root inode failed");
- ret = PTR_ERR(root);
- goto failed_mount3;
- }
- if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
- iput(root);
- ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
- goto failed_mount3;
- }
- sb->s_root = d_make_root(root);
- if (!sb->s_root) {
- ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
- ret = -ENOMEM;
- goto failed_mount3;
- }
-
- if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
- sb->s_flags |= MS_RDONLY;
-
- EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
- ext3_orphan_cleanup(sb, es);
- EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
- if (needs_recovery) {
- ext3_mark_recovery_complete(sb, es);
- ext3_msg(sb, KERN_INFO, "recovery complete");
- }
- ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
- test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
- test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
- "writeback");
-
- return 0;
-
-cantfind_ext3:
- if (!silent)
- ext3_msg(sb, KERN_INFO,
- "error: can't find ext3 filesystem on dev %s.",
- sb->s_id);
- goto failed_mount;
-
-failed_mount3:
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- journal_destroy(sbi->s_journal);
-failed_mount2:
- for (i = 0; i < db_count; i++)
- brelse(sbi->s_group_desc[i]);
- kfree(sbi->s_group_desc);
-failed_mount:
-#ifdef CONFIG_QUOTA
- for (i = 0; i < EXT3_MAXQUOTAS; i++)
- kfree(sbi->s_qf_names[i]);
-#endif
- ext3_blkdev_remove(sbi);
- brelse(bh);
-out_fail:
- sb->s_fs_info = NULL;
- kfree(sbi->s_blockgroup_lock);
- kfree(sbi);
- return ret;
-}
-
-/*
- * Setup any per-fs journal parameters now. We'll do this both on
- * initial mount, once the journal has been initialised but before we've
- * done any recovery; and again on any subsequent remount.
- */
-static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
-{
- struct ext3_sb_info *sbi = EXT3_SB(sb);
-
- if (sbi->s_commit_interval)
- journal->j_commit_interval = sbi->s_commit_interval;
- /* We could also set up an ext3-specific default for the commit
- * interval here, but for now we'll just fall back to the jbd
- * default. */
-
- spin_lock(&journal->j_state_lock);
- if (test_opt(sb, BARRIER))
- journal->j_flags |= JFS_BARRIER;
- else
- journal->j_flags &= ~JFS_BARRIER;
- if (test_opt(sb, DATA_ERR_ABORT))
- journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
- else
- journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
- spin_unlock(&journal->j_state_lock);
-}
-
-static journal_t *ext3_get_journal(struct super_block *sb,
- unsigned int journal_inum)
-{
- struct inode *journal_inode;
- journal_t *journal;
-
- /* First, test for the existence of a valid inode on disk. Bad
- * things happen if we iget() an unused inode, as the subsequent
- * iput() will try to delete it. */
-
- journal_inode = ext3_iget(sb, journal_inum);
- if (IS_ERR(journal_inode)) {
- ext3_msg(sb, KERN_ERR, "error: no journal found");
- return NULL;
- }
- if (!journal_inode->i_nlink) {
- make_bad_inode(journal_inode);
- iput(journal_inode);
- ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
- return NULL;
- }
-
- jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
- journal_inode, journal_inode->i_size);
- if (!S_ISREG(journal_inode->i_mode)) {
- ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
- iput(journal_inode);
- return NULL;
- }
-
- journal = journal_init_inode(journal_inode);
- if (!journal) {
- ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
- iput(journal_inode);
- return NULL;
- }
- journal->j_private = sb;
- ext3_init_journal_params(sb, journal);
- return journal;
-}
-
-static journal_t *ext3_get_dev_journal(struct super_block *sb,
- dev_t j_dev)
-{
- struct buffer_head * bh;
- journal_t *journal;
- ext3_fsblk_t start;
- ext3_fsblk_t len;
- int hblock, blocksize;
- ext3_fsblk_t sb_block;
- unsigned long offset;
- struct ext3_super_block * es;
- struct block_device *bdev;
-
- bdev = ext3_blkdev_get(j_dev, sb);
- if (bdev == NULL)
- return NULL;
-
- blocksize = sb->s_blocksize;
- hblock = bdev_logical_block_size(bdev);
- if (blocksize < hblock) {
- ext3_msg(sb, KERN_ERR,
- "error: blocksize too small for journal device");
- goto out_bdev;
- }
-
- sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
- offset = EXT3_MIN_BLOCK_SIZE % blocksize;
- set_blocksize(bdev, blocksize);
- if (!(bh = __bread(bdev, sb_block, blocksize))) {
- ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
- "external journal");
- goto out_bdev;
- }
-
- es = (struct ext3_super_block *) (bh->b_data + offset);
- if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
- !(le32_to_cpu(es->s_feature_incompat) &
- EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- ext3_msg(sb, KERN_ERR, "error: external journal has "
- "bad superblock");
- brelse(bh);
- goto out_bdev;
- }
-
- if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
- ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
- brelse(bh);
- goto out_bdev;
- }
-
- len = le32_to_cpu(es->s_blocks_count);
- start = sb_block + 1;
- brelse(bh); /* we're done with the superblock */
-
- journal = journal_init_dev(bdev, sb->s_bdev,
- start, len, blocksize);
- if (!journal) {
- ext3_msg(sb, KERN_ERR,
- "error: failed to create device journal");
- goto out_bdev;
- }
- journal->j_private = sb;
- if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
- if (bh_submit_read(journal->j_sb_buffer)) {
- ext3_msg(sb, KERN_ERR, "I/O error on journal device");
- goto out_journal;
- }
- }
- if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
- ext3_msg(sb, KERN_ERR,
- "error: external journal has more than one "
- "user (unsupported) - %d",
- be32_to_cpu(journal->j_superblock->s_nr_users));
- goto out_journal;
- }
- EXT3_SB(sb)->journal_bdev = bdev;
- ext3_init_journal_params(sb, journal);
- return journal;
-out_journal:
- journal_destroy(journal);
-out_bdev:
- ext3_blkdev_put(bdev);
- return NULL;
-}
-
-static int ext3_load_journal(struct super_block *sb,
- struct ext3_super_block *es,
- unsigned long journal_devnum)
-{
- journal_t *journal;
- unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
- dev_t journal_dev;
- int err = 0;
- int really_read_only;
-
- if (journal_devnum &&
- journal_devnum != le32_to_cpu(es->s_journal_dev)) {
- ext3_msg(sb, KERN_INFO, "external journal device major/minor "
- "numbers have changed");
- journal_dev = new_decode_dev(journal_devnum);
- } else
- journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
-
- really_read_only = bdev_read_only(sb->s_bdev);
-
- /*
- * Are we loading a blank journal or performing recovery after a
- * crash? For recovery, we need to check in advance whether we
- * can get read-write access to the device.
- */
-
- if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
- if (sb->s_flags & MS_RDONLY) {
- ext3_msg(sb, KERN_INFO,
- "recovery required on readonly filesystem");
- if (really_read_only) {
- ext3_msg(sb, KERN_ERR, "error: write access "
- "unavailable, cannot proceed");
- return -EROFS;
- }
- ext3_msg(sb, KERN_INFO,
- "write access will be enabled during recovery");
- }
- }
-
- if (journal_inum && journal_dev) {
- ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
- "and inode journals");
- return -EINVAL;
- }
-
- if (journal_inum) {
- if (!(journal = ext3_get_journal(sb, journal_inum)))
- return -EINVAL;
- } else {
- if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
- return -EINVAL;
- }
-
- if (!(journal->j_flags & JFS_BARRIER))
- printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
-
- if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
- err = journal_update_format(journal);
- if (err) {
- ext3_msg(sb, KERN_ERR, "error updating journal");
- journal_destroy(journal);
- return err;
- }
- }
-
- if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
- err = journal_wipe(journal, !really_read_only);
- if (!err)
- err = journal_load(journal);
-
- if (err) {
- ext3_msg(sb, KERN_ERR, "error loading journal");
- journal_destroy(journal);
- return err;
- }
-
- EXT3_SB(sb)->s_journal = journal;
- ext3_clear_journal_err(sb, es);
-
- if (!really_read_only && journal_devnum &&
- journal_devnum != le32_to_cpu(es->s_journal_dev)) {
- es->s_journal_dev = cpu_to_le32(journal_devnum);
-
- /* Make sure we flush the recovery flag to disk. */
- ext3_commit_super(sb, es, 1);
- }
-
- return 0;
-}
-
-static int ext3_create_journal(struct super_block *sb,
- struct ext3_super_block *es,
- unsigned int journal_inum)
-{
- journal_t *journal;
- int err;
-
- if (sb->s_flags & MS_RDONLY) {
- ext3_msg(sb, KERN_ERR,
- "error: readonly filesystem when trying to "
- "create journal");
- return -EROFS;
- }
-
- journal = ext3_get_journal(sb, journal_inum);
- if (!journal)
- return -EINVAL;
-
- ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
- journal_inum);
-
- err = journal_create(journal);
- if (err) {
- ext3_msg(sb, KERN_ERR, "error creating journal");
- journal_destroy(journal);
- return -EIO;
- }
-
- EXT3_SB(sb)->s_journal = journal;
-
- ext3_update_dynamic_rev(sb);
- EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
-
- es->s_journal_inum = cpu_to_le32(journal_inum);
-
- /* Make sure we flush the recovery flag to disk. */
- ext3_commit_super(sb, es, 1);
-
- return 0;
-}
-
-static int ext3_commit_super(struct super_block *sb,
- struct ext3_super_block *es,
- int sync)
-{
- struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
- int error = 0;
-
- if (!sbh)
- return error;
-
- if (buffer_write_io_error(sbh)) {
- /*
- * Oh, dear. A previous attempt to write the
- * superblock failed. This could happen because the
- * USB device was yanked out. Or it could happen to
- * be a transient write error and maybe the block will
- * be remapped. Nothing we can do but to retry the
- * write and hope for the best.
- */
- ext3_msg(sb, KERN_ERR, "previous I/O error to "
- "superblock detected");
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- }
- /*
- * If the file system is mounted read-only, don't update the
- * superblock write time. This avoids updating the superblock
- * write time when we are mounting the root file system
- * read/only but we need to replay the journal; at that point,
- * for people who are east of GMT and who make their clock
- * tick in localtime for Windows bug-for-bug compatibility,
- * the clock is set in the future, and this will cause e2fsck
- * to complain and force a full file system check.
- */
- if (!(sb->s_flags & MS_RDONLY))
- es->s_wtime = cpu_to_le32(get_seconds());
- es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
- es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
- BUFFER_TRACE(sbh, "marking dirty");
- mark_buffer_dirty(sbh);
- if (sync) {
- error = sync_dirty_buffer(sbh);
- if (buffer_write_io_error(sbh)) {
- ext3_msg(sb, KERN_ERR, "I/O error while writing "
- "superblock");
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- }
- }
- return error;
-}
-
-
-/*
- * Have we just finished recovery? If so, and if we are mounting (or
- * remounting) the filesystem readonly, then we will end up with a
- * consistent fs on disk. Record that fact.
- */
-static void ext3_mark_recovery_complete(struct super_block * sb,
- struct ext3_super_block * es)
-{
- journal_t *journal = EXT3_SB(sb)->s_journal;
-
- journal_lock_updates(journal);
- if (journal_flush(journal) < 0)
- goto out;
-
- if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
- sb->s_flags & MS_RDONLY) {
- EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- ext3_commit_super(sb, es, 1);
- }
-
-out:
- journal_unlock_updates(journal);
-}
-
-/*
- * If we are mounting (or read-write remounting) a filesystem whose journal
- * has recorded an error from a previous lifetime, move that error to the
- * main filesystem now.
- */
-static void ext3_clear_journal_err(struct super_block *sb,
- struct ext3_super_block *es)
-{
- journal_t *journal;
- int j_errno;
- const char *errstr;
-
- journal = EXT3_SB(sb)->s_journal;
-
- /*
- * Now check for any error status which may have been recorded in the
- * journal by a prior ext3_error() or ext3_abort()
- */
-
- j_errno = journal_errno(journal);
- if (j_errno) {
- char nbuf[16];
-
- errstr = ext3_decode_error(sb, j_errno, nbuf);
- ext3_warning(sb, __func__, "Filesystem error recorded "
- "from previous mount: %s", errstr);
- ext3_warning(sb, __func__, "Marking fs in need of "
- "filesystem check.");
-
- EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
- ext3_commit_super (sb, es, 1);
-
- journal_clear_err(journal);
- }
-}
-
-/*
- * Force the running and committing transactions to commit,
- * and wait on the commit.
- */
-int ext3_force_commit(struct super_block *sb)
-{
- journal_t *journal;
- int ret;
-
- if (sb->s_flags & MS_RDONLY)
- return 0;
-
- journal = EXT3_SB(sb)->s_journal;
- ret = ext3_journal_force_commit(journal);
- return ret;
-}
-
-static int ext3_sync_fs(struct super_block *sb, int wait)
-{
- tid_t target;
-
- trace_ext3_sync_fs(sb, wait);
- /*
- * Writeback quota in non-journalled quota case - journalled quota has
- * no dirty dquots
- */
- dquot_writeback_dquots(sb, -1);
- if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
- if (wait)
- log_wait_commit(EXT3_SB(sb)->s_journal, target);
- }
- return 0;
-}
-
-/*
- * LVM calls this function before a (read-only) snapshot is created. This
- * gives us a chance to flush the journal completely and mark the fs clean.
- */
-static int ext3_freeze(struct super_block *sb)
-{
- int error = 0;
- journal_t *journal;
-
- if (!(sb->s_flags & MS_RDONLY)) {
- journal = EXT3_SB(sb)->s_journal;
-
- /* Now we set up the journal barrier. */
- journal_lock_updates(journal);
-
- /*
- * We don't want to clear needs_recovery flag when we failed
- * to flush the journal.
- */
- error = journal_flush(journal);
- if (error < 0)
- goto out;
-
- /* Journal blocked and flushed, clear needs_recovery flag. */
- EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
- if (error)
- goto out;
- }
- return 0;
-
-out:
- journal_unlock_updates(journal);
- return error;
-}
-
-/*
- * Called by LVM after the snapshot is done. We need to reset the RECOVER
- * flag here, even though the filesystem is not technically dirty yet.
- */
-static int ext3_unfreeze(struct super_block *sb)
-{
- if (!(sb->s_flags & MS_RDONLY)) {
- /* Reser the needs_recovery flag before the fs is unlocked. */
- EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
- journal_unlock_updates(EXT3_SB(sb)->s_journal);
- }
- return 0;
-}
-
-static int ext3_remount (struct super_block * sb, int * flags, char * data)
-{
- struct ext3_super_block * es;
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- ext3_fsblk_t n_blocks_count = 0;
- unsigned long old_sb_flags;
- struct ext3_mount_options old_opts;
- int enable_quota = 0;
- int err;
-#ifdef CONFIG_QUOTA
- int i;
-#endif
-
- sync_filesystem(sb);
-
- /* Store the original options */
- old_sb_flags = sb->s_flags;
- old_opts.s_mount_opt = sbi->s_mount_opt;
- old_opts.s_resuid = sbi->s_resuid;
- old_opts.s_resgid = sbi->s_resgid;
- old_opts.s_commit_interval = sbi->s_commit_interval;
-#ifdef CONFIG_QUOTA
- old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
- for (i = 0; i < EXT3_MAXQUOTAS; i++)
- if (sbi->s_qf_names[i]) {
- old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
- GFP_KERNEL);
- if (!old_opts.s_qf_names[i]) {
- int j;
-
- for (j = 0; j < i; j++)
- kfree(old_opts.s_qf_names[j]);
- return -ENOMEM;
- }
- } else
- old_opts.s_qf_names[i] = NULL;
-#endif
-
- /*
- * Allow the "check" option to be passed as a remount option.
- */
- if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
- err = -EINVAL;
- goto restore_opts;
- }
-
- if (test_opt(sb, ABORT))
- ext3_abort(sb, __func__, "Abort forced by user");
-
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
-
- es = sbi->s_es;
-
- ext3_init_journal_params(sb, sbi->s_journal);
-
- if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
- n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
- if (test_opt(sb, ABORT)) {
- err = -EROFS;
- goto restore_opts;
- }
-
- if (*flags & MS_RDONLY) {
- err = dquot_suspend(sb, -1);
- if (err < 0)
- goto restore_opts;
-
- /*
- * First of all, the unconditional stuff we have to do
- * to disable replay of the journal when we next remount
- */
- sb->s_flags |= MS_RDONLY;
-
- /*
- * OK, test if we are remounting a valid rw partition
- * readonly, and if so set the rdonly flag and then
- * mark the partition as valid again.
- */
- if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
- (sbi->s_mount_state & EXT3_VALID_FS))
- es->s_state = cpu_to_le16(sbi->s_mount_state);
-
- ext3_mark_recovery_complete(sb, es);
- } else {
- __le32 ret;
- if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
- ext3_msg(sb, KERN_WARNING,
- "warning: couldn't remount RDWR "
- "because of unsupported optional "
- "features (%x)", le32_to_cpu(ret));
- err = -EROFS;
- goto restore_opts;
- }
-
- /*
- * If we have an unprocessed orphan list hanging
- * around from a previously readonly bdev mount,
- * require a full umount & mount for now.
- */
- if (es->s_last_orphan) {
- ext3_msg(sb, KERN_WARNING, "warning: couldn't "
- "remount RDWR because of unprocessed "
- "orphan inode list. Please "
- "umount & mount instead.");
- err = -EINVAL;
- goto restore_opts;
- }
-
- /*
- * Mounting a RDONLY partition read-write, so reread
- * and store the current valid flag. (It may have
- * been changed by e2fsck since we originally mounted
- * the partition.)
- */
- ext3_clear_journal_err(sb, es);
- sbi->s_mount_state = le16_to_cpu(es->s_state);
- if ((err = ext3_group_extend(sb, es, n_blocks_count)))
- goto restore_opts;
- if (!ext3_setup_super (sb, es, 0))
- sb->s_flags &= ~MS_RDONLY;
- enable_quota = 1;
- }
- }
-#ifdef CONFIG_QUOTA
- /* Release old quota file names */
- for (i = 0; i < EXT3_MAXQUOTAS; i++)
- kfree(old_opts.s_qf_names[i]);
-#endif
- if (enable_quota)
- dquot_resume(sb, -1);
- return 0;
-restore_opts:
- sb->s_flags = old_sb_flags;
- sbi->s_mount_opt = old_opts.s_mount_opt;
- sbi->s_resuid = old_opts.s_resuid;
- sbi->s_resgid = old_opts.s_resgid;
- sbi->s_commit_interval = old_opts.s_commit_interval;
-#ifdef CONFIG_QUOTA
- sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
- for (i = 0; i < EXT3_MAXQUOTAS; i++) {
- kfree(sbi->s_qf_names[i]);
- sbi->s_qf_names[i] = old_opts.s_qf_names[i];
- }
-#endif
- return err;
-}
-
-static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct ext3_sb_info *sbi = EXT3_SB(sb);
- struct ext3_super_block *es = sbi->s_es;
- u64 fsid;
-
- if (test_opt(sb, MINIX_DF)) {
- sbi->s_overhead_last = 0;
- } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
- unsigned long ngroups = sbi->s_groups_count, i;
- ext3_fsblk_t overhead = 0;
- smp_rmb();
-
- /*
- * Compute the overhead (FS structures). This is constant
- * for a given filesystem unless the number of block groups
- * changes so we cache the previous value until it does.
- */
-
- /*
- * All of the blocks before first_data_block are
- * overhead
- */
- overhead = le32_to_cpu(es->s_first_data_block);
-
- /*
- * Add the overhead attributed to the superblock and
- * block group descriptors. If the sparse superblocks
- * feature is turned on, then not all groups have this.
- */
- for (i = 0; i < ngroups; i++) {
- overhead += ext3_bg_has_super(sb, i) +
- ext3_bg_num_gdb(sb, i);
- cond_resched();
- }
-
- /*
- * Every block group has an inode bitmap, a block
- * bitmap, and an inode table.
- */
- overhead += ngroups * (2 + sbi->s_itb_per_group);
-
- /* Add the internal journal blocks as well */
- if (sbi->s_journal && !sbi->journal_bdev)
- overhead += sbi->s_journal->j_maxlen;
-
- sbi->s_overhead_last = overhead;
- smp_wmb();
- sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
- }
-
- buf->f_type = EXT3_SUPER_MAGIC;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
- buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
- if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
- buf->f_bavail = 0;
- buf->f_files = le32_to_cpu(es->s_inodes_count);
- buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- buf->f_namelen = EXT3_NAME_LEN;
- fsid = le64_to_cpup((void *)es->s_uuid) ^
- le64_to_cpup((void *)es->s_uuid + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
- return 0;
-}
-
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
- * Process 1 Process 2
- * ext3_create() quota_sync()
- * journal_start() write_dquot()
- * dquot_initialize() down(dqio_mutex)
- * down(dqio_mutex) journal_start()
- *
- */
-
-#ifdef CONFIG_QUOTA
-
-static inline struct inode *dquot_to_inode(struct dquot *dquot)
-{
- return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
-}
-
-static int ext3_write_dquot(struct dquot *dquot)
-{
- int ret, err;
- handle_t *handle;
- struct inode *inode;
-
- inode = dquot_to_inode(dquot);
- handle = ext3_journal_start(inode,
- EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_commit(dquot);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-}
-
-static int ext3_acquire_dquot(struct dquot *dquot)
-{
- int ret, err;
- handle_t *handle;
-
- handle = ext3_journal_start(dquot_to_inode(dquot),
- EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_acquire(dquot);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-}
-
-static int ext3_release_dquot(struct dquot *dquot)
-{
- int ret, err;
- handle_t *handle;
-
- handle = ext3_journal_start(dquot_to_inode(dquot),
- EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
- if (IS_ERR(handle)) {
- /* Release dquot anyway to avoid endless cycle in dqput() */
- dquot_release(dquot);
- return PTR_ERR(handle);
- }
- ret = dquot_release(dquot);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-}
-
-static int ext3_mark_dquot_dirty(struct dquot *dquot)
-{
- /* Are we journaling quotas? */
- if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
- EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
- dquot_mark_dquot_dirty(dquot);
- return ext3_write_dquot(dquot);
- } else {
- return dquot_mark_dquot_dirty(dquot);
- }
-}
-
-static int ext3_write_info(struct super_block *sb, int type)
-{
- int ret, err;
- handle_t *handle;
-
- /* Data block + inode block */
- handle = ext3_journal_start(d_inode(sb->s_root), 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ret = dquot_commit_info(sb, type);
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-}
-
-/*
- * Turn on quotas during mount time - we need to find
- * the quota file and such...
- */
-static int ext3_quota_on_mount(struct super_block *sb, int type)
-{
- return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
- EXT3_SB(sb)->s_jquota_fmt, type);
-}
-
-/*
- * Standard function to be called on quota_on
- */
-static int ext3_quota_on(struct super_block *sb, int type, int format_id,
- struct path *path)
-{
- int err;
-
- if (!test_opt(sb, QUOTA))
- return -EINVAL;
-
- /* Quotafile not on the same filesystem? */
- if (path->dentry->d_sb != sb)
- return -EXDEV;
- /* Journaling quota? */
- if (EXT3_SB(sb)->s_qf_names[type]) {
- /* Quotafile not of fs root? */
- if (path->dentry->d_parent != sb->s_root)
- ext3_msg(sb, KERN_WARNING,
- "warning: Quota file not on filesystem root. "
- "Journaled quota will not work.");
- }
-
- /*
- * When we journal data on quota file, we have to flush journal to see
- * all updates to the file when we bypass pagecache...
- */
- if (ext3_should_journal_data(d_inode(path->dentry))) {
- /*
- * We don't need to lock updates but journal_flush() could
- * otherwise be livelocked...
- */
- journal_lock_updates(EXT3_SB(sb)->s_journal);
- err = journal_flush(EXT3_SB(sb)->s_journal);
- journal_unlock_updates(EXT3_SB(sb)->s_journal);
- if (err)
- return err;
- }
-
- return dquot_quota_on(sb, type, format_id, path);
-}
-
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races */
-static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
- size_t len, loff_t off)
-{
- struct inode *inode = sb_dqopt(sb)->files[type];
- sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
- int err = 0;
- int offset = off & (sb->s_blocksize - 1);
- int tocopy;
- size_t toread;
- struct buffer_head *bh;
- loff_t i_size = i_size_read(inode);
-
- if (off > i_size)
- return 0;
- if (off+len > i_size)
- len = i_size-off;
- toread = len;
- while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
- bh = ext3_bread(NULL, inode, blk, 0, &err);
- if (err)
- return err;
- if (!bh) /* A hole? */
- memset(data, 0, tocopy);
- else
- memcpy(data, bh->b_data+offset, tocopy);
- brelse(bh);
- offset = 0;
- toread -= tocopy;
- data += tocopy;
- blk++;
- }
- return len;
-}
-
-/* Write to quotafile (we know the transaction is already started and has
- * enough credits) */
-static ssize_t ext3_quota_write(struct super_block *sb, int type,
- const char *data, size_t len, loff_t off)
-{
- struct inode *inode = sb_dqopt(sb)->files[type];
- sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
- int err = 0;
- int offset = off & (sb->s_blocksize - 1);
- int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
- struct buffer_head *bh;
- handle_t *handle = journal_current_handle();
-
- if (!handle) {
- ext3_msg(sb, KERN_WARNING,
- "warning: quota write (off=%llu, len=%llu)"
- " cancelled because transaction is not started.",
- (unsigned long long)off, (unsigned long long)len);
- return -EIO;
- }
-
- /*
- * Since we account only one data block in transaction credits,
- * then it is impossible to cross a block boundary.
- */
- if (sb->s_blocksize - offset < len) {
- ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
- " cancelled because not block aligned",
- (unsigned long long)off, (unsigned long long)len);
- return -EIO;
- }
- bh = ext3_bread(handle, inode, blk, 1, &err);
- if (!bh)
- goto out;
- if (journal_quota) {
- err = ext3_journal_get_write_access(handle, bh);
- if (err) {
- brelse(bh);
- goto out;
- }
- }
- lock_buffer(bh);
- memcpy(bh->b_data+offset, data, len);
- flush_dcache_page(bh->b_page);
- unlock_buffer(bh);
- if (journal_quota)
- err = ext3_journal_dirty_metadata(handle, bh);
- else {
- /* Always do at least ordered writes for quotas */
- err = ext3_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
- }
- brelse(bh);
-out:
- if (err)
- return err;
- if (inode->i_size < off + len) {
- i_size_write(inode, off + len);
- EXT3_I(inode)->i_disksize = inode->i_size;
- }
- inode->i_version++;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- ext3_mark_inode_dirty(handle, inode);
- return len;
-}
-
-#endif
-
-static struct dentry *ext3_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
-}
-
-static struct file_system_type ext3_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext3",
- .mount = ext3_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("ext3");
-
-static int __init init_ext3_fs(void)
-{
- int err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
- if (err)
- goto out1;
- err = register_filesystem(&ext3_fs_type);
- if (err)
- goto out;
- return 0;
-out:
- destroy_inodecache();
-out1:
- exit_ext3_xattr();
- return err;
-}
-
-static void __exit exit_ext3_fs(void)
-{
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-}
-
-MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
-MODULE_LICENSE("GPL");
-module_init(init_ext3_fs)
-module_exit(exit_ext3_fs)
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
deleted file mode 100644
index c08c590..0000000
--- a/fs/ext3/symlink.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * linux/fs/ext3/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/symlink.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext3 symlink handling code
- */
-
-#include "ext3.h"
-#include "xattr.h"
-
-const struct inode_operations ext3_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
- .setattr = ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ext3_listxattr,
- .removexattr = generic_removexattr,
-#endif
-};
-
-const struct inode_operations ext3_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = simple_follow_link,
- .setattr = ext3_setattr,
-#ifdef CONFIG_EXT3_FS_XATTR
- .setxattr = generic_setxattr,
- .getxattr = generic_getxattr,
- .listxattr = ext3_listxattr,
- .removexattr = generic_removexattr,
-#endif
-};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
deleted file mode 100644
index 7cf3650..0000000
--- a/fs/ext3/xattr.c
+++ /dev/null
@@ -1,1330 +0,0 @@
-/*
- * linux/fs/ext3/xattr.c
- *
- * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
- *
- * Fix by Harrison Xing <harrison@mountainviewdata.com>.
- * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
- * Extended attributes for symlinks and special files added per
- * suggestion of Luka Renko <luka.renko@hermes.si>.
- * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
- * Red Hat Inc.
- * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
- * and Andreas Gruenbacher <agruen@suse.de>.
- */
-
-/*
- * Extended attributes are stored directly in inodes (on file systems with
- * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
- * field contains the block number if an inode uses an additional block. All
- * attributes must fit in the inode and one additional block. Blocks that
- * contain the identical set of attributes may be shared among several inodes.
- * Identical blocks are detected by keeping a cache of blocks that have
- * recently been accessed.
- *
- * The attributes in inodes and on blocks have a different header; the entries
- * are stored in the same format:
- *
- * +------------------+
- * | header |
- * | entry 1 | |
- * | entry 2 | | growing downwards
- * | entry 3 | v
- * | four null bytes |
- * | . . . |
- * | value 1 | ^
- * | value 3 | | growing upwards
- * | value 2 | |
- * +------------------+
- *
- * The header is followed by multiple entry descriptors. In disk blocks, the
- * entry descriptors are kept sorted. In inodes, they are unsorted. The
- * attribute values are aligned to the end of the block in no specific order.
- *
- * Locking strategy
- * ----------------
- * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
- * EA blocks are only changed if they are exclusive to an inode, so
- * holding xattr_sem also means that nothing but the EA block's reference
- * count can change. Multiple writers to the same block are synchronized
- * by the buffer lock.
- */
-
-#include "ext3.h"
-#include <linux/mbcache.h>
-#include <linux/quotaops.h>
-#include "xattr.h"
-#include "acl.h"
-
-#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
-#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
-#define BFIRST(bh) ENTRY(BHDR(bh)+1)
-#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-
-#define IHDR(inode, raw_inode) \
- ((struct ext3_xattr_ibody_header *) \
- ((void *)raw_inode + \
- EXT3_GOOD_OLD_INODE_SIZE + \
- EXT3_I(inode)->i_extra_isize))
-#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))
-
-#ifdef EXT3_XATTR_DEBUG
-# define ea_idebug(inode, f...) do { \
- printk(KERN_DEBUG "inode %s:%lu: ", \
- inode->i_sb->s_id, inode->i_ino); \
- printk(f); \
- printk("\n"); \
- } while (0)
-# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
- printk(f); \
- printk("\n"); \
- } while (0)
-#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
-#endif
-
-static void ext3_xattr_cache_insert(struct buffer_head *);
-static struct buffer_head *ext3_xattr_cache_find(struct inode *,
- struct ext3_xattr_header *,
- struct mb_cache_entry **);
-static void ext3_xattr_rehash(struct ext3_xattr_header *,
- struct ext3_xattr_entry *);
-static int ext3_xattr_list(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-
-static struct mb_cache *ext3_xattr_cache;
-
-static const struct xattr_handler *ext3_xattr_handler_map[] = {
- [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
- [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
-#endif
- [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
-#ifdef CONFIG_EXT3_FS_SECURITY
- [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler,
-#endif
-};
-
-const struct xattr_handler *ext3_xattr_handlers[] = {
- &ext3_xattr_user_handler,
- &ext3_xattr_trusted_handler,
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
-#ifdef CONFIG_EXT3_FS_SECURITY
- &ext3_xattr_security_handler,
-#endif
- NULL
-};
-
-static inline const struct xattr_handler *
-ext3_xattr_handler(int name_index)
-{
- const struct xattr_handler *handler = NULL;
-
- if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
- handler = ext3_xattr_handler_map[name_index];
- return handler;
-}
-
-/*
- * Inode operation listxattr()
- *
- * d_inode(dentry)->i_mutex: don't care
- */
-ssize_t
-ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- return ext3_xattr_list(dentry, buffer, size);
-}
-
-static int
-ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
-{
- while (!IS_LAST_ENTRY(entry)) {
- struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
- if ((void *)next >= end)
- return -EIO;
- entry = next;
- }
- return 0;
-}
-
-static inline int
-ext3_xattr_check_block(struct buffer_head *bh)
-{
- int error;
-
- if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
- BHDR(bh)->h_blocks != cpu_to_le32(1))
- return -EIO;
- error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
- return error;
-}
-
-static inline int
-ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
-{
- size_t value_size = le32_to_cpu(entry->e_value_size);
-
- if (entry->e_value_block != 0 || value_size > size ||
- le16_to_cpu(entry->e_value_offs) + value_size > size)
- return -EIO;
- return 0;
-}
-
-static int
-ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
- const char *name, size_t size, int sorted)
-{
- struct ext3_xattr_entry *entry;
- size_t name_len;
- int cmp = 1;
-
- if (name == NULL)
- return -EINVAL;
- name_len = strlen(name);
- entry = *pentry;
- for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
- cmp = name_index - entry->e_name_index;
- if (!cmp)
- cmp = name_len - entry->e_name_len;
- if (!cmp)
- cmp = memcmp(name, entry->e_name, name_len);
- if (cmp <= 0 && (sorted || cmp == 0))
- break;
- }
- *pentry = entry;
- if (!cmp && ext3_xattr_check_entry(entry, size))
- return -EIO;
- return cmp ? -ENODATA : 0;
-}
-
-static int
-ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size)
-{
- struct buffer_head *bh = NULL;
- struct ext3_xattr_entry *entry;
- size_t size;
- int error;
-
- ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
- name_index, name, buffer, (long)buffer_size);
-
- error = -ENODATA;
- if (!EXT3_I(inode)->i_file_acl)
- goto cleanup;
- ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
- bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
- if (!bh)
- goto cleanup;
- ea_bdebug(bh, "b_count=%d, refcount=%d",
- atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext3_xattr_check_block(bh)) {
-bad_block: ext3_error(inode->i_sb, __func__,
- "inode %lu: bad block "E3FSBLK, inode->i_ino,
- EXT3_I(inode)->i_file_acl);
- error = -EIO;
- goto cleanup;
- }
- ext3_xattr_cache_insert(bh);
- entry = BFIRST(bh);
- error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EIO)
- goto bad_block;
- if (error)
- goto cleanup;
- size = le32_to_cpu(entry->e_value_size);
- if (buffer) {
- error = -ERANGE;
- if (size > buffer_size)
- goto cleanup;
- memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
- size);
- }
- error = size;
-
-cleanup:
- brelse(bh);
- return error;
-}
-
-static int
-ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size)
-{
- struct ext3_xattr_ibody_header *header;
- struct ext3_xattr_entry *entry;
- struct ext3_inode *raw_inode;
- struct ext3_iloc iloc;
- size_t size;
- void *end;
- int error;
-
- if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
- return -ENODATA;
- error = ext3_get_inode_loc(inode, &iloc);
- if (error)
- return error;
- raw_inode = ext3_raw_inode(&iloc);
- header = IHDR(inode, raw_inode);
- entry = IFIRST(header);
- end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
- error = ext3_xattr_check_names(entry, end);
- if (error)
- goto cleanup;
- error = ext3_xattr_find_entry(&entry, name_index, name,
- end - (void *)entry, 0);
- if (error)
- goto cleanup;
- size = le32_to_cpu(entry->e_value_size);
- if (buffer) {
- error = -ERANGE;
- if (size > buffer_size)
- goto cleanup;
- memcpy(buffer, (void *)IFIRST(header) +
- le16_to_cpu(entry->e_value_offs), size);
- }
- error = size;
-
-cleanup:
- brelse(iloc.bh);
- return error;
-}
-
-/*
- * ext3_xattr_get()
- *
- * Copy an extended attribute into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-int
-ext3_xattr_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t buffer_size)
-{
- int error;
-
- down_read(&EXT3_I(inode)->xattr_sem);
- error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
- buffer_size);
- if (error == -ENODATA)
- error = ext3_xattr_block_get(inode, name_index, name, buffer,
- buffer_size);
- up_read(&EXT3_I(inode)->xattr_sem);
- return error;
-}
-
-static int
-ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
- char *buffer, size_t buffer_size)
-{
- size_t rest = buffer_size;
-
- for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
- const struct xattr_handler *handler =
- ext3_xattr_handler(entry->e_name_index);
-
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
- if (buffer) {
- if (size > rest)
- return -ERANGE;
- buffer += size;
- }
- rest -= size;
- }
- }
- return buffer_size - rest;
-}
-
-static int
-ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct inode *inode = d_inode(dentry);
- struct buffer_head *bh = NULL;
- int error;
-
- ea_idebug(inode, "buffer=%p, buffer_size=%ld",
- buffer, (long)buffer_size);
-
- error = 0;
- if (!EXT3_I(inode)->i_file_acl)
- goto cleanup;
- ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
- bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
- error = -EIO;
- if (!bh)
- goto cleanup;
- ea_bdebug(bh, "b_count=%d, refcount=%d",
- atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
- if (ext3_xattr_check_block(bh)) {
- ext3_error(inode->i_sb, __func__,
- "inode %lu: bad block "E3FSBLK, inode->i_ino,
- EXT3_I(inode)->i_file_acl);
- error = -EIO;
- goto cleanup;
- }
- ext3_xattr_cache_insert(bh);
- error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
-
-cleanup:
- brelse(bh);
-
- return error;
-}
-
-static int
-ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct inode *inode = d_inode(dentry);
- struct ext3_xattr_ibody_header *header;
- struct ext3_inode *raw_inode;
- struct ext3_iloc iloc;
- void *end;
- int error;
-
- if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
- return 0;
- error = ext3_get_inode_loc(inode, &iloc);
- if (error)
- return error;
- raw_inode = ext3_raw_inode(&iloc);
- header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
- error = ext3_xattr_check_names(IFIRST(header), end);
- if (error)
- goto cleanup;
- error = ext3_xattr_list_entries(dentry, IFIRST(header),
- buffer, buffer_size);
-
-cleanup:
- brelse(iloc.bh);
- return error;
-}
-
-/*
- * ext3_xattr_list()
- *
- * Copy a list of attribute names into the buffer
- * provided, or compute the buffer size required.
- * Buffer is NULL to compute the size of the buffer required.
- *
- * Returns a negative error number on failure, or the number of bytes
- * used / required on success.
- */
-static int
-ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- int i_error, b_error;
-
- down_read(&EXT3_I(d_inode(dentry))->xattr_sem);
- i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
- if (i_error < 0) {
- b_error = 0;
- } else {
- if (buffer) {
- buffer += i_error;
- buffer_size -= i_error;
- }
- b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
- if (b_error < 0)
- i_error = 0;
- }
- up_read(&EXT3_I(d_inode(dentry))->xattr_sem);
- return i_error + b_error;
-}
-
-/*
- * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
- * not set, set it.
- */
-static void ext3_xattr_update_super_block(handle_t *handle,
- struct super_block *sb)
-{
- if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
- return;
-
- if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
- EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
- ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
- }
-}
-
-/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
- */
-static void
-ext3_xattr_release_block(handle_t *handle, struct inode *inode,
- struct buffer_head *bh)
-{
- struct mb_cache_entry *ce = NULL;
- int error = 0;
-
- ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
- error = ext3_journal_get_write_access(handle, bh);
- if (error)
- goto out;
-
- lock_buffer(bh);
-
- if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
- ea_bdebug(bh, "refcount now=0; freeing");
- if (ce)
- mb_cache_entry_free(ce);
- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
- get_bh(bh);
- ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
- } else {
- le32_add_cpu(&BHDR(bh)->h_refcount, -1);
- error = ext3_journal_dirty_metadata(handle, bh);
- if (IS_SYNC(inode))
- handle->h_sync = 1;
- dquot_free_block(inode, 1);
- ea_bdebug(bh, "refcount now=%d; releasing",
- le32_to_cpu(BHDR(bh)->h_refcount));
- if (ce)
- mb_cache_entry_release(ce);
- }
- unlock_buffer(bh);
-out:
- ext3_std_error(inode->i_sb, error);
- return;
-}
-
-struct ext3_xattr_info {
- int name_index;
- const char *name;
- const void *value;
- size_t value_len;
-};
-
-struct ext3_xattr_search {
- struct ext3_xattr_entry *first;
- void *base;
- void *end;
- struct ext3_xattr_entry *here;
- int not_found;
-};
-
-static int
-ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
-{
- struct ext3_xattr_entry *last;
- size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
-
- /* Compute min_offs and last. */
- last = s->first;
- for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
- if (!last->e_value_block && last->e_value_size) {
- size_t offs = le16_to_cpu(last->e_value_offs);
- if (offs < min_offs)
- min_offs = offs;
- }
- }
- free = min_offs - ((void *)last - s->base) - sizeof(__u32);
- if (!s->not_found) {
- if (!s->here->e_value_block && s->here->e_value_size) {
- size_t size = le32_to_cpu(s->here->e_value_size);
- free += EXT3_XATTR_SIZE(size);
- }
- free += EXT3_XATTR_LEN(name_len);
- }
- if (i->value) {
- if (free < EXT3_XATTR_LEN(name_len) +
- EXT3_XATTR_SIZE(i->value_len))
- return -ENOSPC;
- }
-
- if (i->value && s->not_found) {
- /* Insert the new name. */
- size_t size = EXT3_XATTR_LEN(name_len);
- size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
- memmove((void *)s->here + size, s->here, rest);
- memset(s->here, 0, size);
- s->here->e_name_index = i->name_index;
- s->here->e_name_len = name_len;
- memcpy(s->here->e_name, i->name, name_len);
- } else {
- if (!s->here->e_value_block && s->here->e_value_size) {
- void *first_val = s->base + min_offs;
- size_t offs = le16_to_cpu(s->here->e_value_offs);
- void *val = s->base + offs;
- size_t size = EXT3_XATTR_SIZE(
- le32_to_cpu(s->here->e_value_size));
-
- if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
- /* The old and the new value have the same
- size. Just replace. */
- s->here->e_value_size =
- cpu_to_le32(i->value_len);
- memset(val + size - EXT3_XATTR_PAD, 0,
- EXT3_XATTR_PAD); /* Clear pad bytes. */
- memcpy(val, i->value, i->value_len);
- return 0;
- }
-
- /* Remove the old value. */
- memmove(first_val + size, first_val, val - first_val);
- memset(first_val, 0, size);
- s->here->e_value_size = 0;
- s->here->e_value_offs = 0;
- min_offs += size;
-
- /* Adjust all value offsets. */
- last = s->first;
- while (!IS_LAST_ENTRY(last)) {
- size_t o = le16_to_cpu(last->e_value_offs);
- if (!last->e_value_block &&
- last->e_value_size && o < offs)
- last->e_value_offs =
- cpu_to_le16(o + size);
- last = EXT3_XATTR_NEXT(last);
- }
- }
- if (!i->value) {
- /* Remove the old name. */
- size_t size = EXT3_XATTR_LEN(name_len);
- last = ENTRY((void *)last - size);
- memmove(s->here, (void *)s->here + size,
- (void *)last - (void *)s->here + sizeof(__u32));
- memset(last, 0, size);
- }
- }
-
- if (i->value) {
- /* Insert the new value. */
- s->here->e_value_size = cpu_to_le32(i->value_len);
- if (i->value_len) {
- size_t size = EXT3_XATTR_SIZE(i->value_len);
- void *val = s->base + min_offs - size;
- s->here->e_value_offs = cpu_to_le16(min_offs - size);
- memset(val + size - EXT3_XATTR_PAD, 0,
- EXT3_XATTR_PAD); /* Clear the pad bytes. */
- memcpy(val, i->value, i->value_len);
- }
- }
- return 0;
-}
-
-struct ext3_xattr_block_find {
- struct ext3_xattr_search s;
- struct buffer_head *bh;
-};
-
-static int
-ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
- struct ext3_xattr_block_find *bs)
-{
- struct super_block *sb = inode->i_sb;
- int error;
-
- ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
- i->name_index, i->name, i->value, (long)i->value_len);
-
- if (EXT3_I(inode)->i_file_acl) {
- /* The inode already has an extended attribute block. */
- bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
- error = -EIO;
- if (!bs->bh)
- goto cleanup;
- ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
- atomic_read(&(bs->bh->b_count)),
- le32_to_cpu(BHDR(bs->bh)->h_refcount));
- if (ext3_xattr_check_block(bs->bh)) {
- ext3_error(sb, __func__,
- "inode %lu: bad block "E3FSBLK, inode->i_ino,
- EXT3_I(inode)->i_file_acl);
- error = -EIO;
- goto cleanup;
- }
- /* Find the named attribute. */
- bs->s.base = BHDR(bs->bh);
- bs->s.first = BFIRST(bs->bh);
- bs->s.end = bs->bh->b_data + bs->bh->b_size;
- bs->s.here = bs->s.first;
- error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
- i->name, bs->bh->b_size, 1);
- if (error && error != -ENODATA)
- goto cleanup;
- bs->s.not_found = error;
- }
- error = 0;
-
-cleanup:
- return error;
-}
-
-static int
-ext3_xattr_block_set(handle_t *handle, struct inode *inode,
- struct ext3_xattr_info *i,
- struct ext3_xattr_block_find *bs)
-{
- struct super_block *sb = inode->i_sb;
- struct buffer_head *new_bh = NULL;
- struct ext3_xattr_search *s = &bs->s;
- struct mb_cache_entry *ce = NULL;
- int error = 0;
-
-#define header(x) ((struct ext3_xattr_header *)(x))
-
- if (i->value && i->value_len > sb->s_blocksize)
- return -ENOSPC;
- if (s->base) {
- ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
- bs->bh->b_blocknr);
- error = ext3_journal_get_write_access(handle, bs->bh);
- if (error)
- goto cleanup;
- lock_buffer(bs->bh);
-
- if (header(s->base)->h_refcount == cpu_to_le32(1)) {
- if (ce) {
- mb_cache_entry_free(ce);
- ce = NULL;
- }
- ea_bdebug(bs->bh, "modifying in-place");
- error = ext3_xattr_set_entry(i, s);
- if (!error) {
- if (!IS_LAST_ENTRY(s->first))
- ext3_xattr_rehash(header(s->base),
- s->here);
- ext3_xattr_cache_insert(bs->bh);
- }
- unlock_buffer(bs->bh);
- if (error == -EIO)
- goto bad_block;
- if (!error)
- error = ext3_journal_dirty_metadata(handle,
- bs->bh);
- if (error)
- goto cleanup;
- goto inserted;
- } else {
- int offset = (char *)s->here - bs->bh->b_data;
-
- unlock_buffer(bs->bh);
- journal_release_buffer(handle, bs->bh);
-
- if (ce) {
- mb_cache_entry_release(ce);
- ce = NULL;
- }
- ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
- error = -ENOMEM;
- if (s->base == NULL)
- goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
- s->first = ENTRY(header(s->base)+1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->here = ENTRY(s->base + offset);
- s->end = s->base + bs->bh->b_size;
- }
- } else {
- /* Allocate a buffer where we construct the new block. */
- s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
- /* assert(header == s->base) */
- error = -ENOMEM;
- if (s->base == NULL)
- goto cleanup;
- header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
- header(s->base)->h_blocks = cpu_to_le32(1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->first = ENTRY(header(s->base)+1);
- s->here = ENTRY(header(s->base)+1);
- s->end = s->base + sb->s_blocksize;
- }
-
- error = ext3_xattr_set_entry(i, s);
- if (error == -EIO)
- goto bad_block;
- if (error)
- goto cleanup;
- if (!IS_LAST_ENTRY(s->first))
- ext3_xattr_rehash(header(s->base), s->here);
-
-inserted:
- if (!IS_LAST_ENTRY(s->first)) {
- new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
- if (new_bh) {
- /* We found an identical block in the cache. */
- if (new_bh == bs->bh)
- ea_bdebug(new_bh, "keeping");
- else {
- /* The old block is released after updating
- the inode. */
- error = dquot_alloc_block(inode, 1);
- if (error)
- goto cleanup;
- error = ext3_journal_get_write_access(handle,
- new_bh);
- if (error)
- goto cleanup_dquot;
- lock_buffer(new_bh);
- le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
- ea_bdebug(new_bh, "reusing; refcount now=%d",
- le32_to_cpu(BHDR(new_bh)->h_refcount));
- unlock_buffer(new_bh);
- error = ext3_journal_dirty_metadata(handle,
- new_bh);
- if (error)
- goto cleanup_dquot;
- }
- mb_cache_entry_release(ce);
- ce = NULL;
- } else if (bs->bh && s->base == bs->bh->b_data) {
- /* We were modifying this block in-place. */
- ea_bdebug(bs->bh, "keeping this block");
- new_bh = bs->bh;
- get_bh(new_bh);
- } else {
- /* We need to allocate a new block */
- ext3_fsblk_t goal = ext3_group_first_block_no(sb,
- EXT3_I(inode)->i_block_group);
- ext3_fsblk_t block;
-
- /*
- * Protect us agaist concurrent allocations to the
- * same inode from ext3_..._writepage(). Reservation
- * code does not expect racing allocations.
- */
- mutex_lock(&EXT3_I(inode)->truncate_mutex);
- block = ext3_new_block(handle, inode, goal, &error);
- mutex_unlock(&EXT3_I(inode)->truncate_mutex);
- if (error)
- goto cleanup;
- ea_idebug(inode, "creating block %d", block);
-
- new_bh = sb_getblk(sb, block);
- if (unlikely(!new_bh)) {
-getblk_failed:
- ext3_free_blocks(handle, inode, block, 1);
- error = -ENOMEM;
- goto cleanup;
- }
- lock_buffer(new_bh);
- error = ext3_journal_get_create_access(handle, new_bh);
- if (error) {
- unlock_buffer(new_bh);
- goto getblk_failed;
- }
- memcpy(new_bh->b_data, s->base, new_bh->b_size);
- set_buffer_uptodate(new_bh);
- unlock_buffer(new_bh);
- ext3_xattr_cache_insert(new_bh);
- error = ext3_journal_dirty_metadata(handle, new_bh);
- if (error)
- goto cleanup;
- }
- }
-
- /* Update the inode. */
- EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
-
- /* Drop the previous xattr block. */
- if (bs->bh && bs->bh != new_bh)
- ext3_xattr_release_block(handle, inode, bs->bh);
- error = 0;
-
-cleanup:
- if (ce)
- mb_cache_entry_release(ce);
- brelse(new_bh);
- if (!(bs->bh && s->base == bs->bh->b_data))
- kfree(s->base);
-
- return error;
-
-cleanup_dquot:
- dquot_free_block(inode, 1);
- goto cleanup;
-
-bad_block:
- ext3_error(inode->i_sb, __func__,
- "inode %lu: bad block "E3FSBLK, inode->i_ino,
- EXT3_I(inode)->i_file_acl);
- goto cleanup;
-
-#undef header
-}
-
-struct ext3_xattr_ibody_find {
- struct ext3_xattr_search s;
- struct ext3_iloc iloc;
-};
-
-static int
-ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
- struct ext3_xattr_ibody_find *is)
-{
- struct ext3_xattr_ibody_header *header;
- struct ext3_inode *raw_inode;
- int error;
-
- if (EXT3_I(inode)->i_extra_isize == 0)
- return 0;
- raw_inode = ext3_raw_inode(&is->iloc);
- header = IHDR(inode, raw_inode);
- is->s.base = is->s.first = IFIRST(header);
- is->s.here = is->s.first;
- is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
- if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
- error = ext3_xattr_check_names(IFIRST(header), is->s.end);
- if (error)
- return error;
- /* Find the named attribute. */
- error = ext3_xattr_find_entry(&is->s.here, i->name_index,
- i->name, is->s.end -
- (void *)is->s.base, 0);
- if (error && error != -ENODATA)
- return error;
- is->s.not_found = error;
- }
- return 0;
-}
-
-static int
-ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext3_xattr_info *i,
- struct ext3_xattr_ibody_find *is)
-{
- struct ext3_xattr_ibody_header *header;
- struct ext3_xattr_search *s = &is->s;
- int error;
-
- if (EXT3_I(inode)->i_extra_isize == 0)
- return -ENOSPC;
- error = ext3_xattr_set_entry(i, s);
- if (error)
- return error;
- header = IHDR(inode, ext3_raw_inode(&is->iloc));
- if (!IS_LAST_ENTRY(s->first)) {
- header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
- ext3_set_inode_state(inode, EXT3_STATE_XATTR);
- } else {
- header->h_magic = cpu_to_le32(0);
- ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
- }
- return 0;
-}
-
-/*
- * ext3_xattr_set_handle()
- *
- * Create, replace or remove an extended attribute for this inode. Value
- * is NULL to remove an existing extended attribute, and non-NULL to
- * either replace an existing extended attribute, or create a new extended
- * attribute. The flags XATTR_REPLACE and XATTR_CREATE
- * specify that an extended attribute must exist and must not exist
- * previous to the call, respectively.
- *
- * Returns 0, or a negative error number on failure.
- */
-int
-ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
- const char *name, const void *value, size_t value_len,
- int flags)
-{
- struct ext3_xattr_info i = {
- .name_index = name_index,
- .name = name,
- .value = value,
- .value_len = value_len,
-
- };
- struct ext3_xattr_ibody_find is = {
- .s = { .not_found = -ENODATA, },
- };
- struct ext3_xattr_block_find bs = {
- .s = { .not_found = -ENODATA, },
- };
- int error;
-
- if (!name)
- return -EINVAL;
- if (strlen(name) > 255)
- return -ERANGE;
- down_write(&EXT3_I(inode)->xattr_sem);
- error = ext3_get_inode_loc(inode, &is.iloc);
- if (error)
- goto cleanup;
-
- error = ext3_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
-
- if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
- struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
- memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
- ext3_clear_inode_state(inode, EXT3_STATE_NEW);
- }
-
- error = ext3_xattr_ibody_find(inode, &i, &is);
- if (error)
- goto cleanup;
- if (is.s.not_found)
- error = ext3_xattr_block_find(inode, &i, &bs);
- if (error)
- goto cleanup;
- if (is.s.not_found && bs.s.not_found) {
- error = -ENODATA;
- if (flags & XATTR_REPLACE)
- goto cleanup;
- error = 0;
- if (!value)
- goto cleanup;
- } else {
- error = -EEXIST;
- if (flags & XATTR_CREATE)
- goto cleanup;
- }
- if (!value) {
- if (!is.s.not_found)
- error = ext3_xattr_ibody_set(handle, inode, &i, &is);
- else if (!bs.s.not_found)
- error = ext3_xattr_block_set(handle, inode, &i, &bs);
- } else {
- error = ext3_xattr_ibody_set(handle, inode, &i, &is);
- if (!error && !bs.s.not_found) {
- i.value = NULL;
- error = ext3_xattr_block_set(handle, inode, &i, &bs);
- } else if (error == -ENOSPC) {
- if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
- error = ext3_xattr_block_find(inode, &i, &bs);
- if (error)
- goto cleanup;
- }
- error = ext3_xattr_block_set(handle, inode, &i, &bs);
- if (error)
- goto cleanup;
- if (!is.s.not_found) {
- i.value = NULL;
- error = ext3_xattr_ibody_set(handle, inode, &i,
- &is);
- }
- }
- }
- if (!error) {
- ext3_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = CURRENT_TIME_SEC;
- error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
- /*
- * The bh is consumed by ext3_mark_iloc_dirty, even with
- * error != 0.
- */
- is.iloc.bh = NULL;
- if (IS_SYNC(inode))
- handle->h_sync = 1;
- }
-
-cleanup:
- brelse(is.iloc.bh);
- brelse(bs.bh);
- up_write(&EXT3_I(inode)->xattr_sem);
- return error;
-}
-
-/*
- * ext3_xattr_set()
- *
- * Like ext3_xattr_set_handle, but start from an inode. This extended
- * attribute modification is a filesystem transaction by itself.
- *
- * Returns 0, or a negative error number on failure.
- */
-int
-ext3_xattr_set(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len, int flags)
-{
- handle_t *handle;
- int error, retries = 0;
-
-retry:
- handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- } else {
- int error2;
-
- error = ext3_xattr_set_handle(handle, inode, name_index, name,
- value, value_len, flags);
- error2 = ext3_journal_stop(handle);
- if (error == -ENOSPC &&
- ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- if (error == 0)
- error = error2;
- }
-
- return error;
-}
-
-/*
- * ext3_xattr_delete_inode()
- *
- * Free extended attribute resources associated with this inode. This
- * is called immediately before an inode is freed. We have exclusive
- * access to the inode.
- */
-void
-ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
- struct buffer_head *bh = NULL;
-
- if (!EXT3_I(inode)->i_file_acl)
- goto cleanup;
- bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
- if (!bh) {
- ext3_error(inode->i_sb, __func__,
- "inode %lu: block "E3FSBLK" read error", inode->i_ino,
- EXT3_I(inode)->i_file_acl);
- goto cleanup;
- }
- if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
- BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- ext3_error(inode->i_sb, __func__,
- "inode %lu: bad block "E3FSBLK, inode->i_ino,
- EXT3_I(inode)->i_file_acl);
- goto cleanup;
- }
- ext3_xattr_release_block(handle, inode, bh);
- EXT3_I(inode)->i_file_acl = 0;
-
-cleanup:
- brelse(bh);
-}
-
-/*
- * ext3_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext3_xattr_put_super(struct super_block *sb)
-{
- mb_cache_shrink(sb->s_bdev);
-}
-
-/*
- * ext3_xattr_cache_insert()
- *
- * Create a new entry in the extended attribute cache, and insert
- * it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
- */
-static void
-ext3_xattr_cache_insert(struct buffer_head *bh)
-{
- __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
- struct mb_cache_entry *ce;
- int error;
-
- ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
- if (!ce) {
- ea_bdebug(bh, "out of memory");
- return;
- }
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
- if (error) {
- mb_cache_entry_free(ce);
- if (error == -EBUSY) {
- ea_bdebug(bh, "already in cache");
- error = 0;
- }
- } else {
- ea_bdebug(bh, "inserting [%x]", (int)hash);
- mb_cache_entry_release(ce);
- }
-}
-
-/*
- * ext3_xattr_cmp()
- *
- * Compare two extended attribute blocks for equality.
- *
- * Returns 0 if the blocks are equal, 1 if they differ, and
- * a negative error number on errors.
- */
-static int
-ext3_xattr_cmp(struct ext3_xattr_header *header1,
- struct ext3_xattr_header *header2)
-{
- struct ext3_xattr_entry *entry1, *entry2;
-
- entry1 = ENTRY(header1+1);
- entry2 = ENTRY(header2+1);
- while (!IS_LAST_ENTRY(entry1)) {
- if (IS_LAST_ENTRY(entry2))
- return 1;
- if (entry1->e_hash != entry2->e_hash ||
- entry1->e_name_index != entry2->e_name_index ||
- entry1->e_name_len != entry2->e_name_len ||
- entry1->e_value_size != entry2->e_value_size ||
- memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
- return 1;
- if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
- return -EIO;
- if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
- (char *)header2 + le16_to_cpu(entry2->e_value_offs),
- le32_to_cpu(entry1->e_value_size)))
- return 1;
-
- entry1 = EXT3_XATTR_NEXT(entry1);
- entry2 = EXT3_XATTR_NEXT(entry2);
- }
- if (!IS_LAST_ENTRY(entry2))
- return 1;
- return 0;
-}
-
-/*
- * ext3_xattr_cache_find()
- *
- * Find an identical extended attribute block.
- *
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
- */
-static struct buffer_head *
-ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
- struct mb_cache_entry **pce)
-{
- __u32 hash = le32_to_cpu(header->h_hash);
- struct mb_cache_entry *ce;
-
- if (!header->h_hash)
- return NULL; /* never share */
- ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
- ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
- hash);
- while (ce) {
- struct buffer_head *bh;
-
- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
- bh = sb_bread(inode->i_sb, ce->e_block);
- if (!bh) {
- ext3_error(inode->i_sb, __func__,
- "inode %lu: block %lu read error",
- inode->i_ino, (unsigned long) ce->e_block);
- } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
- EXT3_XATTR_REFCOUNT_MAX) {
- ea_idebug(inode, "block %lu refcount %d>=%d",
- (unsigned long) ce->e_block,
- le32_to_cpu(BHDR(bh)->h_refcount),
- EXT3_XATTR_REFCOUNT_MAX);
- } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
- *pce = ce;
- return bh;
- }
- brelse(bh);
- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
- }
- return NULL;
-}
-
-#define NAME_HASH_SHIFT 5
-#define VALUE_HASH_SHIFT 16
-
-/*
- * ext3_xattr_hash_entry()
- *
- * Compute the hash of an extended attribute.
- */
-static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
- struct ext3_xattr_entry *entry)
-{
- __u32 hash = 0;
- char *name = entry->e_name;
- int n;
-
- for (n=0; n < entry->e_name_len; n++) {
- hash = (hash << NAME_HASH_SHIFT) ^
- (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
- *name++;
- }
-
- if (entry->e_value_block == 0 && entry->e_value_size != 0) {
- __le32 *value = (__le32 *)((char *)header +
- le16_to_cpu(entry->e_value_offs));
- for (n = (le32_to_cpu(entry->e_value_size) +
- EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
- hash = (hash << VALUE_HASH_SHIFT) ^
- (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
- le32_to_cpu(*value++);
- }
- }
- entry->e_hash = cpu_to_le32(hash);
-}
-
-#undef NAME_HASH_SHIFT
-#undef VALUE_HASH_SHIFT
-
-#define BLOCK_HASH_SHIFT 16
-
-/*
- * ext3_xattr_rehash()
- *
- * Re-compute the extended attribute hash value after an entry has changed.
- */
-static void ext3_xattr_rehash(struct ext3_xattr_header *header,
- struct ext3_xattr_entry *entry)
-{
- struct ext3_xattr_entry *here;
- __u32 hash = 0;
-
- ext3_xattr_hash_entry(header, entry);
- here = ENTRY(header+1);
- while (!IS_LAST_ENTRY(here)) {
- if (!here->e_hash) {
- /* Block is not shared if an entry's hash value == 0 */
- hash = 0;
- break;
- }
- hash = (hash << BLOCK_HASH_SHIFT) ^
- (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
- le32_to_cpu(here->e_hash);
- here = EXT3_XATTR_NEXT(here);
- }
- header->h_hash = cpu_to_le32(hash);
-}
-
-#undef BLOCK_HASH_SHIFT
-
-int __init
-init_ext3_xattr(void)
-{
- ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
- if (!ext3_xattr_cache)
- return -ENOMEM;
- return 0;
-}
-
-void
-exit_ext3_xattr(void)
-{
- if (ext3_xattr_cache)
- mb_cache_destroy(ext3_xattr_cache);
- ext3_xattr_cache = NULL;
-}
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
deleted file mode 100644
index 32e93eb..0000000
--- a/fs/ext3/xattr.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- File: fs/ext3/xattr.h
-
- On-disk format of extended attributes for the ext3 filesystem.
-
- (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-*/
-
-#include <linux/xattr.h>
-
-/* Magic value in attribute blocks */
-#define EXT3_XATTR_MAGIC 0xEA020000
-
-/* Maximum number of references to one attribute block */
-#define EXT3_XATTR_REFCOUNT_MAX 1024
-
-/* Name indexes */
-#define EXT3_XATTR_INDEX_USER 1
-#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2
-#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3
-#define EXT3_XATTR_INDEX_TRUSTED 4
-#define EXT3_XATTR_INDEX_LUSTRE 5
-#define EXT3_XATTR_INDEX_SECURITY 6
-
-struct ext3_xattr_header {
- __le32 h_magic; /* magic number for identification */
- __le32 h_refcount; /* reference count */
- __le32 h_blocks; /* number of disk blocks used */
- __le32 h_hash; /* hash value of all attributes */
- __u32 h_reserved[4]; /* zero right now */
-};
-
-struct ext3_xattr_ibody_header {
- __le32 h_magic; /* magic number for identification */
-};
-
-struct ext3_xattr_entry {
- __u8 e_name_len; /* length of name */
- __u8 e_name_index; /* attribute name index */
- __le16 e_value_offs; /* offset in disk block of value */
- __le32 e_value_block; /* disk block attribute is stored on (n/i) */
- __le32 e_value_size; /* size of attribute value */
- __le32 e_hash; /* hash value of name and value */
- char e_name[0]; /* attribute name */
-};
-
-#define EXT3_XATTR_PAD_BITS 2
-#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS)
-#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1)
-#define EXT3_XATTR_LEN(name_len) \
- (((name_len) + EXT3_XATTR_ROUND + \
- sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
-#define EXT3_XATTR_NEXT(entry) \
- ( (struct ext3_xattr_entry *)( \
- (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
-#define EXT3_XATTR_SIZE(size) \
- (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
-
-# ifdef CONFIG_EXT3_FS_XATTR
-
-extern const struct xattr_handler ext3_xattr_user_handler;
-extern const struct xattr_handler ext3_xattr_trusted_handler;
-extern const struct xattr_handler ext3_xattr_security_handler;
-
-extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
-
-extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
-extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
-
-extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
-extern void ext3_xattr_put_super(struct super_block *);
-
-extern int init_ext3_xattr(void);
-extern void exit_ext3_xattr(void);
-
-extern const struct xattr_handler *ext3_xattr_handlers[];
-
-# else /* CONFIG_EXT3_FS_XATTR */
-
-static inline int
-ext3_xattr_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext3_xattr_set(struct inode *inode, int name_index, const char *name,
- const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
- const char *name, const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void
-ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-}
-
-static inline void
-ext3_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext3_xattr(void)
-{
- return 0;
-}
-
-static inline void
-exit_ext3_xattr(void)
-{
-}
-
-#define ext3_xattr_handlers NULL
-
-# endif /* CONFIG_EXT3_FS_XATTR */
-
-#ifdef CONFIG_EXT3_FS_SECURITY
-extern int ext3_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir, const struct qstr *qstr);
-#else
-static inline int ext3_init_security(handle_t *handle, struct inode *inode,
- struct inode *dir, const struct qstr *qstr)
-{
- return 0;
-}
-#endif
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
deleted file mode 100644
index c9506d5..0000000
--- a/fs/ext3/xattr_security.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * linux/fs/ext3/xattr_security.c
- * Handler for storing security labels as extended attributes.
- */
-
-#include <linux/security.h>
-#include "ext3.h"
-#include "xattr.h"
-
-static size_t
-ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int
-ext3_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
- name, buffer, size);
-}
-
-static int
-ext3_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
- name, value, size, flags);
-}
-
-static int ext3_initxattrs(struct inode *inode,
- const struct xattr *xattr_array,
- void *fs_info)
-{
- const struct xattr *xattr;
- handle_t *handle = fs_info;
- int err = 0;
-
- for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- err = ext3_xattr_set_handle(handle, inode,
- EXT3_XATTR_INDEX_SECURITY,
- xattr->name, xattr->value,
- xattr->value_len, 0);
- if (err < 0)
- break;
- }
- return err;
-}
-
-int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
- const struct qstr *qstr)
-{
- return security_inode_init_security(inode, dir, qstr,
- &ext3_initxattrs, handle);
-}
-
-const struct xattr_handler ext3_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = ext3_xattr_security_list,
- .get = ext3_xattr_security_get,
- .set = ext3_xattr_security_set,
-};
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
deleted file mode 100644
index 206cc66..0000000
--- a/fs/ext3/xattr_trusted.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * linux/fs/ext3/xattr_trusted.c
- * Handler for trusted extended attributes.
- *
- * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-#include "ext3.h"
-#include "xattr.h"
-
-static size_t
-ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int
-ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED,
- name, buffer, size);
-}
-
-static int
-ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name,
- value, size, flags);
-}
-
-const struct xattr_handler ext3_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .list = ext3_xattr_trusted_list,
- .get = ext3_xattr_trusted_get,
- .set = ext3_xattr_trusted_set,
-};
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
deleted file mode 100644
index 021508a..0000000
--- a/fs/ext3/xattr_user.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * linux/fs/ext3/xattr_user.c
- * Handler for extended user attributes.
- *
- * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-#include "ext3.h"
-#include "xattr.h"
-
-static size_t
-ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int
-ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return -EOPNOTSUPP;
- return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER,
- name, buffer, size);
-}
-
-static int
-ext3_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return -EOPNOTSUPP;
- return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER,
- name, value, size, flags);
-}
-
-const struct xattr_handler ext3_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .list = ext3_xattr_user_list,
- .get = ext3_xattr_user_get,
- .set = ext3_xattr_user_set,
-};
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index bf8bc8a..b46e9fc 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,5 +1,38 @@
+# Ext3 configs are here for backward compatibility with old configs which may
+# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
+# kernels after the removal of ext3 driver.
+config EXT3_FS
+ tristate "The Extended 3 (ext3) filesystem"
+ # These must match EXT4_FS selects...
+ select EXT4_FS
+ select JBD2
+ select CRC16
+ select CRYPTO
+ select CRYPTO_CRC32C
+ help
+ This config option is here only for backward compatibility. ext3
+ filesystem is now handled by the ext4 driver.
+
+config EXT3_FS_POSIX_ACL
+ bool "Ext3 POSIX Access Control Lists"
+ depends on EXT3_FS
+ select EXT4_FS_POSIX_ACL
+ select FS_POSIX_ACL
+ help
+ This config option is here only for backward compatibility. ext3
+ filesystem is now handled by the ext4 driver.
+
+config EXT3_FS_SECURITY
+ bool "Ext3 Security Labels"
+ depends on EXT3_FS
+ select EXT4_FS_SECURITY
+ help
+ This config option is here only for backward compatibility. ext3
+ filesystem is now handled by the ext4 driver.
+
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
+ # Please update EXT3_FS selects when changing these
select JBD2
select CRC16
select CRYPTO
@@ -16,26 +49,27 @@ config EXT4_FS
up fsck time. For more information, please see the web pages at
http://ext4.wiki.kernel.org.
- The ext4 filesystem will support mounting an ext3
- filesystem; while there will be some performance gains from
- the delayed allocation and inode table readahead, the best
- performance gains will require enabling ext4 features in the
- filesystem, or formatting a new filesystem as an ext4
- filesystem initially.
+ The ext4 filesystem supports mounting an ext3 filesystem; while there
+ are some performance gains from the delayed allocation and inode
+ table readahead, the best performance gains require enabling ext4
+ features in the filesystem using tune2fs, or formatting a new
+ filesystem as an ext4 filesystem initially. Without explicit enabling
+ of ext4 features, the on disk filesystem format stays fully backward
+ compatible.
To compile this file system support as a module, choose M here. The
module will be called ext4.
If unsure, say N.
-config EXT4_USE_FOR_EXT23
- bool "Use ext4 for ext2/ext3 file systems"
+config EXT4_USE_FOR_EXT2
+ bool "Use ext4 for ext2 file systems"
depends on EXT4_FS
- depends on EXT3_FS=n || EXT2_FS=n
+ depends on EXT2_FS=n
default y
help
- Allow the ext4 file system driver code to be used for ext2 or
- ext3 file system mounts. This allows users to reduce their
+ Allow the ext4 file system driver code to be used for ext2
+ file system mounts. This allows users to reduce their
compiled kernel size by using one file system driver for
ext2, ext3, and ext4 file systems.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 75285ea..f52cf54 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o readpage.o
+ xattr_trusted.o inline.o readpage.o sysfs.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cd6ea29..ec0668a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,6 +191,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -203,7 +204,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return -EIO;
+ return -EFSBADCRC;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -213,7 +214,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
start = ext4_group_first_block_no(sb, block_group);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
flex_bg = 1;
/* Set bits for block and inode bitmaps, and inode table */
@@ -322,7 +323,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (ext4_has_feature_flex_bg(sb)) {
/* with FLEX_BG, the inode/block bitmaps and itable
* blocks may not be in the group at all
* so the bitmap validation will be skipped for those groups
@@ -360,42 +361,45 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
return 0;
}
-static void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh)
+static int ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- return;
+ if (buffer_verified(bh))
+ return 0;
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
- blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
- if (unlikely(blk != 0)) {
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+ desc, bh))) {
ext4_unlock_group(sb, block_group);
- ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
- block_group, blk);
+ ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EFSBADCRC;
}
- if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ if (unlikely(blk != 0)) {
ext4_unlock_group(sb, block_group);
- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+ block_group, blk);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EFSCORRUPTED;
}
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
+ return 0;
}
/**
@@ -414,17 +418,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
+ int err;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot get buffer for block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
if (bitmap_uptodate(bh))
@@ -437,7 +442,6 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- int err;
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
@@ -445,7 +449,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
if (err)
- ext4_error(sb, "Checksum bad for grp %u", block_group);
+ goto out;
goto verify;
}
ext4_unlock_group(sb, block_group);
@@ -468,11 +472,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
- ext4_validate_block_bitmap(sb, desc, block_group, bh);
- if (buffer_verified(bh))
- return bh;
+ err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ if (err)
+ goto out;
+ return bh;
+out:
put_bh(bh);
- return NULL;
+ return ERR_PTR(err);
}
/* Returns 0 on success, 1 on error */
@@ -485,32 +491,32 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return 1;
+ return -EFSCORRUPTED;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
- return 1;
+ return -EIO;
}
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
- ext4_validate_block_bitmap(sb, desc, block_group, bh);
- /* ...but check for error just in case errors=continue. */
- return !buffer_verified(bh);
+ return ext4_validate_block_bitmap(sb, desc, block_group, bh);
}
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct buffer_head *bh;
+ int err;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
- if (!bh)
- return NULL;
- if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+ if (IS_ERR(bh))
+ return bh;
+ err = ext4_wait_block_bitmap(sb, block_group, bh);
+ if (err) {
put_bh(bh);
- return NULL;
+ return ERR_PTR(err);
}
return bh;
}
@@ -681,8 +687,10 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
- if (bitmap_bh == NULL)
+ if (IS_ERR(bitmap_bh)) {
+ bitmap_bh = NULL;
continue;
+ }
x = ext4_count_free(bitmap_bh->b_data,
EXT4_CLUSTERS_PER_GROUP(sb) / 8);
@@ -740,14 +748,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
if (group == 0)
return 1;
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (ext4_has_feature_sparse_super2(sb)) {
if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
group == le32_to_cpu(es->s_backup_bgs[1]))
return 1;
return 0;
}
- if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ if ((group <= 1) || !ext4_has_feature_sparse_super(sb))
return 1;
if (!(group & 1))
return 0;
@@ -776,7 +783,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
if (!ext4_bg_has_super(sb, group))
return 0;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ if (ext4_has_feature_meta_bg(sb))
return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
else
return EXT4_SB(sb)->s_gdb_count;
@@ -797,8 +804,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
- metagroup < first_meta_bg)
+ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg)
return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
@@ -818,7 +824,7 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
/* Check for superblock and gdt backups in this group */
num = ext4_bg_has_super(sb, block_group);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ if (!ext4_has_feature_meta_bg(sb) ||
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3522340..02ddec6 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -234,7 +234,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
es->s_last_error_block = cpu_to_le64(blk);
ext4_error_inode(inode, function, line, blk,
"invalid block");
- return -EIO;
+ return -EFSCORRUPTED;
}
}
return 0;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 4573155..1a08350 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -253,8 +253,7 @@ typedef enum {
EXT4_ENCRYPT,
} ext4_direction_t;
-static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
- struct inode *inode,
+static int ext4_page_crypto(struct inode *inode,
ext4_direction_t rw,
pgoff_t index,
struct page *src_page,
@@ -296,7 +295,6 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
else
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -353,7 +351,7 @@ struct page *ext4_encrypt(struct inode *inode,
if (IS_ERR(ciphertext_page))
goto errout;
ctx->w.control_page = plaintext_page;
- err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
plaintext_page, ciphertext_page);
if (err) {
ciphertext_page = ERR_PTR(err);
@@ -378,40 +376,29 @@ struct page *ext4_encrypt(struct inode *inode,
*
* Return: Zero on success, non-zero otherwise.
*/
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
+int ext4_decrypt(struct page *page)
{
BUG_ON(!PageLocked(page));
- return ext4_page_crypto(ctx, page->mapping->host,
+ return ext4_page_crypto(page->mapping->host,
EXT4_DECRYPT, page->index, page, page);
}
-/*
- * Convenience function which takes care of allocating and
- * deallocating the encryption context
- */
-int ext4_decrypt_one(struct inode *inode, struct page *page)
-{
- int ret;
-
- struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
-
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
- ret = ext4_decrypt(ctx, page);
- ext4_release_crypto_ctx(ctx);
- return ret;
-}
-
int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
{
struct ext4_crypto_ctx *ctx;
struct page *ciphertext_page = NULL;
struct bio *bio;
- ext4_lblk_t lblk = ex->ee_block;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
ext4_fsblk_t pblk = ext4_ext_pblock(ex);
unsigned int len = ext4_ext_get_actual_len(ex);
- int err = 0;
+ int ret, err = 0;
+
+#if 0
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "ext4_encrypted_zeroout ino %lu lblk %u len %u",
+ (unsigned long) inode->i_ino, lblk, len);
+#endif
BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
@@ -426,7 +413,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
}
while (len--) {
- err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
ZERO_PAGE(0), ciphertext_page);
if (err)
goto errout;
@@ -437,17 +424,26 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
goto errout;
}
bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_iter.bi_sector = pblk;
- err = bio_add_page(bio, ciphertext_page,
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ ret = bio_add_page(bio, ciphertext_page,
inode->i_sb->s_blocksize, 0);
- if (err) {
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "bio_add_page failed: %d", ret);
+ WARN_ON(1);
bio_put(bio);
+ err = -EIO;
goto errout;
}
err = submit_bio_wait(WRITE, bio);
+ if ((err == 0) && bio->bi_error)
+ err = -EIO;
bio_put(bio);
if (err)
goto errout;
+ lblk++; pblk++;
}
err = 0;
errout:
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 7dc4eb5..2fbef8a 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -19,7 +19,6 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/key.h>
-#include <linux/key.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/random.h>
@@ -121,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode,
ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -183,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode,
ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
res = crypto_ablkcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -329,6 +326,10 @@ int _ext4_fname_disk_to_usr(struct inode *inode,
return oname->len;
}
}
+ if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) {
+ EXT4_ERROR_INODE(inode, "encrypted inode too small");
+ return -EUCLEAN;
+ }
if (EXT4_I(inode)->i_crypt_info)
return ext4_fname_decrypt(inode, iname, oname);
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 442d24e..c5882b3 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -30,7 +30,7 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc)
/**
* ext4_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivatio.
+ * @deriving_key: Encryption key used for derivation.
* @source_key: Source key to which to apply derivation.
* @derived_key: Derived key.
*
@@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
EXT4_AES_256_XTS_KEY_SIZE, NULL);
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -121,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
struct key *keyring_key = NULL;
struct ext4_encryption_key *master_key;
struct ext4_encryption_context ctx;
- struct user_key_payload *ukp;
+ const struct user_key_payload *ukp;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct crypto_ablkcipher *ctfm;
const char *cipher_str;
@@ -208,8 +207,13 @@ retry:
goto out;
}
crypt_info->ci_keyring_key = keyring_key;
- BUG_ON(keyring_key->type != &key_type_logon);
- ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "ext4: key type must be logon\n");
+ res = -ENOKEY;
+ goto out;
+ }
+ ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
res = -EINVAL;
goto out;
@@ -217,9 +221,17 @@ retry:
master_key = (struct ext4_encryption_key *)ukp->data;
BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
EXT4_KEY_DERIVATION_NONCE_SIZE);
- BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
+ if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "ext4: key size incorrect: %d\n",
+ master_key->size);
+ res = -ENOKEY;
+ goto out;
+ }
res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
raw_key);
+ if (res)
+ goto out;
got_key:
ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index 02c4e5d..ad05069 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/types.h>
+#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
@@ -49,7 +50,8 @@ static int ext4_create_encryption_context_from_policy(
struct inode *inode, const struct ext4_encryption_policy *policy)
{
struct ext4_encryption_context ctx;
- int res = 0;
+ handle_t *handle;
+ int res, res2;
res = ext4_convert_inline_data(inode);
if (res)
@@ -78,11 +80,22 @@ static int ext4_create_encryption_context_from_policy(
BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+ handle = ext4_journal_start(inode, EXT4_HT_MISC,
+ ext4_jbd2_credits_xattr(inode));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
sizeof(ctx), 0);
- if (!res)
+ if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ res = ext4_mark_inode_dirty(handle, inode);
+ if (res)
+ EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
+ }
+ res2 = ext4_journal_stop(handle);
+ if (!res)
+ res = res2;
return res;
}
@@ -137,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
if ((parent == NULL) || (child == NULL)) {
pr_err("parent %p child %p\n", parent, child);
- BUG_ON(1);
+ WARN_ON(1); /* Should never happen */
+ return 0;
}
/* no restrictions if the parent directory is not encrypted */
if (!ext4_encrypted_inode(parent))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f9e1491..1d1bca7 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -40,8 +40,7 @@ static int is_dx_dir(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ if (ext4_has_feature_dir_index(inode->i_sb) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1) ||
ext4_has_inline_data(inode)))
@@ -621,14 +620,14 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
- return -EIO;
+ return -EFSCORRUPTED;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
if ((char *) de > top)
- return -EIO;
+ return -EFSCORRUPTED;
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f5e9f04..cc7ca4e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -26,6 +26,7 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
+#include <linux/version.h>
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
@@ -187,7 +188,7 @@ typedef struct ext4_io_end {
} ext4_io_end_t;
struct ext4_io_submit {
- int io_op;
+ struct writeback_control *io_wbc;
struct bio *io_bio;
ext4_io_end_t *io_end;
sector_t io_next_block;
@@ -374,6 +375,7 @@ struct flex_groups {
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
+#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -431,6 +433,7 @@ enum {
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
+ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
@@ -475,6 +478,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
+ CHECK_FLAG_VALUE(PROJINHERIT);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -692,6 +696,7 @@ struct ext4_inode {
__le32 i_crtime; /* File Creation time */
__le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
__le32 i_version_hi; /* high 32 bits for 64-bit version */
+ __le32 i_projid; /* Project ID */
};
struct move_extent {
@@ -723,19 +728,55 @@ struct move_extent {
<= (EXT4_GOOD_OLD_INODE_SIZE + \
(einode)->i_extra_isize)) \
+/*
+ * We use an encoding that preserves the times for extra epoch "00":
+ *
+ * extra msb of adjust for signed
+ * epoch 32-bit 32-bit tv_sec to
+ * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range
+ * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31
+ * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19
+ * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07
+ * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25
+ * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16
+ * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04
+ * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22
+ * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10
+ *
+ * Note that previous versions of the kernel on 64-bit systems would
+ * incorrectly use extra epoch bits 1,1 for dates between 1901 and
+ * 1970. e2fsck will correct this, assuming that it is run on the
+ * affected filesystem before 2242.
+ */
+
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
- return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
- ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+ u32 extra = sizeof(time->tv_sec) > 4 ?
+ ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0;
+ return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
{
- if (sizeof(time->tv_sec) > 4)
- time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
- << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ if (unlikely(sizeof(time->tv_sec) > 4 &&
+ (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)
+ /* Handle legacy encoding of pre-1970 dates with epoch
+ * bits 1,1. We assume that by kernel version 4.20,
+ * everyone will have run fsck over the affected
+ * filesystems to correct the problem. (This
+ * backwards compatibility may be removed before this
+ * time, at the discretion of the ext4 developers.)
+ */
+ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
+ if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
+ extra_bits = 0;
+ time->tv_sec += extra_bits << 32;
+#else
+ time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+#endif
+ }
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -1019,6 +1060,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
+#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
+ specified journal checksum */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1179,7 +1223,9 @@ struct ext4_super_block {
__u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
__u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
__le32 s_lpf_ino; /* Location of the lost+found inode */
- __le32 s_reserved[100]; /* Padding to the end of the block */
+ __le32 s_prj_quota_inum; /* inode for tracking project quota */
+ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */
+ __le32 s_reserved[98]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1522,6 +1568,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
* Feature set definitions
*/
+/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */
#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
@@ -1566,6 +1613,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
+#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1578,11 +1626,99 @@ static inline int ext4_encrypted_inode(struct inode *inode)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_compat |= \
+ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_compat &= \
+ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
+ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_incompat |= \
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_incompat &= \
+ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+}
+
+EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC)
+EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL)
+EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
+EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
+EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
+EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+
+EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
+EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR)
+EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK)
+EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA)
+EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC)
+EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+
+EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
+EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV)
+EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS)
+EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT)
+EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP)
+EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE)
+EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED)
+EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR)
+EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
+
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_META_BG)
@@ -1598,7 +1734,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
-#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -1607,7 +1743,8 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
- EXT4_FEATURE_INCOMPAT_ENCRYPT)
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1619,6 +1756,40 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA)
+#define EXTN_FEATURE_FUNCS(ver) \
+static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
+}
+
+EXTN_FEATURE_FUNCS(2)
+EXTN_FEATURE_FUNCS(3)
+EXTN_FEATURE_FUNCS(4)
+
+static inline bool ext4_has_compat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
+}
+static inline bool ext4_has_ro_compat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
+}
+static inline bool ext4_has_incompat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
+}
+
/*
* Default values for user and/or group using reserved blocks
*/
@@ -1769,8 +1940,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
* (c) Daniel Phillips, 2001
*/
-#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
- EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -2063,8 +2233,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
void ext4_restore_control_page(struct page *data_page);
struct page *ext4_encrypt(struct inode *inode,
struct page *plaintext_page);
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
-int ext4_decrypt_one(struct inode *inode, struct page *page);
+int ext4_decrypt(struct page *page);
int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -2072,7 +2241,7 @@ int ext4_init_crypto(void);
void ext4_exit_crypto(void);
static inline int ext4_sb_has_crypto(struct super_block *sb)
{
- return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ return ext4_has_feature_encrypt(sb);
}
#else
static inline int ext4_init_crypto(void) { return 0; }
@@ -2193,8 +2362,7 @@ int ext4_insert_dentry(struct inode *dir,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
+ if (!ext4_has_feature_dir_index(inode->i_sb))
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
static unsigned char ext4_filetype_table[] = {
@@ -2203,8 +2371,7 @@ static unsigned char ext4_filetype_table[] = {
static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
+ if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
return DT_UNKNOWN;
return ext4_filetype_table[filetype];
@@ -2245,6 +2412,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
+extern const struct file_operations ext4_seq_mb_groups_fops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_mb_init(struct super_block *);
@@ -2272,6 +2440,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
@@ -2370,6 +2540,7 @@ extern int ext4_group_extend(struct super_block *sb,
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
@@ -2532,15 +2703,13 @@ extern int ext4_register_li_request(struct super_block *sb,
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
- (EXT4_SB(sb)->s_chksum_driver != NULL);
+ return ext4_has_feature_gdt_csum(sb) ||
+ EXT4_SB(sb)->s_chksum_driver != NULL;
}
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
- WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
!EXT4_SB(sb)->s_chksum_driver);
return (EXT4_SB(sb)->s_chksum_driver != NULL);
@@ -2887,7 +3056,7 @@ static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ if (ext4_has_feature_filetype(sb))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
@@ -2901,6 +3070,12 @@ extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
+/* sysfs.c */
+extern int ext4_register_sysfs(struct super_block *sb);
+extern void ext4_unregister_sysfs(struct super_block *sb);
+extern int __init ext4_init_sysfs(void);
+extern void ext4_exit_sysfs(void);
+
/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
@@ -3047,4 +3222,7 @@ extern void ext4_resize_end(struct super_block *sb);
#endif /* __KERNEL__ */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d418431..e770c1ee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return 0;
}
+ err = handle->h_err;
if (!handle->h_transaction) {
- err = jbd2_journal_stop(handle);
- return handle->h_err ? handle->h_err : err;
+ rc = jbd2_journal_stop(handle);
+ return err ? err : rc;
}
sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
rc = jbd2_journal_stop(handle);
if (!err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9c5b49f..5f58462 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -34,8 +34,7 @@
*/
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
- (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 20U : 8U)
+ (ext4_has_feature_extents(sb) ? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
@@ -84,17 +83,16 @@
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
- 1 : 0)
+ ext4_has_feature_quota(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ ext4_has_feature_quota(sb)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ ext4_has_feature_quota(sb)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2553aa8..551353b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -442,7 +442,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
- int max = 0;
+ int max = 0, err = -EFSCORRUPTED;
if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
error_msg = "invalid magic";
@@ -473,6 +473,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
if (ext_depth(inode) != depth &&
!ext4_extent_block_csum_verify(inode, eh)) {
error_msg = "extent tree corrupted";
+ err = -EFSBADCRC;
goto corrupted;
}
return 0;
@@ -485,7 +486,7 @@ corrupted:
le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
max, le16_to_cpu(eh->eh_depth), depth);
- return -EIO;
+ return err;
}
#define ext4_ext_check(inode, eh, depth, pblk) \
@@ -899,7 +900,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
flags);
- if (unlikely(IS_ERR(bh))) {
+ if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
}
@@ -910,7 +911,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto err;
}
path[ppos].p_bh = bh;
@@ -959,7 +960,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d == ei_block %d!",
logical, le32_to_cpu(curp->p_idx->ei_block));
- return -EIO;
+ return -EFSCORRUPTED;
}
if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
@@ -968,7 +969,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
"eh_entries %d >= eh_max %d!",
le16_to_cpu(curp->p_hdr->eh_entries),
le16_to_cpu(curp->p_hdr->eh_max));
- return -EIO;
+ return -EFSCORRUPTED;
}
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
@@ -992,7 +993,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EIO;
+ return -EFSCORRUPTED;
}
ix->ei_block = cpu_to_le32(logical);
@@ -1001,7 +1002,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = ext4_ext_dirty(handle, inode, curp);
@@ -1042,7 +1043,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
* border from split point */
if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
- return -EIO;
+ return -EFSCORRUPTED;
}
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
@@ -1086,7 +1087,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
newblock = ablocks[--a];
if (unlikely(newblock == 0)) {
EXT4_ERROR_INODE(inode, "newblock == 0!");
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
@@ -1112,7 +1113,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
path[depth].p_hdr->eh_entries,
path[depth].p_hdr->eh_max);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy from next extent */
@@ -1151,7 +1152,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
k = depth - at - 1;
if (unlikely(k < 0)) {
EXT4_ERROR_INODE(inode, "k %d < 0!", k);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
if (k)
@@ -1191,7 +1192,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode,
"EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
le32_to_cpu(path[i].p_ext->ee_block));
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy indexes */
@@ -1425,7 +1426,7 @@ static int ext4_ext_search_left(struct inode *inode,
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
@@ -1444,7 +1445,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
*logical, le32_to_cpu(ex->ee_block));
- return -EIO;
+ return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
@@ -1455,7 +1456,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
return 0;
@@ -1465,7 +1466,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d < ee_block %d + ee_len %d!",
*logical, le32_to_cpu(ex->ee_block), ee_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
@@ -1495,7 +1496,7 @@ static int ext4_ext_search_right(struct inode *inode,
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
@@ -1514,7 +1515,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"first_extent(path[%d].p_hdr) != ex",
depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
@@ -1522,7 +1523,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"ix != EXT_FIRST_INDEX *logical %d!",
*logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
goto found_extent;
@@ -1532,7 +1533,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d < ee_block %d + ee_len %d!",
*logical, le32_to_cpu(ex->ee_block), ee_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
@@ -1670,7 +1671,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
if (unlikely(ex == NULL || eh == NULL)) {
EXT4_ERROR_INODE(inode,
"ex %p == NULL or eh %p == NULL", ex, eh);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (depth == 0) {
@@ -1938,14 +1939,14 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* try to insert block into found extent and return */
@@ -2172,7 +2173,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (unlikely(path[depth].p_hdr == NULL)) {
up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
ex = path[depth].p_ext;
@@ -2241,7 +2242,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (unlikely(es.es_len == 0)) {
EXT4_ERROR_INODE(inode, "es.es_len == 0");
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
@@ -2264,7 +2265,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
"next extent == %u, next "
"delalloc extent = %u",
next, next_del);
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
}
@@ -2363,7 +2364,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = ext4_ext_get_access(handle, inode, path);
if (err)
@@ -2612,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* find where to start removing */
ex = path[depth].p_ext;
@@ -2666,7 +2667,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
"on extent %u:%u",
start, end, ex_ee_block,
ex_ee_block + ex_ee_len - 1);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
@@ -2841,7 +2842,7 @@ again:
EXT4_ERROR_INODE(inode,
"path[%d].p_hdr == NULL",
depth);
- err = -EIO;
+ err = -EFSCORRUPTED;
}
goto out;
}
@@ -2920,7 +2921,7 @@ again:
i = 0;
if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out;
}
}
@@ -2978,7 +2979,7 @@ again:
* Should be a no-op if we did IO above. */
cond_resched();
if (WARN_ON(i + 1 > depth)) {
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
path[i + 1].p_bh = bh;
@@ -3054,7 +3055,7 @@ void ext4_ext_init(struct super_block *sb)
* possible initialization would be here
*/
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
@@ -3081,7 +3082,7 @@ void ext4_ext_init(struct super_block *sb)
*/
void ext4_ext_release(struct super_block *sb)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ if (!ext4_has_feature_extents(sb))
return;
#ifdef EXTENTS_STATS
@@ -3345,7 +3346,7 @@ static int ext4_split_extent(handle_t *handle,
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EIO;
+ return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
split_flag1 = 0;
@@ -3558,6 +3559,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
+ if (ext4_encrypted_inode(inode))
+ max_zeroout = 0;
+
/* If extent is less than s_max_zeroout_kb, zeroout directly */
if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
@@ -3970,7 +3974,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4308,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
"lblock: %lu, depth: %d pblock %lld",
(unsigned long) map->m_lblk, depth,
path[depth].p_block);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out2;
}
@@ -5271,7 +5275,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (depth == path->p_depth) {
ex_start = path[depth].p_ext;
if (!ex_start)
- return -EIO;
+ return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
@@ -5411,7 +5415,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
if (!extent) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) *iterator);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (SHIFT == SHIFT_LEFT && *iterator >
le32_to_cpu(extent->ee_block)) {
@@ -5792,7 +5796,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
int split = 0;
path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
- if (unlikely(IS_ERR(path1))) {
+ if (IS_ERR(path1)) {
*erp = PTR_ERR(path1);
path1 = NULL;
finish:
@@ -5800,7 +5804,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
goto repeat;
}
path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
- if (unlikely(IS_ERR(path2))) {
+ if (IS_ERR(path2)) {
*erp = PTR_ERR(path2);
path2 = NULL;
goto finish;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 26724ae..ac748b3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1089,20 +1089,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
return nr_shrunk;
}
-static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
- return *pos ? NULL : SEQ_START_TOKEN;
-}
-
-static void *
-ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- return NULL;
-}
-
-static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
-{
- struct ext4_sb_info *sbi = seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
struct ext4_es_stats *es_stats = &sbi->s_es_stats;
struct ext4_inode_info *ei, *max = NULL;
unsigned int inode_cnt = 0;
@@ -1143,45 +1132,6 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
return 0;
}
-static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
- .start = ext4_es_seq_shrinker_info_start,
- .next = ext4_es_seq_shrinker_info_next,
- .stop = ext4_es_seq_shrinker_info_stop,
- .show = ext4_es_seq_shrinker_info_show,
-};
-
-static int
-ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = PDE_DATA(inode);
- }
-
- return ret;
-}
-
-static int
-ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
-{
- return seq_release(inode, file);
-}
-
-static const struct file_operations ext4_es_seq_shrinker_info_fops = {
- .owner = THIS_MODULE,
- .open = ext4_es_seq_shrinker_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = ext4_es_seq_shrinker_info_release,
-};
-
int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
int err;
@@ -1210,10 +1160,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err2;
- if (sbi->s_proc)
- proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
- &ext4_es_seq_shrinker_info_fops, sbi);
-
return 0;
err2:
@@ -1225,8 +1171,6 @@ err1:
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- if (sbi->s_proc)
- remove_proc_entry("es_shrinker_info", sbi->s_proc);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 691b526..f7aa24f 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -172,4 +172,6 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac..113837e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -195,7 +196,7 @@ out:
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
@@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
- /* Is this the right get_block? */
+ int result;
+ handle_t *handle = NULL;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
+}
+
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+ return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 173c1ae..1b8024d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,7 +64,7 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
}
/* Initializes an uninitialized inode bitmap */
-static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+static int ext4_init_inode_bitmap(struct super_block *sb,
struct buffer_head *bh,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
@@ -89,7 +89,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return 0;
+ return -EFSBADCRC;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
@@ -99,7 +99,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp);
- return EXT4_INODES_PER_GROUP(sb);
+ return 0;
}
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
@@ -112,6 +112,42 @@ void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+static int ext4_validate_inode_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (buffer_verified(bh))
+ return 0;
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
+
+ ext4_lock_group(sb, block_group);
+ blk = ext4_inode_bitmap(sb, desc);
+ if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+ "inode_bitmap = %llu", block_group, blk);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return -EFSBADCRC;
+ }
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+ return 0;
+}
+
/*
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
@@ -124,12 +160,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
- struct ext4_group_info *grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_inode_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
@@ -137,7 +172,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-EIO);
}
if (bitmap_uptodate(bh))
goto verify;
@@ -150,12 +185,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ err = ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
+ if (err)
+ goto out;
return bh;
}
ext4_unlock_group(sb, block_group);
@@ -182,31 +219,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-EIO);
}
verify:
- ext4_lock_group(sb, block_group);
- if (!buffer_verified(bh) &&
- !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8)) {
- ext4_unlock_group(sb, block_group);
- put_bh(bh);
- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
- "inode_bitmap = %llu", block_group, bitmap_blk);
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, desc);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return NULL;
- }
- ext4_unlock_group(sb, block_group);
- set_buffer_verified(bh);
+ err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
+ if (err)
+ goto out;
return bh;
+out:
+ put_bh(bh);
+ return ERR_PTR(err);
}
/*
@@ -286,8 +309,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
grp = ext4_get_group_info(sb, block_group);
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ fatal = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
+ }
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
+ goto error_return;
+ }
BUFFER_TRACE(bitmap_bh, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -721,7 +751,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct ext4_group_desc *gdp = NULL;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
- int ret2, err = 0;
+ int ret2, err;
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
@@ -769,7 +799,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
- dquot_initialize(inode);
+ err = dquot_initialize(inode);
+ if (err)
+ goto out;
if (!goal)
goal = sbi->s_inode_goal;
@@ -824,7 +856,9 @@ got_group:
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+ IS_ERR(inode_bitmap_bh)) {
+ inode_bitmap_bh = NULL;
if (++group == ngroups)
group = 0;
continue;
@@ -900,8 +934,8 @@ got:
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (!block_bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
@@ -1043,7 +1077,7 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
ei->i_inline_off = 0;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ if (ext4_has_feature_inline_data(sb))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1058,7 +1092,7 @@ got:
if (err)
goto fail_free_drop;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
@@ -1114,14 +1148,17 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
/* Error cases - e2fsck has already cleaned up for us */
if (ino > max_ino) {
ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
+ err = -EFSCORRUPTED;
goto error;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (!bitmap_bh) {
- ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
+ ino, err);
goto error;
}
@@ -1196,8 +1233,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_inode_bitmap(sb, i);
- if (!bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ bitmap_bh = NULL;
continue;
+ }
x = ext4_count_free(bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb) / 8);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac49..355ef9c 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
@@ -561,11 +562,10 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
/*
* Okay, we need to do block allocation.
*/
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EUCLEAN;
+ return -EFSCORRUPTED;
}
/* Set up for the direct block allocation */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cd944a7..d884989 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -434,8 +434,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
memset((void *)ext4_raw_inode(&is.iloc)->i_block,
0, EXT4_MIN_INLINE_DATA_SIZE);
- if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(inode->i_sb)) {
if (S_ISDIR(inode->i_mode) ||
S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cecf9aa..b3bd912 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -377,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
"lblock %lu mapped to illegal pblock "
"(length %d)", (unsigned long) map->m_lblk,
map->m_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -479,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
/* We can handle the block number less than EXT_MAX_BLOCKS */
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
- return -EIO;
+ return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@ -964,7 +965,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = ext4_decrypt_one(inode, page);
+ err = ext4_decrypt(page);
return err;
}
#endif
@@ -1180,6 +1181,38 @@ errout:
return ret ? ret : copied;
}
+/*
+ * This is a private version of page_zero_new_buffers() which doesn't
+ * set the buffer to be dirty, since in data=journalled mode we need
+ * to call ext4_handle_dirty_metadata() instead.
+ */
+static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+ unsigned int block_start = 0, block_end;
+ struct buffer_head *head, *bh;
+
+ bh = head = page_buffers(page);
+ do {
+ block_end = block_start + bh->b_size;
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, size;
+
+ start = max(from, block_start);
+ size = min(to, block_end) - start;
+
+ zero_user(page, start, size);
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_new(bh);
+ }
+ }
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
static int ext4_journalled_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1206,7 +1239,7 @@ static int ext4_journalled_write_end(struct file *file,
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
- page_zero_new_buffers(page, from+copied, to);
+ zero_new_buffers(page, from+copied, to);
}
ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
@@ -1814,11 +1847,22 @@ static int ext4_writepage(struct page *page,
* the page. But we may reach here when we do a journal commit via
* journal_submit_inode_data_buffers() and in that case we must write
* allocated buffers to achieve data=ordered mode guarantees.
+ *
+ * Also, if there is only one buffer per page (the fs block
+ * size == the page size), if one buffer needs block
+ * allocation or needs to modify the extent tree to clear the
+ * unwritten flag, we know that the page can't be written at
+ * all, so we might as well refuse the write immediately.
+ * Unfortunately if the block size != page size, we can't as
+ * easily detect this case using ext4_walk_page_buffers(), but
+ * for the extremely common case, this is an optimization that
+ * skips a useless round trip through ext4_bio_write_page().
*/
if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
- if (current->flags & PF_MEMALLOC) {
+ if ((current->flags & PF_MEMALLOC) ||
+ (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -2598,8 +2642,7 @@ static int ext4_nonda_switch(struct super_block *sb)
/* We always reserve for an inode update; the superblock could be there too */
static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
{
- if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+ if (likely(ext4_has_feature_large_file(inode->i_sb)))
return 1;
if (pos + len <= 0x7fffffffULL)
@@ -3020,6 +3063,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_NO_LOCK);
}
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
@@ -3332,7 +3386,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
int err = 0;
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
@@ -3381,7 +3435,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!ext4_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_CACHE_SIZE);
- WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+ WARN_ON_ONCE(ext4_decrypt(page));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3808,7 +3862,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
- return -EIO;
+ return -EFSCORRUPTED;
iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@ -3994,8 +4048,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
struct inode *inode = &(ei->vfs_inode);
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
/* we are using combined 48 bit field */
i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
le32_to_cpu(raw_inode->i_blocks_lo);
@@ -4056,7 +4109,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
EXT4_INODE_SIZE(inode->i_sb));
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
}
} else
@@ -4076,7 +4129,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
EXT4_ERROR_INODE(inode, "checksum invalid");
- ret = -EIO;
+ ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4118,7 +4171,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
@@ -4191,7 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
ei->i_file_acl);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
@@ -4230,6 +4283,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
+ inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -4242,7 +4296,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else if (ino == EXT4_BOOT_LOADER_INO) {
make_bad_inode(inode);
} else {
- ret = -EIO;
+ ret = -EFSCORRUPTED;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
@@ -4260,7 +4314,7 @@ bad_inode:
struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
{
if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
return ext4_iget(sb, ino);
}
@@ -4282,7 +4336,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ if (!ext4_has_feature_huge_file(sb))
return -EFBIG;
if (i_blocks <= 0xffffffffffffULL) {
@@ -4443,8 +4497,7 @@ static int ext4_do_update_inode(handle_t *handle,
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+ if (!ext4_has_feature_large_file(sb) ||
EXT4_SB(sb)->s_es->s_rev_level ==
cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
@@ -4493,8 +4546,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (err)
goto out_brelse;
ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
}
@@ -4661,8 +4713,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
+ if (is_quota_modification(inode, attr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
(ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
@@ -4725,6 +4780,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = ext4_orphan_add(handle, inode);
orphan = 1;
}
+ /*
+ * Update c/mtime on truncate up, ext4_truncate() will
+ * update c/mtime in shrink case below
+ */
+ if (!shrink) {
+ inode->i_mtime = ext4_current_time(inode);
+ inode->i_ctime = inode->i_mtime;
+ }
down_write(&EXT4_I(inode)->i_data_sem);
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
@@ -5221,7 +5284,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = __block_page_mkwrite(vma, vmf,
+ ret = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
@@ -5268,7 +5331,7 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = __block_page_mkwrite(vma, vmf, get_block);
+ ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1346cfa..5e872fd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,8 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode_bl->i_version = 1;
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode_bl);
} else
@@ -383,8 +382,7 @@ setversion_out:
goto group_extend_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -432,8 +430,7 @@ group_extend_out:
goto mext_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -470,8 +467,7 @@ mext_out:
goto group_add_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -553,8 +549,7 @@ group_add_out:
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not (yet) supported with bigalloc");
return -EOPNOTSUPP;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 34b610e..61eaf74 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -874,8 +874,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
bh[i] = NULL;
continue;
}
- if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
- err = -ENOMEM;
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ if (IS_ERR(bh[i])) {
+ err = PTR_ERR(bh[i]);
+ bh[i] = NULL;
goto out;
}
mb_debug(1, "read bitmap for group %u\n", group);
@@ -883,8 +885,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* wait for I/O completion */
for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
- if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
- err = -EIO;
+ int err2;
+
+ if (!bh[i])
+ continue;
+ err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
+ if (!err)
+ err = err2;
}
first_block = page->index * blocks_per_page;
@@ -2333,7 +2340,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static const struct file_operations ext4_mb_seq_groups_fops = {
+const struct file_operations ext4_seq_mb_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2447,7 +2454,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
kmalloc(sb->s_blocksize, GFP_NOFS);
BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
bh = ext4_read_block_bitmap(sb, group);
- BUG_ON(bh == NULL);
+ BUG_ON(IS_ERR_OR_NULL(bh));
memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
sb->s_blocksize);
put_bh(bh);
@@ -2661,10 +2668,6 @@ int ext4_mb_init(struct super_block *sb)
if (ret != 0)
goto out_free_locality_groups;
- if (sbi->s_proc)
- proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_groups_fops, sb);
-
return 0;
out_free_locality_groups:
@@ -2705,9 +2708,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
-
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2896,10 +2896,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- err = -EIO;
bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (!bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto out_err;
+ }
BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -3843,8 +3845,10 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
return 0;
bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u", group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_error(sb, "Error %d reading block bitmap for %u",
+ err, group);
return 0;
}
@@ -4015,9 +4019,10 @@ repeat:
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u",
- group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_error(sb, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4682,22 +4687,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_debug("freeing block %llu\n", block);
trace_ext4_free_blocks(inode, block, count, flags);
- if (flags & EXT4_FREE_BLOCKS_FORGET) {
- struct buffer_head *tbh = bh;
- int i;
-
- BUG_ON(bh && (count > 1));
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
- for (i = 0; i < count; i++) {
- cond_resched();
- if (!bh)
- tbh = sb_find_get_block(inode->i_sb,
- block + i);
- if (!tbh)
- continue;
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, tbh, block + i);
- }
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
}
/*
@@ -4742,6 +4736,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
count += sbi->s_cluster_ratio - overflow;
}
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ if (!bh)
+ continue;
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block + i);
+ }
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4761,8 +4768,9 @@ do_more:
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
}
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4931,8 +4939,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
}
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6163ad2..a465189 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -448,8 +448,7 @@ int ext4_ext_migrate(struct inode *inode)
* If the filesystem does not support extents, or the inode
* already is extent-based, error out.
*/
- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ if (!ext4_has_feature_extents(inode->i_sb) ||
(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
@@ -625,13 +624,11 @@ int ext4_ind_migrate(struct inode *inode)
handle_t *handle;
int ret;
- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ if (ext4_has_feature_bigalloc(inode->i_sb))
return -EOPNOTSUPP;
/*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 8313ca3..0a512aa 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -69,6 +69,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
ext4_fsblk_t mmp_block)
{
struct mmp_struct *mmp;
+ int ret;
if (*bh)
clear_buffer_uptodate(*bh);
@@ -76,33 +77,38 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
/* This would be sb_bread(sb, mmp_block), except we need to be sure
* that the MD RAID device cache has been bypassed, and that the read
* is not blocked in the elevator. */
- if (!*bh)
+ if (!*bh) {
*bh = sb_getblk(sb, mmp_block);
- if (!*bh)
- return -ENOMEM;
- if (*bh) {
- get_bh(*bh);
- lock_buffer(*bh);
- (*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
- brelse(*bh);
- *bh = NULL;
+ if (!*bh) {
+ ret = -ENOMEM;
+ goto warn_exit;
}
}
- if (unlikely(!*bh)) {
- ext4_warning(sb, "Error while reading MMP block %llu",
- mmp_block);
- return -EIO;
+
+ get_bh(*bh);
+ lock_buffer(*bh);
+ (*bh)->b_end_io = end_buffer_read_sync;
+ submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
+ wait_on_buffer(*bh);
+ if (!buffer_uptodate(*bh)) {
+ brelse(*bh);
+ *bh = NULL;
+ ret = -EIO;
+ goto warn_exit;
}
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
- !ext4_mmp_csum_verify(sb, mmp))
- return -EINVAL;
-
- return 0;
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ ret = -EFSCORRUPTED;
+ else if (!ext4_mmp_csum_verify(sb, mmp))
+ ret = -EFSBADCRC;
+ else
+ return 0;
+
+warn_exit:
+ ext4_warning(sb, "Error %d while reading MMP block %llu",
+ ret, mmp_block);
+ return ret;
}
/*
@@ -111,7 +117,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
const char *function, unsigned int line, const char *msg)
{
- __ext4_warning(sb, function, line, msg);
+ __ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
"MMP failure info: last update time: %llu, last update "
"node: %s, last update device: %s\n",
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 011dcfb..f27e0c2 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!bh) {
ext4_error_inode(inode, func, line, block,
"Directory hole found");
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
dirent = (struct ext4_dir_entry *) bh->b_data;
/* Determine whether or not we have an index block */
@@ -124,7 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!is_dx_block && type == INDEX) {
ext4_error_inode(inode, func, line, block,
"directory leaf block found instead of index block");
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (!ext4_has_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
@@ -142,7 +142,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_error_inode(inode, func, line, block,
"Directory index failed checksum");
brelse(bh);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSBADCRC);
}
}
if (!is_dx_block) {
@@ -152,7 +152,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_error_inode(inode, func, line, block,
"Directory block failed checksum");
brelse(bh);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSBADCRC);
}
}
return bh;
@@ -1429,7 +1429,7 @@ restart:
}
num++;
bh = ext4_getblk(NULL, dir, b++, 0);
- if (unlikely(IS_ERR(bh))) {
+ if (IS_ERR(bh)) {
if (ra_max == 0) {
ret = bh;
goto cleanup_and_exit;
@@ -1570,19 +1570,19 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (unlikely(ino == dir->i_ino)) {
EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
dentry);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
inode = ext4_iget_normal(dir->i_sb, ino);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -1619,7 +1619,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
EXT4_ERROR_INODE(d_inode(child),
"bad parent inode number: %u", ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
@@ -1807,7 +1807,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset)) {
- res = -EIO;
+ res = -EFSCORRUPTED;
goto return_result;
}
/* Provide crypto context and crypto buffer to ext4 match */
@@ -1967,7 +1967,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if ((char *) de >= (((char *) root) + blocksize)) {
EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
- return -EIO;
+ return -EFSCORRUPTED;
}
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
@@ -2118,7 +2118,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
goto out;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ ext4_has_feature_dir_index(sb)) {
retval = make_indexed_dir(handle, &fname, dentry,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
@@ -2315,7 +2315,7 @@ int ext4_generic_delete_entry(handle_t *handle,
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size, i))
- return -EIO;
+ return -EFSCORRUPTED;
if (de == de_del) {
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
@@ -2388,8 +2388,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
/* limit is 16-bit i_links_count */
if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
set_nlink(inode, 1);
- EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+ ext4_set_feature_dir_nlink(inode->i_sb);
}
}
}
@@ -2436,7 +2435,9 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode;
int err, credits, retries = 0;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -2467,10 +2468,9 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err, credits, retries = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -2499,7 +2499,9 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int err, retries = 0;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
retry:
inode = ext4_new_inode_start_handle(dir, mode,
@@ -2612,7 +2614,9 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -2910,8 +2914,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
- dquot_initialize(dir);
- dquot_initialize(d_inode(dentry));
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
+ retval = dquot_initialize(d_inode(dentry));
+ if (retval)
+ return retval;
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
@@ -2922,7 +2930,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
inode = d_inode(dentry);
- retval = -EIO;
+ retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_rmdir;
@@ -2980,8 +2988,12 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
trace_ext4_unlink_enter(dir, dentry);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- dquot_initialize(dir);
- dquot_initialize(d_inode(dentry));
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
+ retval = dquot_initialize(d_inode(dentry));
+ if (retval)
+ return retval;
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
@@ -2992,7 +3004,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode = d_inode(dentry);
- retval = -EIO;
+ retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
@@ -3066,7 +3078,9 @@ static int ext4_symlink(struct inode *dir,
goto err_free_sd;
}
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ goto err_free_sd;
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
/*
@@ -3118,6 +3132,7 @@ static int ext4_symlink(struct inode *dir,
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
if (!encryption_required)
inode->i_op = &ext4_symlink_inode_operations;
+ inode_nohighmem(inode);
ext4_set_aops(inode);
/*
* We cannot call page_symlink() with transaction started
@@ -3197,7 +3212,9 @@ static int ext4_link(struct dentry *old_dentry,
if (ext4_encrypted_inode(dir) &&
!ext4_is_child_context_consistent_with_parent(dir, inode))
return -EPERM;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
retry:
handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -3290,7 +3307,7 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
if (!ent->dir_bh)
return retval;
if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
- return -EIO;
+ return -EFSCORRUPTED;
BUFFER_TRACE(ent->dir_bh, "get_write_access");
return ext4_journal_get_write_access(handle, ent->dir_bh);
}
@@ -3332,8 +3349,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (retval)
return retval;
ent->de->inode = cpu_to_le32(ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
+ if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
ent->dir->i_version++;
ent->dir->i_ctime = ent->dir->i_mtime =
@@ -3476,13 +3492,20 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
int credits;
u8 old_file_type;
- dquot_initialize(old.dir);
- dquot_initialize(new.dir);
+ retval = dquot_initialize(old.dir);
+ if (retval)
+ return retval;
+ retval = dquot_initialize(new.dir);
+ if (retval)
+ return retval;
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- if (new.inode)
- dquot_initialize(new.inode);
+ if (new.inode) {
+ retval = dquot_initialize(new.inode);
+ if (retval)
+ return retval;
+ }
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
if (IS_ERR(old.bh))
@@ -3678,8 +3701,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new.inode)))
return -EPERM;
- dquot_initialize(old.dir);
- dquot_initialize(new.dir);
+ retval = dquot_initialize(old.dir);
+ if (retval)
+ return retval;
+ retval = dquot_initialize(new.dir);
+ if (retval)
+ return retval;
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
&old.de, &old.inlined);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5602450..090b349 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -52,16 +52,14 @@ void ext4_exit_pageio(void)
*/
static void buffer_io_error(struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
- printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
+ bh->b_bdev,
(unsigned long long)bh->b_blocknr);
}
static void ext4_finish_bio(struct bio *bio)
{
int i;
- int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec;
bio_for_each_segment_all(bvec, bio, i) {
@@ -88,7 +86,7 @@ static void ext4_finish_bio(struct bio *bio)
}
#endif
- if (error) {
+ if (bio->bi_error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
}
@@ -107,7 +105,7 @@ static void ext4_finish_bio(struct bio *bio)
continue;
}
clear_buffer_async_write(bh);
- if (error)
+ if (bio->bi_error)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
@@ -310,27 +308,25 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
}
/* BIO completion function for page writeback */
-static void ext4_end_bio(struct bio *bio, int error)
+static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
BUG_ON(!io_end);
bio->bi_end_io = NULL;
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = 0;
- if (error) {
+ if (bio->bi_error) {
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
- error, inode->i_ino,
+ bio->bi_error, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
- mapping_set_error(inode->i_mapping, error);
+ mapping_set_error(inode->i_mapping, bio->bi_error);
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
@@ -357,8 +353,10 @@ void ext4_io_submit(struct ext4_io_submit *io)
struct bio *bio = io->io_bio;
if (bio) {
+ int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE;
bio_get(io->io_bio);
- submit_bio(io->io_op, io->io_bio);
+ submit_bio(io_op, io->io_bio);
bio_put(io->io_bio);
}
io->io_bio = NULL;
@@ -367,7 +365,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc)
{
- io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+ io->io_wbc = wbc;
io->io_bio = NULL;
io->io_end = NULL;
}
@@ -375,12 +373,12 @@ void ext4_io_submit_init(struct ext4_io_submit *io,
static int io_submit_init_bio(struct ext4_io_submit *io,
struct buffer_head *bh)
{
- int nvecs = bio_get_nr_vecs(bh->b_bdev);
struct bio *bio;
- bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
if (!bio)
return -ENOMEM;
+ wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
@@ -409,6 +407,7 @@ submit_and_retry:
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
+ wbc_account_io(io->io_wbc, page, bh->b_size);
io->io_next_block++;
return 0;
}
@@ -425,6 +424,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
+ int nr_to_submit = 0;
blocksize = 1 << inode->i_blkbits;
@@ -477,11 +477,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
set_buffer_async_write(bh);
+ nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
+ nr_to_submit) {
data_page = ext4_encrypt(inode, page);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index ec3ef93..5dc5e95 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- int ret = ext4_decrypt(ctx, page);
+ int ret = ext4_decrypt(page);
if (ret) {
WARN_ON_ONCE(1);
SetPageError(page);
@@ -98,7 +98,7 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
@@ -106,7 +106,7 @@ static void mpage_end_io(struct bio *bio, int err)
if (ext4_bio_encrypted(bio)) {
struct ext4_crypto_ctx *ctx = bio->bi_private;
- if (err) {
+ if (bio->bi_error) {
ext4_release_crypto_ctx(ctx);
} else {
INIT_WORK(&ctx->r.work, completion_pages);
@@ -118,7 +118,7 @@ static void mpage_end_io(struct bio *bio, int err)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- if (!err) {
+ if (!bio->bi_error) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -165,8 +165,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (pages) {
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL))
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL)))
goto next_page;
}
@@ -284,7 +284,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
goto set_error_page;
}
bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+ min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
ext4_release_crypto_ctx(ctx);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf0c472..ad62d7a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -490,7 +490,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
group_data[0].group != sbi->s_groups_count);
reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
@@ -680,8 +680,7 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
int mult = 3;
unsigned ret;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ if (!ext4_has_feature_sparse_super(sb)) {
ret = *min;
*min += 1;
return ret;
@@ -1040,7 +1039,7 @@ exit_free:
* do not copy the full number of backups at this time. The resize
* which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb, int blk_off, char *data,
+static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1065,7 +1064,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
group = ext4_list_backups(sb, &three, &five, &seven);
last = sbi->s_groups_count;
} else {
- group = ext4_meta_bg_first_group(sb, group) + 1;
+ group = ext4_get_group_number(sb, blk_off) + 1;
last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
}
@@ -1158,7 +1157,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
int i, gdb_off, gdb_num, err = 0;
int meta_bg;
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
for (i = 0; i < count; i++, group++) {
int reserved_gdb = ext4_bg_has_super(sb, group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1381,9 +1380,7 @@ static void ext4_update_super(struct super_block *sb,
ext4_debug("free blocks count %llu",
percpu_counter_read(&sbi->s_freeclusters_counter));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- sbi->s_log_groups_per_flex) {
+ if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, group_data[0].group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
@@ -1476,8 +1473,7 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_META_BG);
+ int meta_bg = ext4_has_feature_meta_bg(sb);
sector_t old_gdb = 0;
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
@@ -1585,8 +1581,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
- if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) {
ext4_warning(sb, "Can't resize non-sparse filesystem further");
return -EPERM;
}
@@ -1604,9 +1599,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
if (reserved_gdb || gdb_off == 0) {
- if (!EXT4_HAS_COMPAT_FEATURE(sb,
- EXT4_FEATURE_COMPAT_RESIZE_INODE)
- || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ if (ext4_has_feature_resize_inode(sb) ||
+ !le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext4_warning(sb,
"No reserved GDT blocks, can't resize");
return -EPERM;
@@ -1825,8 +1819,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
if (err)
goto errout;
- EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ ext4_clear_feature_resize_inode(sb);
+ ext4_set_feature_meta_bg(sb);
sbi->s_es->s_first_meta_bg =
cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
@@ -1918,9 +1912,9 @@ retry:
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (ext4_has_feature_resize_inode(sb)) {
if (meta_bg) {
ext4_error(sb, "resize_inode and meta_bg enabled "
"simultaneously");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 58987b5..f1b56ff 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,7 +34,6 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
@@ -54,12 +53,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
-static struct proc_dir_entry *ext4_proc_root;
-static struct kset *ext4_kset;
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
-static struct ext4_features *ext4_feat;
static int ext4_mballoc_ready;
+static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
@@ -82,9 +79,8 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
-static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
-#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
@@ -100,7 +96,6 @@ MODULE_ALIAS("ext2");
#endif
-#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = {
.owner = THIS_MODULE,
.name = "ext3",
@@ -111,15 +106,11 @@ static struct file_system_type ext3_fs_type = {
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
-#else
-#define IS_EXT3_SB(sb) (0)
-#endif
static int ext4_verify_csum_type(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
@@ -325,6 +316,22 @@ static void save_error_info(struct super_block *sb, const char *func,
ext4_commit_super(sb, 1);
}
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else. Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super), will cause a kernel OOPS.
+ * This is a kludge to prevent these oops until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+ struct inode *bd_inode = sb->s_bdev->bd_inode;
+ struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
+
+ return bdi->dev == NULL;
+}
+
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
@@ -381,9 +388,13 @@ static void ext4_handle_error(struct super_block *sb)
smp_wmb();
sb->s_flags |= MS_RDONLY;
}
- if (test_opt(sb, ERRORS_PANIC))
+ if (test_opt(sb, ERRORS_PANIC)) {
+ if (EXT4_SB(sb)->s_journal &&
+ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+ return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
+ }
}
#define ext4_error_ratelimit(sb) \
@@ -482,6 +493,12 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
char *errstr = NULL;
switch (errno) {
+ case -EFSCORRUPTED:
+ errstr = "Corrupt filesystem";
+ break;
+ case -EFSBADCRC:
+ errstr = "Filesystem failed CRC";
+ break;
case -EIO:
errstr = "IO failure";
break;
@@ -572,8 +589,12 @@ void __ext4_abort(struct super_block *sb, const char *function,
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
}
- if (test_opt(sb, ERRORS_PANIC))
+ if (test_opt(sb, ERRORS_PANIC)) {
+ if (EXT4_SB(sb)->s_journal &&
+ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+ return;
panic("EXT4-fs panic from previous error\n");
+ }
}
void __ext4_msg(struct super_block *sb,
@@ -787,6 +808,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
+ ext4_unregister_sysfs(sb);
ext4_es_unregister_shrinker(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
@@ -795,18 +817,12 @@ static void ext4_put_super(struct super_block *sb)
ext4_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!(sb->s_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
- if (sbi->s_proc) {
- remove_proc_entry("options", sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
- kobject_del(&sbi->s_kobj);
-
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kvfree(sbi->s_group_desc);
@@ -950,7 +966,7 @@ static int __init init_inodecache(void)
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
@@ -1045,7 +1061,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_WAIT);
+ wait & ~__GFP_DIRECT_RECLAIM);
return try_to_free_buffers(page);
}
@@ -1275,7 +1291,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quota options when quota turned on");
return -1;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ if (ext4_has_feature_quota(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
"when QUOTA feature is enabled");
return -1;
@@ -1368,10 +1384,10 @@ static const struct mount_opts {
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
- MOPT_EXT4_ONLY | MOPT_SET},
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
EXT4_MOUNT_JOURNAL_CHECKSUM),
- MOPT_EXT4_ONLY | MOPT_SET},
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
@@ -1394,9 +1410,9 @@ static const struct mount_opts {
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
- {Opt_journal_dev, 0, MOPT_GTE0},
- {Opt_journal_path, 0, MOPT_STRING},
- {Opt_journal_ioprio, 0, MOPT_GTE0},
+ {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
+ {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
+ {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
@@ -1500,8 +1516,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
return -1;
- if (m->flags & MOPT_EXPLICIT)
- set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_EXPLICIT) {
+ if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
+ set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+ } else
+ return -1;
+ }
if (m->flags & MOPT_CLEAR_ERR)
clear_opt(sb, ERRORS_MASK);
if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
@@ -1634,8 +1656,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
"quota options when quota turned on");
return -1;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ if (ext4_has_feature_quota(sb)) {
ext4_msg(sb, KERN_ERR,
"Cannot set journaled quota options "
"when QUOTA feature is enabled");
@@ -1643,8 +1664,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
-#ifndef CONFIG_FS_DAX
} else if (token == Opt_dax) {
+#ifdef CONFIG_FS_DAX
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= m->mount_opt;
+#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
#endif
@@ -1694,7 +1719,7 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
#ifdef CONFIG_QUOTA
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ if (ext4_has_feature_quota(sb) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
"feature is enabled");
@@ -1763,10 +1788,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
}
if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+ seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+ seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
#endif
}
@@ -1867,7 +1892,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
return _ext4_show_options(seq, root->d_sb, 0);
}
-static int options_seq_show(struct seq_file *seq, void *offset)
+int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
int rc;
@@ -1878,19 +1903,6 @@ static int options_seq_show(struct seq_file *seq, void *offset)
return rc;
}
-static int options_open_fs(struct inode *inode, struct file *file)
-{
- return single_open(file, options_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ext4_seq_options_fops = {
- .owner = THIS_MODULE,
- .open = options_open_fs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
@@ -1931,7 +1943,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_mtime = cpu_to_le32(get_seconds());
ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_set_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
done:
@@ -2014,12 +2026,13 @@ failed:
return 0;
}
-static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
int offset;
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
@@ -2039,8 +2052,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
}
/* old crc16 code */
- if (!(sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+ if (!ext4_has_feature_gdt_csum(sb))
return 0;
offset = offsetof(struct ext4_group_desc, bg_checksum);
@@ -2050,8 +2062,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
crc = crc16(crc, (__u8 *)gdp, offset);
offset += sizeof(gdp->bg_checksum); /* skip checksum */
/* for checksum of struct ext4_group_desc do the rest...*/
- if ((sbi->s_es->s_feature_incompat &
- cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+ if (ext4_has_feature_64bit(sb) &&
offset < le16_to_cpu(sbi->s_es->s_desc_size))
crc = crc16(crc, (__u8 *)gdp + offset,
le16_to_cpu(sbi->s_es->s_desc_size) -
@@ -2065,8 +2076,7 @@ int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
- block_group, gdp)))
+ (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
return 0;
return 1;
@@ -2077,7 +2087,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
{
if (!ext4_has_group_desc_csum(sb))
return;
- gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+ gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}
/* Called at mount-time, super-block is locked */
@@ -2093,7 +2103,7 @@ static int ext4_check_descriptors(struct super_block *sb,
int flexbg_flag = 0;
ext4_group_t i, grp = sbi->s_groups_count;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
flexbg_flag = 1;
ext4_debug("Checking group descriptors");
@@ -2137,7 +2147,7 @@ static int ext4_check_descriptors(struct super_block *sb,
if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Checksum for group %u failed (%u!=%u)",
- i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+ i, le16_to_cpu(ext4_group_desc_csum(sb, i,
gdp)), le16_to_cpu(gdp->bg_checksum));
if (!(sb->s_flags & MS_RDONLY)) {
ext4_unlock_group(sb, i);
@@ -2400,8 +2410,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
- nr < first_meta_bg)
+ if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
return logical_sb_block + nr + 1;
bg = sbi->s_desc_per_block * nr;
if (ext4_bg_has_super(sb, bg))
@@ -2457,335 +2466,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return ret;
}
-/* sysfs supprt */
-
-struct ext4_attr {
- struct attribute attr;
- ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
- ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
- const char *, size_t);
- union {
- int offset;
- int deprecated_val;
- } u;
-};
-
-static int parse_strtoull(const char *buf,
- unsigned long long max, unsigned long long *value)
-{
- int ret;
-
- ret = kstrtoull(skip_spaces(buf), 0, value);
- if (!ret && *value > max)
- ret = -EINVAL;
- return ret;
-}
-
-static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (s64) EXT4_C2B(sbi,
- percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
-}
-
-static ssize_t session_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- sbi->s_sectors_written_start) >> 1);
-}
-
-static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1)));
-}
-
-static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned long t;
- int ret;
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret)
- return ret;
-
- if (t && (!is_power_of_2(t) || t > 0x40000000))
- return -EINVAL;
-
- sbi->s_inode_readahead_blks = t;
- return count;
-}
-
-static ssize_t sbi_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t sbi_ui_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
- unsigned long t;
- int ret;
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret)
- return ret;
- *ui = t;
- return count;
-}
-
-static ssize_t es_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
-
- unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
- a->u.offset);
-
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t reserved_clusters_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
-}
-
-static ssize_t reserved_clusters_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned long long val;
- int ret;
-
- if (parse_strtoull(buf, -1ULL, &val))
- return -EINVAL;
- ret = ext4_reserve_clusters(sbi, val);
-
- return ret ? ret : count;
-}
-
-static ssize_t trigger_test_error(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- int len = count;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (len && buf[len-1] == '\n')
- len--;
-
- if (len)
- ext4_error(sbi->s_sb, "%.*s", len, buf);
- return count;
-}
-
-static ssize_t sbi_deprecated_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
-}
-
-#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .u = { \
- .offset = offsetof(struct ext4_sb_info, _elname),\
- }, \
-}
-
-#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .u = { \
- .offset = offsetof(struct ext4_super_block, _elname), \
- }, \
-}
-
-#define EXT4_ATTR(name, mode, show, store) \
-static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
-
-#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
-#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
-#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
-
-#define EXT4_RO_ATTR_ES_UI(name, elname) \
- EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
-#define EXT4_RW_ATTR_SBI_UI(name, elname) \
- EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
-
-#define ATTR_LIST(name) &ext4_attr_##name.attr
-#define EXT4_DEPRECATED_ATTR(_name, _val) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = 0444 }, \
- .show = sbi_deprecated_show, \
- .u = { \
- .deprecated_val = _val, \
- }, \
-}
-
-EXT4_RO_ATTR(delayed_allocation_blocks);
-EXT4_RO_ATTR(session_write_kbytes);
-EXT4_RO_ATTR(lifetime_write_kbytes);
-EXT4_RW_ATTR(reserved_clusters);
-EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
- inode_readahead_blks_store, s_inode_readahead_blks);
-EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
-EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
-EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
-EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
-EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
-EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
-EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
-
-static struct attribute *ext4_attrs[] = {
- ATTR_LIST(delayed_allocation_blocks),
- ATTR_LIST(session_write_kbytes),
- ATTR_LIST(lifetime_write_kbytes),
- ATTR_LIST(reserved_clusters),
- ATTR_LIST(inode_readahead_blks),
- ATTR_LIST(inode_goal),
- ATTR_LIST(mb_stats),
- ATTR_LIST(mb_max_to_scan),
- ATTR_LIST(mb_min_to_scan),
- ATTR_LIST(mb_order2_req),
- ATTR_LIST(mb_stream_req),
- ATTR_LIST(mb_group_prealloc),
- ATTR_LIST(max_writeback_mb_bump),
- ATTR_LIST(extent_max_zeroout_kb),
- ATTR_LIST(trigger_fs_error),
- ATTR_LIST(err_ratelimit_interval_ms),
- ATTR_LIST(err_ratelimit_burst),
- ATTR_LIST(warning_ratelimit_interval_ms),
- ATTR_LIST(warning_ratelimit_burst),
- ATTR_LIST(msg_ratelimit_interval_ms),
- ATTR_LIST(msg_ratelimit_burst),
- ATTR_LIST(errors_count),
- ATTR_LIST(first_error_time),
- ATTR_LIST(last_error_time),
- NULL,
-};
-
-/* Features this copy of ext4 supports */
-EXT4_INFO_ATTR(lazy_itable_init);
-EXT4_INFO_ATTR(batched_discard);
-EXT4_INFO_ATTR(meta_bg_resize);
-EXT4_INFO_ATTR(encryption);
-
-static struct attribute *ext4_feat_attrs[] = {
- ATTR_LIST(lazy_itable_init),
- ATTR_LIST(batched_discard),
- ATTR_LIST(meta_bg_resize),
- ATTR_LIST(encryption),
- NULL,
-};
-
-static ssize_t ext4_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
- return a->show ? a->show(a, sbi, buf) : 0;
-}
-
-static ssize_t ext4_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
- return a->store ? a->store(a, sbi, buf, len) : 0;
-}
-
-static void ext4_sb_release(struct kobject *kobj)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- complete(&sbi->s_kobj_unregister);
-}
-
-static const struct sysfs_ops ext4_attr_ops = {
- .show = ext4_attr_show,
- .store = ext4_attr_store,
-};
-
-static struct kobj_type ext4_ktype = {
- .default_attrs = ext4_attrs,
- .sysfs_ops = &ext4_attr_ops,
- .release = ext4_sb_release,
-};
-
-static void ext4_feat_release(struct kobject *kobj)
-{
- complete(&ext4_feat->f_kobj_unregister);
-}
-
-static ssize_t ext4_feat_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "supported\n");
-}
-
-/*
- * We can not use ext4_attr_show/store because it relies on the kobject
- * being embedded in the ext4_sb_info structure which is definitely not
- * true in this case.
- */
-static const struct sysfs_ops ext4_feat_ops = {
- .show = ext4_feat_show,
- .store = NULL,
-};
-
-static struct kobj_type ext4_feat_ktype = {
- .default_attrs = ext4_feat_attrs,
- .sysfs_ops = &ext4_feat_ops,
- .release = ext4_feat_release,
-};
-
/*
* Check whether this filesystem can be mounted based on
* the features present and the RDONLY/RDWR mount requested.
@@ -2794,7 +2474,7 @@ static struct kobj_type ext4_feat_ktype = {
*/
static int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ if (ext4_has_unknown_ext4_incompat_features(sb)) {
ext4_msg(sb, KERN_ERR,
"Couldn't mount because of "
"unsupported optional features (%x)",
@@ -2806,14 +2486,14 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
if (readonly)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+ if (ext4_has_feature_readonly(sb)) {
ext4_msg(sb, KERN_INFO, "filesystem is read-only");
sb->s_flags |= MS_RDONLY;
return 1;
}
/* Check that feature set is OK for a read-write mount */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
"unsupported optional features (%x)",
(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
@@ -2824,7 +2504,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
* Large file size enabled file system can only be mounted
* read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
*/
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
if (sizeof(blkcnt_t) < sizeof(u64)) {
ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
"cannot be mounted RDWR without "
@@ -2832,8 +2512,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
ext4_msg(sb, KERN_ERR,
"Can't support bigalloc feature without "
"extents feature\n");
@@ -2841,8 +2520,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
}
#ifndef CONFIG_QUOTA
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !readonly) {
+ if (ext4_has_feature_quota(sb) && !readonly) {
ext4_msg(sb, KERN_ERR,
"Filesystem with quota feature cannot be mounted RDWR "
"without CONFIG_QUOTA");
@@ -3299,7 +2977,7 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ if (!ext4_has_feature_bigalloc(sb))
return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
sbi->s_itb_per_group + 2);
@@ -3390,10 +3068,10 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
-
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
* There's no need to reserve anything when we aren't using extents.
@@ -3401,8 +3079,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
* hole punching doesn't need new metadata... This is needed especially
* to keep ext2/3 backward compatibility.
*/
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
- return 0;
+ if (!ext4_has_feature_extents(sb))
+ return;
/*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
@@ -3411,26 +3089,13 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
- resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
- EXT4_SB(sb)->s_cluster_bits;
+ resv_clusters = (ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits);
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
- return resv_clusters;
-}
-
-
-static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
-{
- ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
- sbi->s_cluster_bits;
-
- if (count >= clusters)
- return -EINVAL;
-
- atomic64_set(&sbi->s_resv_clusters, count);
- return 0;
+ atomic64_set(&sbi->s_resv_clusters, resv_clusters);
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -3513,9 +3178,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
/* Warn if metadata_csum and gdt_csum are both set. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
ext4_warning(sb, "metadata_csum and uninit_bg are "
"redundant flags; please run fsck.");
@@ -3528,8 +3192,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Load the checksum driver */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -3544,11 +3207,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
"invalid superblock checksum. Run e2fsck?");
silent = 1;
+ ret = -EFSBADCRC;
goto cantfind_ext4;
}
/* Precompute checksum seed for all metadata */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
@@ -3643,23 +3309,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
+ } else {
+ sb->s_iflags |= SB_I_CGROUPWB;
}
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
- (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
- EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
- EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+ (ext4_has_compat_features(sb) ||
+ ext4_has_ro_compat_features(sb) ||
+ ext4_has_incompat_features(sb)))
ext4_msg(sb, KERN_WARNING,
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
set_opt2(sb, HURD_COMPAT);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
goto failed_mount;
@@ -3717,8 +3384,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
- es->s_encryption_level) {
+ if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
es->s_encryption_level);
goto failed_mount;
@@ -3750,8 +3416,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -3775,7 +3440,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (ext4_has_feature_64bit(sb)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
!is_power_of_2(sbi->s_desc_size)) {
@@ -3806,7 +3471,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ if (ext4_has_feature_dir_index(sb)) {
i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH)
sbi->s_hash_unsigned = 3;
@@ -3826,8 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Handle clustersize */
clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ has_bigalloc = ext4_has_feature_bigalloc(sb);
if (has_bigalloc) {
if (clustersize < blocksize) {
ext4_msg(sb, KERN_ERR,
@@ -3946,13 +3610,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_proc_root)
- sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
- if (sbi->s_proc)
- proc_create_data("options", S_IRUGO, sbi->s_proc,
- &ext4_seq_options_fops, sb);
-
bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
@@ -3967,6 +3624,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
goto failed_mount2;
}
@@ -3992,7 +3650,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ if (ext4_has_feature_quota(sb))
sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
@@ -4006,11 +3664,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
- EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_RECOVER));
+ ext4_has_feature_journal_needs_recovery(sb));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
- !(sb->s_flags & MS_RDONLY))
+ if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY))
if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
goto failed_mount3a;
@@ -4018,23 +3674,47 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
*/
- if (!test_opt(sb, NOLOAD) &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
- EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
goto failed_mount_wq;
} else {
+ /* Nojournal mode, all journal mount options are illegal */
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_checksum, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "commit=%lu, fs mounted w/o journal",
+ sbi->s_commit_interval / HZ);
+ goto failed_mount_wq;
+ }
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "data=, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
+ clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
+ if (ext4_has_feature_64bit(sb) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
@@ -4086,18 +3766,16 @@ no_journal:
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
- EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+ if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
(blocksize != PAGE_CACHE_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Unsupported blocksize for fs encryption");
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
- !(sb->s_flags & MS_RDONLY) &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) &&
+ !ext4_has_feature_encrypt(sb)) {
+ ext4_set_feature_encrypt(sb);
ext4_commit_super(sb, 1);
}
@@ -4156,8 +3834,7 @@ no_journal:
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+ if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
sbi->s_want_extra_isize =
@@ -4177,12 +3854,7 @@ no_journal:
"available");
}
- err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
- "reserved pool", ext4_calculate_resv_clusters(sb));
- goto failed_mount4a;
- }
+ ext4_set_resv_clusters(sb);
err = ext4_setup_system_zone(sb);
if (err) {
@@ -4221,7 +3893,7 @@ no_journal:
goto failed_mount6;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
"unable to initialize "
@@ -4233,17 +3905,13 @@ no_journal:
if (err)
goto failed_mount6;
- sbi->s_kobj.kset = ext4_kset;
- init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
- "%s", sb->s_id);
+ err = ext4_register_sysfs(sb);
if (err)
goto failed_mount7;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !(sb->s_flags & MS_RDONLY)) {
+ if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) {
err = ext4_enable_quotas(sb);
if (err)
goto failed_mount8;
@@ -4275,9 +3943,10 @@ no_journal:
"the device does not support discard");
}
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
- *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+ if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+ "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+ *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -4297,7 +3966,7 @@ cantfind_ext4:
#ifdef CONFIG_QUOTA
failed_mount8:
- kobject_del(&sbi->s_kobj);
+ ext4_unregister_sysfs(sb);
#endif
failed_mount7:
ext4_unregister_li_request(sb);
@@ -4337,10 +4006,6 @@ failed_mount2:
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- if (sbi->s_proc) {
- remove_proc_entry("options", sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -4387,7 +4052,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
/* First, test for the existence of a valid inode on disk. Bad
* things happen if we iget() an unused inode, as the subsequent
@@ -4437,7 +4102,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
@@ -4529,7 +4194,7 @@ static int ext4_load_journal(struct super_block *sb,
int err = 0;
int really_read_only;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -4546,7 +4211,7 @@ static int ext4_load_journal(struct super_block *sb,
* crash? For recovery, we need to check in advance whether we
* can get read-write access to the device.
*/
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ if (ext4_has_feature_journal_needs_recovery(sb)) {
if (sb->s_flags & MS_RDONLY) {
ext4_msg(sb, KERN_INFO, "INFO: recovery "
"required on readonly filesystem");
@@ -4577,7 +4242,7 @@ static int ext4_load_journal(struct super_block *sb,
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+ if (!ext4_has_feature_journal_needs_recovery(sb))
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
@@ -4617,7 +4282,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
- if (!sbh)
+ if (!sbh || block_device_ejected(sb))
return error;
if (buffer_write_io_error(sbh)) {
/*
@@ -4665,7 +4330,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
ext4_superblock_csum_set(sb);
mark_buffer_dirty(sbh);
if (sync) {
- error = sync_dirty_buffer(sbh);
+ error = __sync_dirty_buffer(sbh,
+ test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
if (error)
return error;
@@ -4690,7 +4356,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
{
journal_t *journal = EXT4_SB(sb)->s_journal;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (!ext4_has_feature_journal(sb)) {
BUG_ON(journal != NULL);
return;
}
@@ -4698,9 +4364,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
if (jbd2_journal_flush(journal) < 0)
goto out;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+ if (ext4_has_feature_journal_needs_recovery(sb) &&
sb->s_flags & MS_RDONLY) {
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
}
@@ -4720,7 +4386,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
int j_errno;
const char *errstr;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
journal = EXT4_SB(sb)->s_journal;
@@ -4833,10 +4499,11 @@ static int ext4_freeze(struct super_block *sb)
error = jbd2_journal_flush(journal);
if (error < 0)
goto out;
+
+ /* Journal blocked and flushed, clear needs_recovery flag. */
+ ext4_clear_feature_journal_needs_recovery(sb);
}
- /* Journal blocked and flushed, clear needs_recovery flag. */
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
out:
if (journal)
@@ -4854,8 +4521,11 @@ static int ext4_unfreeze(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return 0;
- /* Reset the needs_recovery flag before the fs is unlocked. */
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ if (EXT4_SB(sb)->s_journal) {
+ /* Reset the needs_recovery flag before the fs is unlocked. */
+ ext4_set_feature_journal_needs_recovery(sb);
+ }
+
ext4_commit_super(sb, 1);
return 0;
}
@@ -5006,8 +4676,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
} else {
/* Make sure we can mount this feature set readwrite */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_READONLY) ||
+ if (ext4_has_feature_readonly(sb) ||
!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
@@ -5023,9 +4692,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
ext4_msg(sb, KERN_ERR,
"ext4_remount: Checksum for group %u failed (%u!=%u)",
- g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+ g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
le16_to_cpu(gdp->bg_checksum));
- err = -EINVAL;
+ err = -EFSBADCRC;
goto restore_opts;
}
}
@@ -5055,8 +4724,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sbi->s_mount_state = le16_to_cpu(es->s_state);
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_has_feature_mmp(sb))
if (ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block))) {
err = -EROFS;
@@ -5089,8 +4757,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
- else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ else if (ext4_has_feature_quota(sb)) {
err = ext4_enable_quotas(sb);
if (err)
goto restore_opts;
@@ -5234,7 +4901,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot)
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* Are we journaling quotas? */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ if (ext4_has_feature_quota(sb) ||
sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
@@ -5322,7 +4989,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
};
- BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+ BUG_ON(!ext4_has_feature_quota(sb));
if (!qf_inums[type])
return -EPERM;
@@ -5500,7 +5167,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}
-#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static inline void register_as_ext2(void)
{
int err = register_filesystem(&ext2_fs_type);
@@ -5516,11 +5183,11 @@ static inline void unregister_as_ext2(void)
static inline int ext2_feature_set_ok(struct super_block *sb)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ if (ext4_has_unknown_ext2_incompat_features(sb))
return 0;
if (sb->s_flags & MS_RDONLY)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ if (ext4_has_unknown_ext2_ro_compat_features(sb))
return 0;
return 1;
}
@@ -5530,7 +5197,6 @@ static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif
-#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static inline void register_as_ext3(void)
{
int err = register_filesystem(&ext3_fs_type);
@@ -5546,21 +5212,16 @@ static inline void unregister_as_ext3(void)
static inline int ext3_feature_set_ok(struct super_block *sb)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ if (ext4_has_unknown_ext3_incompat_features(sb))
return 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ if (!ext4_has_feature_journal(sb))
return 0;
if (sb->s_flags & MS_RDONLY)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ if (ext4_has_unknown_ext3_ro_compat_features(sb))
return 0;
return 1;
}
-#else
-static inline void register_as_ext3(void) { }
-static inline void unregister_as_ext3(void) { }
-static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
-#endif
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
@@ -5571,37 +5232,6 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
-static int __init ext4_init_feat_adverts(void)
-{
- struct ext4_features *ef;
- int ret = -ENOMEM;
-
- ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
- if (!ef)
- goto out;
-
- ef->f_kobj.kset = ext4_kset;
- init_completion(&ef->f_kobj_unregister);
- ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
- "features");
- if (ret) {
- kfree(ef);
- goto out;
- }
-
- ext4_feat = ef;
- ret = 0;
-out:
- return ret;
-}
-
-static void ext4_exit_feat_adverts(void)
-{
- kobject_put(&ext4_feat->f_kobj);
- wait_for_completion(&ext4_feat->f_kobj_unregister);
- kfree(ext4_feat);
-}
-
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
@@ -5610,6 +5240,7 @@ static int __init ext4_init_fs(void)
{
int i, err;
+ ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
mutex_init(&ext4_li_mtx);
@@ -5627,21 +5258,15 @@ static int __init ext4_init_fs(void)
err = ext4_init_pageio();
if (err)
- goto out7;
+ goto out5;
err = ext4_init_system_zone();
if (err)
- goto out6;
- ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
- if (!ext4_kset) {
- err = -ENOMEM;
- goto out5;
- }
- ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+ goto out4;
- err = ext4_init_feat_adverts();
+ err = ext4_init_sysfs();
if (err)
- goto out4;
+ goto out3;
err = ext4_init_mballoc();
if (err)
@@ -5666,16 +5291,12 @@ out1:
ext4_mballoc_ready = 0;
ext4_exit_mballoc();
out2:
- ext4_exit_feat_adverts();
-out4:
- if (ext4_proc_root)
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
-out5:
+ ext4_exit_sysfs();
+out3:
ext4_exit_system_zone();
-out6:
+out4:
ext4_exit_pageio();
-out7:
+out5:
ext4_exit_es();
return err;
@@ -5690,9 +5311,7 @@ static void __exit ext4_exit_fs(void)
unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
ext4_exit_mballoc();
- ext4_exit_feat_adverts();
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
+ ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
ext4_exit_es();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index c677f2c..6f7ee30 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,17 +23,21 @@
#include "xattr.h"
#ifdef CONFIG_EXT4_FS_ENCRYPTION
-static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *ext4_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
struct ext4_str cstr, pstr;
- struct inode *inode = d_inode(dentry);
struct ext4_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
int res;
u32 plen, max_size = inode->i_sb->s_blocksize;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = ext4_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -45,19 +49,19 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
}
/* Symlink is encrypted */
sd = (struct ext4_encrypted_symlink_data *)caddr;
cstr.name = sd->encrypted_path;
- cstr.len = le32_to_cpu(sd->len);
+ cstr.len = le16_to_cpu(sd->len);
if ((cstr.len +
sizeof(struct ext4_encrypted_symlink_data) - 1) >
max_size) {
/* Symlink data on the disk is corrupted */
- res = -EIO;
+ res = -EFSCORRUPTED;
goto errout;
}
plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
@@ -75,24 +79,20 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
/* Null-terminate the name */
if (res <= plen)
paddr[res] = '\0';
- if (cpage) {
- kunmap(cpage);
+ if (cpage)
page_cache_release(cpage);
- }
- return *cookie = paddr;
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
- if (cpage) {
- kunmap(cpage);
+ if (cpage)
page_cache_release(cpage);
- }
kfree(paddr);
return ERR_PTR(res);
}
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext4_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -103,8 +103,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -114,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
new file mode 100644
index 0000000..1420a3c
--- /dev/null
+++ b/fs/ext4/sysfs.c
@@ -0,0 +1,448 @@
+/*
+ * linux/fs/ext4/sysfs.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+typedef enum {
+ attr_noop,
+ attr_delayed_allocation_blocks,
+ attr_session_write_kbytes,
+ attr_lifetime_write_kbytes,
+ attr_reserved_clusters,
+ attr_inode_readahead,
+ attr_trigger_test_error,
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_atomic,
+} attr_id_t;
+
+typedef enum {
+ ptr_explicit,
+ ptr_ext4_sb_info_offset,
+ ptr_ext4_super_block_offset,
+} attr_ptr_t;
+
+static const char *proc_dirname = "fs/ext4";
+static struct proc_dir_entry *ext4_proc_root;
+
+struct ext4_attr {
+ struct attribute attr;
+ short attr_id;
+ short attr_ptr;
+ union {
+ int offset;
+ void *explicit_ptr;
+ } u;
+};
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1)));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
+ return -EINVAL;
+
+ sbi->s_inode_readahead_blks = t;
+ return count;
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits);
+ int ret;
+
+ ret = kstrtoull(skip_spaces(buf), 0, &val);
+ if (!ret || val >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, val);
+ return count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ int len = count;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+
+ if (len)
+ ext4_error(sbi->s_sb, "%.*s", len, buf);
+ return count;
+}
+
+#define EXT4_ATTR(_name,_mode,_id) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+
+#define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name)
+
+#define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature)
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .attr_ptr = ptr_##_struct##_offset, \
+ .u = { \
+ .offset = offsetof(struct _struct, _elname),\
+ }, \
+}
+
+#define EXT4_RO_ATTR_ES_UI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
+
+#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
+
+#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .attr_ptr = ptr_explicit, \
+ .u = { \
+ .explicit_ptr = _ptr, \
+ }, \
+}
+
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
+EXT4_ATTR_FUNC(session_write_kbytes, 0444);
+EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+EXT4_ATTR_FUNC(reserved_clusters, 0644);
+
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
+ ext4_sb_info, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+
+static unsigned int old_bump_val = 128;
+EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
+
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(errors_count),
+ ATTR_LIST(first_error_time),
+ ATTR_LIST(last_error_time),
+ NULL,
+};
+
+/* Features this copy of ext4 supports */
+EXT4_ATTR_FEATURE(lazy_itable_init);
+EXT4_ATTR_FEATURE(batched_discard);
+EXT4_ATTR_FEATURE(meta_bg_resize);
+EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(metadata_csum_seed);
+
+static struct attribute *ext4_feat_attrs[] = {
+ ATTR_LIST(lazy_itable_init),
+ ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
+ ATTR_LIST(encryption),
+ ATTR_LIST(metadata_csum_seed),
+ NULL,
+};
+
+static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
+{
+ switch (a->attr_ptr) {
+ case ptr_explicit:
+ return a->u.explicit_ptr;
+ case ptr_ext4_sb_info_offset:
+ return (void *) (((char *) sbi) + a->u.offset);
+ case ptr_ext4_super_block_offset:
+ return (void *) (((char *) sbi->s_es) + a->u.offset);
+ }
+ return NULL;
+}
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+ void *ptr = calc_ptr(a, sbi);
+
+ switch (a->attr_id) {
+ case attr_delayed_allocation_blocks:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ case attr_session_write_kbytes:
+ return session_write_kbytes_show(a, sbi, buf);
+ case attr_lifetime_write_kbytes:
+ return lifetime_write_kbytes_show(a, sbi, buf);
+ case attr_reserved_clusters:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)
+ atomic64_read(&sbi->s_resv_clusters));
+ case attr_inode_readahead:
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned int *) ptr));
+ case attr_pointer_atomic:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ atomic_read((atomic_t *) ptr));
+ case attr_feature:
+ return snprintf(buf, PAGE_SIZE, "supported\n");
+ }
+
+ return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+ void *ptr = calc_ptr(a, sbi);
+ unsigned long t;
+ int ret;
+
+ switch (a->attr_id) {
+ case attr_reserved_clusters:
+ return reserved_clusters_store(a, sbi, buf, len);
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *((unsigned int *) ptr) = t;
+ return len;
+ case attr_inode_readahead:
+ return inode_readahead_blks_store(a, sbi, buf, len);
+ case attr_trigger_test_error:
+ return trigger_test_error(a, sbi, buf, len);
+ }
+ return 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+static struct kobj_type ext4_sb_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
+static struct kobj_type ext4_ktype = {
+ .sysfs_ops = &ext4_attr_ops,
+};
+
+static struct kset ext4_kset = {
+ .kobj = {.ktype = &ext4_ktype},
+};
+
+static struct kobj_type ext4_feat_ktype = {
+ .default_attrs = ext4_feat_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+};
+
+static struct kobject ext4_feat = {
+ .kset = &ext4_kset,
+};
+
+#define PROC_FILE_SHOW_DEFN(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
+} \
+\
+static const struct file_operations ext4_seq_##name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
+#define PROC_FILE_LIST(name) \
+ { __stringify(name), &ext4_seq_##name##_fops }
+
+PROC_FILE_SHOW_DEFN(es_shrinker_info);
+PROC_FILE_SHOW_DEFN(options);
+
+static struct ext4_proc_files {
+ const char *name;
+ const struct file_operations *fops;
+} proc_files[] = {
+ PROC_FILE_LIST(options),
+ PROC_FILE_LIST(es_shrinker_info),
+ PROC_FILE_LIST(mb_groups),
+ { NULL, NULL },
+};
+
+int ext4_register_sysfs(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_proc_files *p;
+ int err;
+
+ sbi->s_kobj.kset = &ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ return err;
+
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+ if (sbi->s_proc) {
+ for (p = proc_files; p->name; p++)
+ proc_create_data(p->name, S_IRUGO, sbi->s_proc,
+ p->fops, sb);
+ }
+ return 0;
+}
+
+void ext4_unregister_sysfs(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_proc_files *p;
+
+ if (sbi->s_proc) {
+ for (p = proc_files; p->name; p++)
+ remove_proc_entry(p->name, sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+}
+
+int __init ext4_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&ext4_kset.kobj, "ext4");
+ ext4_kset.kobj.parent = fs_kobj;
+ ret = kset_register(&ext4_kset);
+ if (ret)
+ return ret;
+
+ ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype,
+ NULL, "features");
+ if (ret)
+ kset_unregister(&ext4_kset);
+ else
+ ext4_proc_root = proc_mkdir(proc_dirname, NULL);
+ return ret;
+}
+
+void ext4_exit_sysfs(void)
+{
+ kobject_put(&ext4_feat);
+ kset_unregister(&ext4_kset);
+ remove_proc_entry(proc_dirname, NULL);
+ ext4_proc_root = NULL;
+}
+
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 16e28c0..a95151e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -68,10 +68,8 @@
printk("\n"); \
} while (0)
# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
+ printk(KERN_DEBUG "block %pg:%lu: ", \
+ bh->b_bdev, (unsigned long) bh->b_blocknr); \
printk(f); \
printk("\n"); \
} while (0)
@@ -195,7 +193,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
- return -EIO;
+ return -EFSCORRUPTED;
e = next;
}
@@ -205,7 +203,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
(void *)e + sizeof(__u32) ||
value_start + le16_to_cpu(entry->e_value_offs) +
le32_to_cpu(entry->e_value_size) > end))
- return -EIO;
+ return -EFSCORRUPTED;
entry = EXT4_XATTR_NEXT(entry);
}
@@ -222,9 +220,9 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
- return -EIO;
+ return -EFSCORRUPTED;
if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
- return -EIO;
+ return -EFSBADCRC;
error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
bh->b_data);
if (!error)
@@ -239,7 +237,7 @@ ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
if (entry->e_value_block != 0 || value_size > size ||
le16_to_cpu(entry->e_value_offs) + value_size > size)
- return -EIO;
+ return -EFSCORRUPTED;
return 0;
}
@@ -266,7 +264,7 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
}
*pentry = entry;
if (!cmp && ext4_xattr_check_entry(entry, size))
- return -EIO;
+ return -EFSCORRUPTED;
return cmp ? -ENODATA : 0;
}
@@ -297,13 +295,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
@@ -404,20 +402,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
const struct xattr_handler *handler =
ext4_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest)
return -ERANGE;
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
}
- return buffer_size - rest;
+ return buffer_size - rest; /* total size */
}
static int
@@ -445,7 +447,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
@@ -525,12 +527,12 @@ errout:
static void ext4_xattr_update_super_block(handle_t *handle,
struct super_block *sb)
{
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+ if (ext4_has_feature_xattr(sb))
return;
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
- EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
+ ext4_set_feature_xattr(sb);
ext4_handle_dirty_super(handle, sb);
}
}
@@ -751,7 +753,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
if (ext4_xattr_check_block(inode, bs->bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
/* Find the named attribute. */
@@ -811,7 +813,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
bs->bh);
}
unlock_buffer(bs->bh);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (!error)
error = ext4_handle_dirty_xattr_block(handle,
@@ -855,7 +857,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
}
error = ext4_xattr_set_entry(i, s);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
@@ -1314,7 +1316,7 @@ retry:
if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
base = BHDR(bh);
@@ -1579,7 +1581,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
return 1;
if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
- return -EIO;
+ return -EFSCORRUPTED;
if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
(char *)header2 + le16_to_cpu(entry2->e_value_offs),
le32_to_cpu(entry1->e_value_size)))
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 95d90e0..3e81bdc 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -11,38 +11,20 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
- const size_t total_len = prefix_len + name_len + 1;
-
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
-ext4_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
static int
-ext4_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -76,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
.set = ext4_xattr_security_set,
};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 891ee2d..2a3c6f9 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -12,40 +12,26 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext4_xattr_trusted_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
-ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+ext4_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
static int
-ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 6ed932b..d152f43 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -11,30 +11,17 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext4_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
-ext4_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
@@ -42,11 +29,10 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext4_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index c629762..b0a9dc9 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -45,7 +45,7 @@ config F2FS_FS_POSIX_ACL
default y
help
Posix Access Control Lists (ACLs) support permissions for users and
- gourps beyond the owner/group/world scheme.
+ groups beyond the owner/group/world scheme.
To learn more about Access Control Lists, visit the POSIX ACLs for
Linux website <http://acl.bestbits.at/>.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 396be1a..08e101e 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o
f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-y += shrinker.o extent_cache.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b70bbe1..3842af9 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -47,7 +47,8 @@ repeat:
/*
* We guarantee no failure on the returned page.
*/
-struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
+ bool is_meta)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
@@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
.blk_addr = index,
.encrypted_page = NULL,
};
+
+ if (unlikely(!is_meta))
+ fio.rw &= ~REQ_META;
repeat:
page = grab_cache_page(mapping, index);
if (!page) {
@@ -69,18 +73,39 @@ repeat:
fio.page = page;
- if (f2fs_submit_page_bio(&fio))
+ if (f2fs_submit_page_bio(&fio)) {
+ f2fs_put_page(page, 1);
goto repeat;
+ }
lock_page(page);
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
+
+ /*
+ * if there is any IO error when accessing device, make our filesystem
+ * readonly and make sure do not write checkpoint with non-uptodate
+ * meta page.
+ */
+ if (unlikely(!PageUptodate(page)))
+ f2fs_stop_checkpoint(sbi);
out:
return page;
}
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, true);
+}
+
+/* for POR only */
+struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, false);
+}
+
bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
{
switch (type) {
@@ -115,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+ int type, bool sync)
{
block_t prev_blk_addr = 0;
struct page *page;
@@ -123,10 +149,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO,
+ .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
.encrypted_page = NULL,
};
+ if (unlikely(type == META_POR))
+ fio.rw &= ~REQ_META;
+
for (; nrpages-- > 0; blkno++) {
if (!is_valid_blkaddr(sbi, blkno, type))
@@ -186,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
}
static int f2fs_write_meta_page(struct page *page,
@@ -208,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page,
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
- if (wbc->for_reclaim)
+ if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, META, WRITE);
return 0;
@@ -247,7 +276,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
struct pagevec pvec;
long nwritten = 0;
struct writeback_control wbc = {
@@ -267,6 +296,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (prev == LONG_MAX)
+ prev = page->index - 1;
+ if (nr_to_write != LONG_MAX && page->index != prev + 1) {
+ pagevec_release(&pvec);
+ goto stop;
+ }
+
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -287,13 +323,14 @@ continue_unlock:
break;
}
nwritten++;
+ prev = page->index;
if (unlikely(nwritten >= nr_to_write))
break;
}
pagevec_release(&pvec);
cond_resched();
}
-
+stop:
if (nwritten)
f2fs_submit_merged_bio(sbi, type, WRITE);
@@ -326,26 +363,18 @@ const struct address_space_operations f2fs_meta_aops = {
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
struct inode_management *im = &sbi->im[type];
- struct ino_entry *e;
+ struct ino_entry *e, *tmp;
+
+ tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
retry:
- if (radix_tree_preload(GFP_NOFS)) {
- cond_resched();
- goto retry;
- }
+ radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
spin_lock(&im->ino_lock);
-
e = radix_tree_lookup(&im->ino_root, ino);
if (!e) {
- e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
- if (!e) {
- spin_unlock(&im->ino_lock);
- radix_tree_preload_end();
- goto retry;
- }
+ e = tmp;
if (radix_tree_insert(&im->ino_root, ino, e)) {
spin_unlock(&im->ino_lock);
- kmem_cache_free(ino_entry_slab, e);
radix_tree_preload_end();
goto retry;
}
@@ -358,6 +387,9 @@ retry:
}
spin_unlock(&im->ino_lock);
radix_tree_preload_end();
+
+ if (e != tmp)
+ kmem_cache_free(ino_entry_slab, tmp);
}
static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -378,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
spin_unlock(&im->ino_lock);
}
-void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* add new dirty ino entry into list */
__add_ino_entry(sbi, ino, type);
}
-void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* remove dirty ino entry from list */
__remove_ino_entry(sbi, ino, type);
@@ -402,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi)
{
struct ino_entry *e, *tmp;
int i;
@@ -458,29 +490,39 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
__remove_ino_entry(sbi, ino, ORPHAN_INO);
}
-static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct inode *inode = f2fs_iget(sbi->sb, ino);
- f2fs_bug_on(sbi, IS_ERR(inode));
+ struct inode *inode;
+
+ inode = f2fs_iget(sbi->sb, ino);
+ if (IS_ERR(inode)) {
+ /*
+ * there should be a bug that we can't find the entry
+ * to orphan inode.
+ */
+ f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT);
+ return PTR_ERR(inode);
+ }
+
clear_nlink(inode);
/* truncate all the data during iput */
iput(inode);
+ return 0;
}
-void recover_orphan_inodes(struct f2fs_sb_info *sbi)
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
block_t start_blk, orphan_blocks, i, j;
+ int err;
if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
- return;
-
- set_sbi_flag(sbi, SBI_POR_DOING);
+ return 0;
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
- ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
+ ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
for (i = 0; i < orphan_blocks; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
@@ -489,14 +531,17 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
orphan_blk = (struct f2fs_orphan_block *)page_address(page);
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
- recover_orphan_inode(sbi, ino);
+ err = recover_orphan_inode(sbi, ino);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
+ }
}
f2fs_put_page(page, 1);
}
/* clear Orphan Flag */
clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
- clear_sbi_flag(sbi, SBI_POR_DOING);
- return;
+ return 0;
}
static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -504,7 +549,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
struct list_head *head;
struct f2fs_orphan_block *orphan_blk = NULL;
unsigned int nentries = 0;
- unsigned short index;
+ unsigned short index = 1;
unsigned short orphan_blocks;
struct page *page = NULL;
struct ino_entry *orphan = NULL;
@@ -512,11 +557,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
- for (index = 0; index < orphan_blocks; index++)
- grab_meta_page(sbi, start_blk + index);
-
- index = 1;
-
/*
* we don't need to do spin_lock(&im->ino_lock) here, since all the
* orphan inode operations are covered under f2fs_lock_op().
@@ -527,12 +567,10 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
/* loop for each orphan inode entry and write them in Jornal block */
list_for_each_entry(orphan, head, list) {
if (!page) {
- page = find_get_page(META_MAPPING(sbi), start_blk++);
- f2fs_bug_on(sbi, !page);
+ page = grab_meta_page(sbi, start_blk++);
orphan_blk =
(struct f2fs_orphan_block *)page_address(page);
memset(orphan_blk, 0, sizeof(*orphan_blk));
- f2fs_put_page(page, 0);
}
orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
@@ -684,46 +722,48 @@ fail_no_cp:
return -EINVAL;
}
-static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
+static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
- return -EEXIST;
+ if (is_inode_flag_set(fi, flag))
+ return;
- set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- F2FS_I(inode)->dirty_dir = new;
- list_add_tail(&new->list, &sbi->dir_inode_list);
- stat_inc_dirty_dir(sbi);
- return 0;
+ set_inode_flag(fi, flag);
+ list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
+ stat_inc_dirty_inode(sbi, type);
}
-void update_dirty_page(struct inode *inode, struct page *page)
+static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *new;
- int ret = 0;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+ if (get_dirty_pages(inode) ||
+ !is_inode_flag_set(F2FS_I(inode), flag))
return;
- if (!S_ISDIR(inode->i_mode)) {
- inode_inc_dirty_pages(inode);
- goto out;
- }
+ list_del_init(&fi->dirty_list);
+ clear_inode_flag(fi, flag);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), type);
+}
+
+void update_dirty_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
- new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ return;
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
+ spin_lock(&sbi->inode_lock[type]);
+ __add_dirty_inode(inode, type);
inode_inc_dirty_pages(inode);
- spin_unlock(&sbi->dir_inode_lock);
+ spin_unlock(&sbi->inode_lock[type]);
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
-out:
SetPagePrivate(page);
f2fs_trace_pid(page);
}
@@ -731,70 +771,60 @@ out:
void add_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *new =
- f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- int ret = 0;
-
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
-
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
- spin_unlock(&sbi->dir_inode_lock);
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
+ spin_lock(&sbi->inode_lock[DIR_INODE]);
+ __add_dirty_inode(inode, DIR_INODE);
+ spin_unlock(&sbi->inode_lock[DIR_INODE]);
}
-void remove_dirty_dir_inode(struct inode *inode)
+void remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *entry;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
- if (!S_ISDIR(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
return;
- spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_pages(inode) ||
- !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
- spin_unlock(&sbi->dir_inode_lock);
- return;
- }
-
- entry = F2FS_I(inode)->dirty_dir;
- list_del(&entry->list);
- F2FS_I(inode)->dirty_dir = NULL;
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- stat_dec_dirty_dir(sbi);
- spin_unlock(&sbi->dir_inode_lock);
- kmem_cache_free(inode_entry_slab, entry);
+ spin_lock(&sbi->inode_lock[type]);
+ __remove_dirty_inode(inode, type);
+ spin_unlock(&sbi->inode_lock[type]);
/* Only from the recovery routine */
- if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
- clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
+ clear_inode_flag(fi, FI_DELAY_IPUT);
iput(inode);
}
}
-void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
{
struct list_head *head;
- struct inode_entry *entry;
struct inode *inode;
+ struct f2fs_inode_info *fi;
+ bool is_dir = (type == DIR_INODE);
+
+ trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
- spin_lock(&sbi->dir_inode_lock);
+ spin_lock(&sbi->inode_lock[type]);
- head = &sbi->dir_inode_list;
+ head = &sbi->inode_list[type];
if (list_empty(head)) {
- spin_unlock(&sbi->dir_inode_lock);
- return;
+ spin_unlock(&sbi->inode_lock[type]);
+ trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
+ return 0;
}
- entry = list_entry(head->next, struct inode_entry, list);
- inode = igrab(entry->inode);
- spin_unlock(&sbi->dir_inode_lock);
+ fi = list_entry(head->next, struct f2fs_inode_info, dirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[type]);
if (inode) {
filemap_fdatawrite(inode->i_mapping);
iput(inode);
@@ -829,11 +859,9 @@ retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- sync_dirty_dir_inodes(sbi);
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
+ err = sync_dirty_inodes(sbi, DIR_INODE);
+ if (err)
goto out;
- }
goto retry_flush_dents;
}
@@ -846,10 +874,9 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- sync_node_pages(sbi, 0, &wbc);
- if (unlikely(f2fs_cp_error(sbi))) {
+ err = sync_node_pages(sbi, 0, &wbc);
+ if (err) {
f2fs_unlock_all(sbi);
- err = -EIO;
goto out;
}
goto retry_flush_nodes;
@@ -880,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -892,18 +919,21 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
__u32 crc32 = 0;
int i;
int cp_payload_blks = __cp_payload(sbi);
+ block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
+ bool invalidate = false;
/*
* This avoids to conduct wrong roll-forward operations and uses
* metapages, so should be called prior to sync_meta_pages below.
*/
- discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
+ if (discard_next_dnode(sbi, discard_blk))
+ invalidate = true;
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
}
next_free_nid(sbi, &last_nid);
@@ -985,6 +1015,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_addr(sbi);
+ /* need to wait for end_io results */
+ wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return -EIO;
+
/* write out checkpoint buffer at block 0 */
update_meta_page(sbi, ckpt, start_blk++);
@@ -1011,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
wait_on_all_pages_writeback(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
@@ -1026,22 +1061,33 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* wait for previous submitted meta pages writeback */
wait_on_all_pages_writeback(sbi);
- release_dirty_inode(sbi);
+ /*
+ * invalidate meta page which is used temporarily for zeroing out
+ * block at the end of warm node chain.
+ */
+ if (invalidate)
+ invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
+ discard_blk);
+
+ release_ino_entry(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
+
+ return 0;
}
/*
* We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
+ int err = 0;
mutex_lock(&sbi->cp_mutex);
@@ -1049,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
goto out;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
goto out;
- if (f2fs_readonly(sbi->sb))
+ }
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
goto out;
+ }
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
- if (block_operations(sbi))
+ err = block_operations(sbi);
+ if (err)
goto out;
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
@@ -1078,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, cpc);
+ err = do_checkpoint(sbi, cpc);
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
@@ -1086,9 +1137,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason == CP_RECOVERY)
f2fs_msg(sbi->sb, KERN_NOTICE,
"checkpoint: version = %llx", ckpt_ver);
+
+ /* do checkpoint periodically */
+ f2fs_update_time(sbi, CP_TIME);
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
mutex_unlock(&sbi->cp_mutex);
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+ return err;
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
index 95b8f93..5de2d86 100644
--- a/fs/f2fs/crypto_key.c
+++ b/fs/f2fs/crypto_key.c
@@ -92,8 +92,7 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
if (!ci)
return;
- if (ci->ci_keyring_key)
- key_put(ci->ci_keyring_key);
+ key_put(ci->ci_keyring_key);
crypto_free_ablkcipher(ci->ci_ctfm);
kmem_cache_free(f2fs_crypt_info_cachep, ci);
}
@@ -123,7 +122,7 @@ int _f2fs_get_encryption_info(struct inode *inode)
struct key *keyring_key = NULL;
struct f2fs_encryption_key *master_key;
struct f2fs_encryption_context ctx;
- struct user_key_payload *ukp;
+ const struct user_key_payload *ukp;
struct crypto_ablkcipher *ctfm;
const char *cipher_str;
char raw_key[F2FS_MAX_KEY_SIZE];
@@ -200,7 +199,7 @@ retry:
}
crypt_info->ci_keyring_key = keyring_key;
BUG_ON(keyring_key->type != &key_type_logon);
- ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
res = -EINVAL;
goto out;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f71e19a..ac9e7c6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -14,6 +14,7 @@
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/prefetch.h>
@@ -26,16 +27,13 @@
#include "trace.h"
#include <trace/events/f2fs.h>
-static struct kmem_cache *extent_tree_slab;
-static struct kmem_cache *extent_node_slab;
-
-static void f2fs_read_end_io(struct bio *bio, int err)
+static void f2fs_read_end_io(struct bio *bio)
{
struct bio_vec *bvec;
int i;
if (f2fs_bio_encrypted(bio)) {
- if (err) {
+ if (bio->bi_error) {
f2fs_release_crypto_ctx(bio->bi_private);
} else {
f2fs_end_io_crypto_work(bio->bi_private, bio);
@@ -46,7 +44,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- if (!err) {
+ if (!bio->bi_error) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -57,7 +55,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
bio_put(bio);
}
-static void f2fs_write_end_io(struct bio *bio, int err)
+static void f2fs_write_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
@@ -68,7 +66,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
f2fs_restore_and_release_control_page(&page);
- if (unlikely(err)) {
+ if (unlikely(bio->bi_error)) {
set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
f2fs_stop_checkpoint(sbi);
@@ -92,8 +90,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
{
struct bio *bio;
- /* No failure on bio allocation */
- bio = bio_alloc(GFP_NOIO, npages);
+ bio = f2fs_bio_alloc(npages);
bio->bi_bdev = sbi->sb->s_bdev;
bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
@@ -158,7 +155,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
bio_put(bio);
- f2fs_put_page(page, 1);
return -EFAULT;
}
@@ -229,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn)
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
- set_page_dirty(node_page);
+ if (set_page_dirty(node_page))
+ dn->node_changed = true;
}
int reserve_new_block(struct dnode_of_data *dn)
@@ -266,648 +263,21 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
-static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- pgoff_t start_fofs, end_fofs;
- block_t start_blkaddr;
-
- read_lock(&fi->ext_lock);
- if (fi->ext.len == 0) {
- read_unlock(&fi->ext_lock);
- return false;
- }
-
- stat_inc_total_hit(inode->i_sb);
-
- start_fofs = fi->ext.fofs;
- end_fofs = fi->ext.fofs + fi->ext.len - 1;
- start_blkaddr = fi->ext.blk;
-
- if (pgofs >= start_fofs && pgofs <= end_fofs) {
- *ei = fi->ext;
- stat_inc_read_hit(inode->i_sb);
- read_unlock(&fi->ext_lock);
- return true;
- }
- read_unlock(&fi->ext_lock);
- return false;
-}
-
-static bool update_extent_info(struct inode *inode, pgoff_t fofs,
- block_t blkaddr)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- pgoff_t start_fofs, end_fofs;
- block_t start_blkaddr, end_blkaddr;
- int need_update = true;
-
- write_lock(&fi->ext_lock);
-
- start_fofs = fi->ext.fofs;
- end_fofs = fi->ext.fofs + fi->ext.len - 1;
- start_blkaddr = fi->ext.blk;
- end_blkaddr = fi->ext.blk + fi->ext.len - 1;
-
- /* Drop and initialize the matched extent */
- if (fi->ext.len == 1 && fofs == start_fofs)
- fi->ext.len = 0;
-
- /* Initial extent */
- if (fi->ext.len == 0) {
- if (blkaddr != NULL_ADDR) {
- fi->ext.fofs = fofs;
- fi->ext.blk = blkaddr;
- fi->ext.len = 1;
- }
- goto end_update;
- }
-
- /* Front merge */
- if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) {
- fi->ext.fofs--;
- fi->ext.blk--;
- fi->ext.len++;
- goto end_update;
- }
-
- /* Back merge */
- if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) {
- fi->ext.len++;
- goto end_update;
- }
-
- /* Split the existing extent */
- if (fi->ext.len > 1 &&
- fofs >= start_fofs && fofs <= end_fofs) {
- if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
- fi->ext.len = fofs - start_fofs;
- } else {
- fi->ext.fofs = fofs + 1;
- fi->ext.blk = start_blkaddr + fofs - start_fofs + 1;
- fi->ext.len -= fofs - start_fofs + 1;
- }
- } else {
- need_update = false;
- }
-
- /* Finally, if the extent is very fragmented, let's drop the cache. */
- if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
- fi->ext.len = 0;
- set_inode_flag(fi, FI_NO_EXTENT);
- need_update = true;
- }
-end_update:
- write_unlock(&fi->ext_lock);
- return need_update;
-}
-
-static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_info *ei,
- struct rb_node *parent, struct rb_node **p)
-{
- struct extent_node *en;
-
- en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
- if (!en)
- return NULL;
-
- en->ei = *ei;
- INIT_LIST_HEAD(&en->list);
-
- rb_link_node(&en->rb_node, parent, p);
- rb_insert_color(&en->rb_node, &et->root);
- et->count++;
- atomic_inc(&sbi->total_ext_node);
- return en;
-}
-
-static void __detach_extent_node(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_node *en)
-{
- rb_erase(&en->rb_node, &et->root);
- et->count--;
- atomic_dec(&sbi->total_ext_node);
-
- if (et->cached_en == en)
- et->cached_en = NULL;
-}
-
-static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi,
- nid_t ino)
+int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
{
- struct extent_tree *et;
-
- down_read(&sbi->extent_tree_lock);
- et = radix_tree_lookup(&sbi->extent_tree_root, ino);
- if (!et) {
- up_read(&sbi->extent_tree_lock);
- return NULL;
- }
- atomic_inc(&et->refcount);
- up_read(&sbi->extent_tree_lock);
-
- return et;
-}
-
-static struct extent_tree *__grab_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et;
- nid_t ino = inode->i_ino;
-
- down_write(&sbi->extent_tree_lock);
- et = radix_tree_lookup(&sbi->extent_tree_root, ino);
- if (!et) {
- et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
- f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
- memset(et, 0, sizeof(struct extent_tree));
- et->ino = ino;
- et->root = RB_ROOT;
- et->cached_en = NULL;
- rwlock_init(&et->lock);
- atomic_set(&et->refcount, 0);
- et->count = 0;
- sbi->total_ext_tree++;
- }
- atomic_inc(&et->refcount);
- up_write(&sbi->extent_tree_lock);
-
- return et;
-}
-
-static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
- unsigned int fofs)
-{
- struct rb_node *node = et->root.rb_node;
- struct extent_node *en;
-
- if (et->cached_en) {
- struct extent_info *cei = &et->cached_en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
- return et->cached_en;
- }
-
- while (node) {
- en = rb_entry(node, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs) {
- node = node->rb_left;
- } else if (fofs >= en->ei.fofs + en->ei.len) {
- node = node->rb_right;
- } else {
- et->cached_en = en;
- return en;
- }
- }
- return NULL;
-}
-
-static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_node *en)
-{
- struct extent_node *prev;
- struct rb_node *node;
-
- node = rb_prev(&en->rb_node);
- if (!node)
- return NULL;
-
- prev = rb_entry(node, struct extent_node, rb_node);
- if (__is_back_mergeable(&en->ei, &prev->ei)) {
- en->ei.fofs = prev->ei.fofs;
- en->ei.blk = prev->ei.blk;
- en->ei.len += prev->ei.len;
- __detach_extent_node(sbi, et, prev);
- return prev;
- }
- return NULL;
-}
-
-static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_node *en)
-{
- struct extent_node *next;
- struct rb_node *node;
-
- node = rb_next(&en->rb_node);
- if (!node)
- return NULL;
-
- next = rb_entry(node, struct extent_node, rb_node);
- if (__is_front_mergeable(&en->ei, &next->ei)) {
- en->ei.len += next->ei.len;
- __detach_extent_node(sbi, et, next);
- return next;
- }
- return NULL;
-}
-
-static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_info *ei,
- struct extent_node **den)
-{
- struct rb_node **p = &et->root.rb_node;
- struct rb_node *parent = NULL;
- struct extent_node *en;
-
- while (*p) {
- parent = *p;
- en = rb_entry(parent, struct extent_node, rb_node);
-
- if (ei->fofs < en->ei.fofs) {
- if (__is_front_mergeable(ei, &en->ei)) {
- f2fs_bug_on(sbi, !den);
- en->ei.fofs = ei->fofs;
- en->ei.blk = ei->blk;
- en->ei.len += ei->len;
- *den = __try_back_merge(sbi, et, en);
- return en;
- }
- p = &(*p)->rb_left;
- } else if (ei->fofs >= en->ei.fofs + en->ei.len) {
- if (__is_back_mergeable(ei, &en->ei)) {
- f2fs_bug_on(sbi, !den);
- en->ei.len += ei->len;
- *den = __try_front_merge(sbi, et, en);
- return en;
- }
- p = &(*p)->rb_right;
- } else {
- f2fs_bug_on(sbi, 1);
- }
- }
-
- return __attach_extent_node(sbi, et, ei, parent, p);
-}
-
-static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, bool free_all)
-{
- struct rb_node *node, *next;
- struct extent_node *en;
- unsigned int count = et->count;
-
- node = rb_first(&et->root);
- while (node) {
- next = rb_next(node);
- en = rb_entry(node, struct extent_node, rb_node);
-
- if (free_all) {
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list))
- list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
- }
-
- if (free_all || list_empty(&en->list)) {
- __detach_extent_node(sbi, et, en);
- kmem_cache_free(extent_node_slab, en);
- }
- node = next;
- }
-
- return count - et->count;
-}
-
-static void f2fs_init_extent_tree(struct inode *inode,
- struct f2fs_extent *i_ext)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et;
- struct extent_node *en;
struct extent_info ei;
+ struct inode *inode = dn->inode;
- if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
- return;
-
- et = __grab_extent_tree(inode);
-
- write_lock(&et->lock);
- if (et->count)
- goto out;
-
- set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
- le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
-
- en = __insert_extent_tree(sbi, et, &ei, NULL);
- if (en) {
- et->cached_en = en;
-
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
- }
-out:
- write_unlock(&et->lock);
- atomic_dec(&et->refcount);
-}
-
-static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et;
- struct extent_node *en;
-
- trace_f2fs_lookup_extent_tree_start(inode, pgofs);
-
- et = __find_extent_tree(sbi, inode->i_ino);
- if (!et)
- return false;
-
- read_lock(&et->lock);
- en = __lookup_extent_tree(et, pgofs);
- if (en) {
- *ei = en->ei;
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list))
- list_move_tail(&en->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
- stat_inc_read_hit(sbi->sb);
- }
- stat_inc_total_hit(sbi->sb);
- read_unlock(&et->lock);
-
- trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
-
- atomic_dec(&et->refcount);
- return en ? true : false;
-}
-
-static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
- block_t blkaddr)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et;
- struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
- struct extent_node *den = NULL;
- struct extent_info ei, dei;
- unsigned int endofs;
-
- trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
-
- et = __grab_extent_tree(inode);
-
- write_lock(&et->lock);
-
- /* 1. lookup and remove existing extent info in cache */
- en = __lookup_extent_tree(et, fofs);
- if (!en)
- goto update_extent;
-
- dei = en->ei;
- __detach_extent_node(sbi, et, en);
-
- /* 2. if extent can be split more, split and insert the left part */
- if (dei.len > 1) {
- /* insert left part of split extent into cache */
- if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
- set_extent_info(&ei, dei.fofs, dei.blk,
- fofs - dei.fofs);
- en1 = __insert_extent_tree(sbi, et, &ei, NULL);
- }
-
- /* insert right part of split extent into cache */
- endofs = dei.fofs + dei.len - 1;
- if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
- set_extent_info(&ei, fofs + 1,
- fofs - dei.fofs + dei.blk, endofs - fofs);
- en2 = __insert_extent_tree(sbi, et, &ei, NULL);
- }
- }
-
-update_extent:
- /* 3. update extent in extent cache */
- if (blkaddr) {
- set_extent_info(&ei, fofs, blkaddr, 1);
- en3 = __insert_extent_tree(sbi, et, &ei, &den);
- }
-
- /* 4. update in global extent list */
- spin_lock(&sbi->extent_lock);
- if (en && !list_empty(&en->list))
- list_del(&en->list);
- /*
- * en1 and en2 split from en, they will become more and more smaller
- * fragments after splitting several times. So if the length is smaller
- * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
- */
- if (en1)
- list_add_tail(&en1->list, &sbi->extent_list);
- if (en2)
- list_add_tail(&en2->list, &sbi->extent_list);
- if (en3) {
- if (list_empty(&en3->list))
- list_add_tail(&en3->list, &sbi->extent_list);
- else
- list_move_tail(&en3->list, &sbi->extent_list);
- }
- if (den && !list_empty(&den->list))
- list_del(&den->list);
- spin_unlock(&sbi->extent_lock);
-
- /* 5. release extent node */
- if (en)
- kmem_cache_free(extent_node_slab, en);
- if (den)
- kmem_cache_free(extent_node_slab, den);
-
- write_unlock(&et->lock);
- atomic_dec(&et->refcount);
-}
-
-void f2fs_preserve_extent_tree(struct inode *inode)
-{
- struct extent_tree *et;
- struct extent_info *ext = &F2FS_I(inode)->ext;
- bool sync = false;
-
- if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
- return;
-
- et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino);
- if (!et) {
- if (ext->len) {
- ext->len = 0;
- update_inode_page(inode);
- }
- return;
- }
-
- read_lock(&et->lock);
- if (et->count) {
- struct extent_node *en;
-
- if (et->cached_en) {
- en = et->cached_en;
- } else {
- struct rb_node *node = rb_first(&et->root);
-
- if (!node)
- node = rb_last(&et->root);
- en = rb_entry(node, struct extent_node, rb_node);
- }
-
- if (__is_extent_same(ext, &en->ei))
- goto out;
-
- *ext = en->ei;
- sync = true;
- } else if (ext->len) {
- ext->len = 0;
- sync = true;
- }
-out:
- read_unlock(&et->lock);
- atomic_dec(&et->refcount);
-
- if (sync)
- update_inode_page(inode);
-}
-
-void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
-{
- struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
- struct extent_node *en, *tmp;
- unsigned long ino = F2FS_ROOT_INO(sbi);
- struct radix_tree_iter iter;
- void **slot;
- unsigned int found;
- unsigned int node_cnt = 0, tree_cnt = 0;
-
- if (!test_opt(sbi, EXTENT_CACHE))
- return;
-
- if (available_free_memory(sbi, EXTENT_CACHE))
- return;
-
- spin_lock(&sbi->extent_lock);
- list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
- if (!nr_shrink--)
- break;
- list_del_init(&en->list);
- }
- spin_unlock(&sbi->extent_lock);
-
- down_read(&sbi->extent_tree_lock);
- while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
- (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
- unsigned i;
-
- ino = treevec[found - 1]->ino + 1;
- for (i = 0; i < found; i++) {
- struct extent_tree *et = treevec[i];
-
- atomic_inc(&et->refcount);
- write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et, false);
- write_unlock(&et->lock);
- atomic_dec(&et->refcount);
- }
- }
- up_read(&sbi->extent_tree_lock);
-
- down_write(&sbi->extent_tree_lock);
- radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
- F2FS_ROOT_INO(sbi)) {
- struct extent_tree *et = (struct extent_tree *)*slot;
-
- if (!atomic_read(&et->refcount) && !et->count) {
- radix_tree_delete(&sbi->extent_tree_root, et->ino);
- kmem_cache_free(extent_tree_slab, et);
- sbi->total_ext_tree--;
- tree_cnt++;
- }
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn->data_blkaddr = ei.blk + index - ei.fofs;
+ return 0;
}
- up_write(&sbi->extent_tree_lock);
- trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+ return f2fs_reserve_block(dn, index);
}
-void f2fs_destroy_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et;
- unsigned int node_cnt = 0;
-
- if (!test_opt(sbi, EXTENT_CACHE))
- return;
-
- et = __find_extent_tree(sbi, inode->i_ino);
- if (!et)
- goto out;
-
- /* free all extent info belong to this extent tree */
- write_lock(&et->lock);
- node_cnt = __free_extent_tree(sbi, et, true);
- write_unlock(&et->lock);
-
- atomic_dec(&et->refcount);
-
- /* try to find and delete extent tree entry in radix tree */
- down_write(&sbi->extent_tree_lock);
- et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino);
- if (!et) {
- up_write(&sbi->extent_tree_lock);
- goto out;
- }
- f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
- radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
- kmem_cache_free(extent_tree_slab, et);
- sbi->total_ext_tree--;
- up_write(&sbi->extent_tree_lock);
-out:
- trace_f2fs_destroy_extent_tree(inode, node_cnt);
- return;
-}
-
-void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext)
-{
- if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
- f2fs_init_extent_tree(inode, i_ext);
-
- write_lock(&F2FS_I(inode)->ext_lock);
- get_extent_info(&F2FS_I(inode)->ext, *i_ext);
- write_unlock(&F2FS_I(inode)->ext_lock);
-}
-
-static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
-{
- if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
- return false;
-
- if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
- return f2fs_lookup_extent_tree(inode, pgofs, ei);
-
- return lookup_extent_info(inode, pgofs, ei);
-}
-
-void f2fs_update_extent_cache(struct dnode_of_data *dn)
-{
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
- pgoff_t fofs;
-
- f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
- if (is_inode_flag_set(fi, FI_NO_EXTENT))
- return;
-
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
- dn->ofs_in_node;
-
- if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
- return f2fs_update_extent_tree(dn->inode, fofs,
- dn->data_blkaddr);
-
- if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
- sync_inode_page(dn);
-}
-
-struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
+struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+ int rw, bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -924,7 +294,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return read_mapping_page(mapping, index, NULL);
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, for_write);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -935,15 +305,13 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
- }
+ if (err)
+ goto put_err;
f2fs_put_dnode(&dn);
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-ENOENT);
+ err = -ENOENT;
+ goto put_err;
}
got_it:
if (PageUptodate(page)) {
@@ -968,8 +336,12 @@ got_it:
fio.page = page;
err = f2fs_submit_page_bio(&fio);
if (err)
- return ERR_PTR(err);
+ goto put_err;
return page;
+
+put_err:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
}
struct page *find_data_page(struct inode *inode, pgoff_t index)
@@ -982,7 +354,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, false);
if (IS_ERR(page))
return page;
@@ -1002,12 +374,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
* Because, the callers, functions in dir.c and GC, should be able to know
* whether this page exists or not.
*/
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+ bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, for_write);
if (IS_ERR(page))
return page;
@@ -1030,7 +403,8 @@ repeat:
*
* Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
- * Note that, ipage is set only by make_empty_dir.
+ * Note that, ipage is set only by make_empty_dir, and if any error occur,
+ * ipage should be released by this function.
*/
struct page *get_new_data_page(struct inode *inode,
struct page *ipage, pgoff_t index, bool new_i_size)
@@ -1039,10 +413,16 @@ struct page *get_new_data_page(struct inode *inode,
struct page *page;
struct dnode_of_data dn;
int err;
-repeat:
- page = grab_cache_page(mapping, index);
- if (!page)
+
+ page = f2fs_grab_cache_page(mapping, index, true);
+ if (!page) {
+ /*
+ * before exiting, we should make sure ipage will be released
+ * if any error occur.
+ */
+ f2fs_put_page(ipage, 1);
return ERR_PTR(-ENOMEM);
+ }
set_new_dnode(&dn, inode, ipage, NULL, 0);
err = f2fs_reserve_block(&dn, index);
@@ -1062,17 +442,16 @@ repeat:
} else {
f2fs_put_page(page, 1);
- page = get_read_data_page(inode, index, READ_SYNC);
+ /* if ipage exists, blkaddr should be NEW_ADDR */
+ f2fs_bug_on(F2FS_I_SB(inode), ipage);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- goto repeat;
-
- /* wait for read completion */
- lock_page(page);
+ return page;
}
got_it:
- if (new_i_size &&
- i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
- i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ if (new_i_size && i_size_read(inode) <
+ ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
+ i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
/* Only the directory inode sets new_i_size */
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
}
@@ -1107,20 +486,18 @@ alloc:
allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
&sum, seg);
-
- /* direct IO doesn't use extent cache to maximize the performance */
set_data_blkaddr(dn);
/* update i_size */
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
- i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
-
+ if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
+ i_size_write(dn->inode,
+ ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
return 0;
}
-static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+static int __allocate_data_blocks(struct inode *inode, loff_t offset,
size_t count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1129,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
u64 len = F2FS_BYTES_TO_BLK(count);
bool allocated;
u64 end_offset;
+ int err = 0;
while (len) {
- f2fs_balance_fs(sbi);
f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+ err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+ if (err)
goto out;
allocated = false;
@@ -1145,9 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
while (dn.ofs_in_node < end_offset && len) {
block_t blkaddr;
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
+
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
- if (__allocate_data_block(&dn))
+ err = __allocate_data_block(&dn);
+ if (err)
goto sync_out;
allocated = true;
}
@@ -1161,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
}
- return;
+ return err;
sync_out:
if (allocated)
@@ -1170,7 +556,8 @@ sync_out:
f2fs_put_dnode(&dn);
out:
f2fs_unlock_op(sbi);
- return;
+ f2fs_balance_fs(sbi, dn.node_changed);
+ return err;
}
/*
@@ -1182,16 +569,18 @@ out:
* b. do not use extent cache for better performance
* c. give the block addresses to blockdev
*/
-static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
- int create, bool fiemap)
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+ int create, int flag)
{
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
pgoff_t pgofs, end_offset;
int err = 0, ofs = 1;
struct extent_info ei;
bool allocated = false;
+ block_t blkaddr;
map->m_len = 0;
map->m_flags = 0;
@@ -1207,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
}
if (create)
- f2fs_lock_op(F2FS_I_SB(inode));
+ f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -1217,37 +606,59 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
err = 0;
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR && !fiemap)
- goto put_out;
- if (dn.data_blkaddr != NULL_ADDR) {
- map->m_flags = F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- if (dn.data_blkaddr == NEW_ADDR)
- map->m_flags |= F2FS_MAP_UNWRITTEN;
- } else if (create) {
- err = __allocate_data_block(&dn);
- if (err)
- goto put_out;
- allocated = true;
- map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- } else {
- goto put_out;
+ if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto put_out;
+ }
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto put_out;
+ allocated = true;
+ map->m_flags = F2FS_MAP_NEW;
+ } else {
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ dn.data_blkaddr != NEW_ADDR) {
+ if (flag == F2FS_GET_BLOCK_BMAP)
+ err = -ENOENT;
+ goto put_out;
+ }
+
+ /*
+ * preallocated unwritten block should be mapped
+ * for fiemap.
+ */
+ if (dn.data_blkaddr == NEW_ADDR)
+ map->m_flags = F2FS_MAP_UNWRITTEN;
+ }
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ map->m_flags |= F2FS_MAP_MAPPED;
+ map->m_pblk = dn.data_blkaddr;
map->m_len = 1;
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
dn.ofs_in_node++;
pgofs++;
get_next:
+ if (map->m_len >= maxblocks)
+ goto sync_out;
+
if (dn.ofs_in_node >= end_offset) {
if (allocated)
sync_inode_page(&dn);
allocated = false;
f2fs_put_dnode(&dn);
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+ f2fs_lock_op(sbi);
+ }
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
@@ -1255,49 +666,64 @@ get_next:
err = 0;
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR && !fiemap)
- goto put_out;
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
}
- if (maxblocks > map->m_len) {
- block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (blkaddr == NULL_ADDR && create) {
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
err = __allocate_data_block(&dn);
if (err)
goto sync_out;
allocated = true;
map->m_flags |= F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
- }
- /* Give more consecutive addresses for the readahead */
- if ((map->m_pblk != NEW_ADDR &&
- blkaddr == (map->m_pblk + ofs)) ||
- (map->m_pblk == NEW_ADDR &&
- blkaddr == NEW_ADDR)) {
- ofs++;
- dn.ofs_in_node++;
- pgofs++;
- map->m_len++;
- goto get_next;
+ } else {
+ /*
+ * we only merge preallocated unwritten blocks
+ * for fiemap.
+ */
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ blkaddr != NEW_ADDR)
+ goto sync_out;
}
}
+
+ /* Give more consecutive addresses for the readahead */
+ if ((map->m_pblk != NEW_ADDR &&
+ blkaddr == (map->m_pblk + ofs)) ||
+ (map->m_pblk == NEW_ADDR &&
+ blkaddr == NEW_ADDR)) {
+ ofs++;
+ dn.ofs_in_node++;
+ pgofs++;
+ map->m_len++;
+ goto get_next;
+ }
+
sync_out:
if (allocated)
sync_inode_page(&dn);
put_out:
f2fs_put_dnode(&dn);
unlock_out:
- if (create)
- f2fs_unlock_op(F2FS_I_SB(inode));
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+ }
out:
trace_f2fs_map_blocks(inode, map, err);
return err;
}
static int __get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create, bool fiemap)
+ struct buffer_head *bh, int create, int flag)
{
struct f2fs_map_blocks map;
int ret;
@@ -1305,7 +731,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- ret = f2fs_map_blocks(inode, &map, create, fiemap);
+ ret = f2fs_map_blocks(inode, &map, create, flag);
if (!ret) {
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
@@ -1315,15 +741,27 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
}
static int get_data_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create, int flag)
+{
+ return __get_data_block(inode, iblock, bh_result, create, flag);
+}
+
+static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- return __get_data_block(inode, iblock, bh_result, create, false);
+ return __get_data_block(inode, iblock, bh_result, create,
+ F2FS_GET_BLOCK_DIO);
}
-static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
+static int get_data_block_bmap(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- return __get_data_block(inode, iblock, bh_result, create, true);
+ /* Block number less than F2FS MAX BLOCKS */
+ if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
+ return -EFBIG;
+
+ return __get_data_block(inode, iblock, bh_result, create,
+ F2FS_GET_BLOCK_BMAP);
}
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -1341,91 +779,78 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
struct buffer_head map_bh;
sector_t start_blk, last_blk;
- loff_t isize = i_size_read(inode);
+ loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
- bool past_eof = false, whole_file = false;
int ret = 0;
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
if (ret)
return ret;
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+
mutex_lock(&inode->i_mutex);
- if (len >= isize) {
- whole_file = true;
- len = isize;
- }
+ isize = i_size_read(inode);
+ if (start >= isize)
+ goto out;
+
+ if (start + len > isize)
+ len = isize - start;
if (logical_to_blk(inode, len) == 0)
len = blk_to_logical(inode, 1);
start_blk = logical_to_blk(inode, start);
last_blk = logical_to_blk(inode, start + len - 1);
+
next:
memset(&map_bh, 0, sizeof(struct buffer_head));
map_bh.b_size = len;
- ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0);
+ ret = get_data_block(inode, start_blk, &map_bh, 0,
+ F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto out;
/* HOLE */
if (!buffer_mapped(&map_bh)) {
- start_blk++;
-
- if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
- past_eof = 1;
-
- if (past_eof && size) {
- flags |= FIEMAP_EXTENT_LAST;
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- } else if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- size = 0;
- }
+ /* Go through holes util pass the EOF */
+ if (blk_to_logical(inode, start_blk++) < isize)
+ goto prep_next;
+ /* Found a hole beyond isize means no more extents.
+ * Note that the premise is that filesystems don't
+ * punch holes beyond isize and keep size unchanged.
+ */
+ flags |= FIEMAP_EXTENT_LAST;
+ }
- /* if we have holes up to/past EOF then we're done */
- if (start_blk > last_blk || past_eof || ret)
- goto out;
- } else {
- if (start_blk > last_blk && !whole_file) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- goto out;
- }
+ if (size) {
+ if (f2fs_encrypted_inode(inode))
+ flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
- /*
- * if size != 0 then we know we already have an extent
- * to add, so add it.
- */
- if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- if (ret)
- goto out;
- }
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ }
- logical = blk_to_logical(inode, start_blk);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = map_bh.b_size;
- flags = 0;
- if (buffer_unwritten(&map_bh))
- flags = FIEMAP_EXTENT_UNWRITTEN;
+ if (start_blk > last_blk || ret)
+ goto out;
- start_blk += logical_to_blk(inode, size);
+ logical = blk_to_logical(inode, start_blk);
+ phys = blk_to_logical(inode, map_bh.b_blocknr);
+ size = map_bh.b_size;
+ flags = 0;
+ if (buffer_unwritten(&map_bh))
+ flags = FIEMAP_EXTENT_UNWRITTEN;
- /*
- * If we are past the EOF, then we need to make sure as
- * soon as we find a hole that the last extent we found
- * is marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof && logical + size >= isize)
- past_eof = true;
- }
+ start_blk += logical_to_blk(inode, size);
+
+prep_next:
cond_resched();
if (fatal_signal_pending(current))
ret = -EINTR;
@@ -1501,7 +926,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
- if (f2fs_map_blocks(inode, &map, 0, false))
+ if (f2fs_map_blocks(inode, &map, 0,
+ F2FS_GET_BLOCK_READ))
goto set_error_page;
}
got_it:
@@ -1534,25 +960,18 @@ submit_and_realloc:
if (f2fs_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- struct page *cpage;
ctx = f2fs_get_crypto_ctx(inode);
if (IS_ERR(ctx))
goto set_error_page;
/* wait the page to be moved by cleaning */
- cpage = find_lock_page(
- META_MAPPING(F2FS_I_SB(inode)),
- block_nr);
- if (cpage) {
- f2fs_wait_on_page_writeback(cpage,
- DATA);
- f2fs_put_page(cpage, 1);
- }
+ f2fs_wait_on_encrypted_page_writeback(
+ F2FS_I_SB(inode), block_nr);
}
bio = bio_alloc(GFP_KERNEL,
- min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+ min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
f2fs_release_crypto_ctx(ctx);
@@ -1610,6 +1029,9 @@ static int f2fs_read_data_pages(struct file *file,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = file->f_mapping->host;
+ struct page *page = list_entry(pages->prev, struct page, lru);
+
+ trace_f2fs_readpages(inode, page, nr_pages);
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
@@ -1639,6 +1061,11 @@ int do_write_data_page(struct f2fs_io_info *fio)
}
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+
+ /* wait for GCed encrypted page writeback */
+ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
+ fio->blk_addr);
+
fio->encrypted_page = f2fs_encrypt(inode, fio->page);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
@@ -1654,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
*/
if (unlikely(fio->blk_addr != NEW_ADDR &&
!is_cold_data(page) &&
+ !IS_ATOMIC_WRITTEN_PAGE(page) &&
need_inplace_update(inode))) {
rewrite_data_page(fio);
set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
@@ -1750,10 +1178,11 @@ out:
if (err)
ClearPageUptodate(page);
unlock_page(page);
- if (need_balance_fs)
- f2fs_balance_fs(sbi);
- if (wbc->for_reclaim)
+ f2fs_balance_fs(sbi, need_balance_fs);
+ if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ remove_dirty_inode(inode);
+ }
return 0;
redirty_out:
@@ -1770,6 +1199,137 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
return ret;
}
+/*
+ * This function was copied from write_cche_pages from mm/page-writeback.c.
+ * The major change is making write step of cold data page separately from
+ * warm/hot data page.
+ */
+static int f2fs_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, writepage_t writepage,
+ void *data)
+{
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ int tag;
+ int step = 0;
+
+ pagevec_init(&pvec, 0);
+next:
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page->index > end) {
+ done = 1;
+ break;
+ }
+
+ done_index = page->index;
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (step == is_cold_data(page))
+ goto continue_unlock;
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ f2fs_wait_on_page_writeback(page, DATA);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = (*writepage)(page, wbc, data);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
+ }
+
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ if (step < 1) {
+ step++;
+ goto next;
+ }
+
+ if (!cycled && !done) {
+ cycled = 1;
+ index = 0;
+ end = writeback_index - 1;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
+
+ return ret;
+}
+
static int f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1785,11 +1345,19 @@ static int f2fs_write_data_pages(struct address_space *mapping,
if (!mapping->a_ops->writepage)
return 0;
+ /* skip writing if there is no dirty page in this inode */
+ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
+ return 0;
+
if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
+ /* skip writing during file defragment */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+ goto skip_write;
+
/* during POR, we don't need to trigger writepage at all. */
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
@@ -1800,13 +1368,12 @@ static int f2fs_write_data_pages(struct address_space *mapping,
mutex_lock(&sbi->writepages);
locked = true;
}
- ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+ ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
if (locked)
mutex_unlock(&sbi->writepages);
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
-
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
@@ -1819,28 +1386,99 @@ skip_write:
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
+ loff_t i_size = i_size_read(inode);
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size, true);
+ if (to > i_size) {
+ truncate_pagecache(inode, i_size);
+ truncate_blocks(inode, i_size, true);
}
}
+static int prepare_write_begin(struct f2fs_sb_info *sbi,
+ struct page *page, loff_t pos, unsigned len,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct inode *inode = page->mapping->host;
+ pgoff_t index = page->index;
+ struct dnode_of_data dn;
+ struct page *ipage;
+ bool locked = false;
+ struct extent_info ei;
+ int err = 0;
+
+ if (f2fs_has_inline_data(inode) ||
+ (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+ f2fs_lock_op(sbi);
+ locked = true;
+ }
+restart:
+ /* check inline_data */
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode)) {
+ if (pos + len <= MAX_INLINE_DATA) {
+ read_inline_data(page, ipage);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ sync_inode_page(&dn);
+ } else {
+ err = f2fs_convert_inline_page(&dn, page);
+ if (err)
+ goto out;
+ if (dn.data_blkaddr == NULL_ADDR)
+ err = f2fs_get_block(&dn, index);
+ }
+ } else if (locked) {
+ err = f2fs_get_block(&dn, index);
+ } else {
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ } else {
+ bool restart = false;
+
+ /* hole case */
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err || (!err && dn.data_blkaddr == NULL_ADDR))
+ restart = true;
+ if (restart) {
+ f2fs_put_dnode(&dn);
+ f2fs_lock_op(sbi);
+ locked = true;
+ goto restart;
+ }
+ }
+ }
+
+ /* convert_inline_page can make node_changed */
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+out:
+ f2fs_put_dnode(&dn);
+unlock_out:
+ if (locked)
+ f2fs_unlock_op(sbi);
+ return err;
+}
+
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *page, *ipage;
+ struct page *page = NULL;
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
- struct dnode_of_data dn;
+ bool need_balance = false;
+ block_t blkaddr = NULL_ADDR;
int err = 0;
trace_f2fs_write_begin(inode, pos, len, flags);
- f2fs_balance_fs(sbi);
-
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
@@ -1860,57 +1498,50 @@ repeat:
*pagep = page;
- f2fs_lock_op(sbi);
-
- /* check inline_data */
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto unlock_fail;
- }
-
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ err = prepare_write_begin(sbi, page, pos, len,
+ &blkaddr, &need_balance);
+ if (err)
+ goto fail;
- if (f2fs_has_inline_data(inode)) {
- if (pos + len <= MAX_INLINE_DATA) {
- read_inline_data(page, ipage);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- sync_inode_page(&dn);
- goto put_next;
+ if (need_balance && has_not_enough_free_secs(sbi, 0)) {
+ unlock_page(page);
+ f2fs_balance_fs(sbi, true);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ f2fs_put_page(page, 1);
+ goto repeat;
}
- err = f2fs_convert_inline_page(&dn, page);
- if (err)
- goto put_fail;
}
- err = f2fs_reserve_block(&dn, index);
- if (err)
- goto put_fail;
-put_next:
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
-
- if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
- return 0;
f2fs_wait_on_page_writeback(page, DATA);
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+
+ if (len == PAGE_CACHE_SIZE)
+ goto out_update;
+ if (PageUptodate(page))
+ goto out_clear;
+
if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
unsigned start = pos & (PAGE_CACHE_SIZE - 1);
unsigned end = start + len;
/* Reading beyond i_size is simple: memset to zero */
zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
- goto out;
+ goto out_update;
}
- if (dn.data_blkaddr == NEW_ADDR) {
+ if (blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
.rw = READ_SYNC,
- .blk_addr = dn.data_blkaddr,
+ .blk_addr = blkaddr,
.page = page,
.encrypted_page = NULL,
};
@@ -1920,7 +1551,6 @@ put_next:
lock_page(page);
if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
err = -EIO;
goto fail;
}
@@ -1932,23 +1562,18 @@ put_next:
/* avoid symlink page */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
err = f2fs_decrypt_one(inode, page);
- if (err) {
- f2fs_put_page(page, 1);
+ if (err)
goto fail;
- }
}
}
-out:
+out_update:
SetPageUptodate(page);
+out_clear:
clear_cold_data(page);
return 0;
-put_fail:
- f2fs_put_dnode(&dn);
-unlock_fail:
- f2fs_unlock_op(sbi);
- f2fs_put_page(page, 1);
fail:
+ f2fs_put_page(page, 1);
f2fs_write_failed(mapping, pos + len);
return err;
}
@@ -1971,6 +1596,7 @@ static int f2fs_write_end(struct file *file,
}
f2fs_put_page(page, 1);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
@@ -1979,9 +1605,6 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
{
unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
- if (iov_iter_rw(iter) == READ)
- return 0;
-
if (offset & blocksize_mask)
return -EINVAL;
@@ -2001,24 +1624,27 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
int err;
/* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return 0;
- if (check_direct_IO(inode, iter, offset))
- return 0;
+ err = check_direct_IO(inode, iter, offset);
+ if (err)
+ return err;
trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == WRITE)
- __allocate_data_blocks(inode, offset, count);
+ if (iov_iter_rw(iter) == WRITE) {
+ err = __allocate_data_blocks(inode, offset, count);
+ if (err)
+ goto out;
+ }
- err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block);
+ err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
+out:
if (err < 0 && iov_iter_rw(iter) == WRITE)
f2fs_write_failed(mapping, offset + count);
@@ -2045,6 +1671,11 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
else
inode_dec_dirty_pages(inode);
}
+
+ /* This is atomic written page, keep Private */
+ if (IS_ATOMIC_WRITTEN_PAGE(page))
+ return;
+
ClearPagePrivate(page);
}
@@ -2054,6 +1685,10 @@ int f2fs_release_page(struct page *page, gfp_t wait)
if (PageDirty(page))
return 0;
+ /* This is atomic written page, keep Private */
+ if (IS_ATOMIC_WRITTEN_PAGE(page))
+ return 0;
+
ClearPagePrivate(page);
return 1;
}
@@ -2068,8 +1703,15 @@ static int f2fs_set_data_page_dirty(struct page *page)
SetPageUptodate(page);
if (f2fs_is_atomic_file(inode)) {
- register_inmem_page(inode, page);
- return 1;
+ if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
+ register_inmem_page(inode, page);
+ return 1;
+ }
+ /*
+ * Previously, this page has been registered, we just
+ * return here.
+ */
+ return 0;
}
if (!PageDirty(page)) {
@@ -2084,44 +1726,14 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- /* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- int err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
- return generic_block_bmap(mapping, block, get_data_block);
-}
-
-void init_extent_cache_info(struct f2fs_sb_info *sbi)
-{
- INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
- init_rwsem(&sbi->extent_tree_lock);
- INIT_LIST_HEAD(&sbi->extent_list);
- spin_lock_init(&sbi->extent_lock);
- sbi->total_ext_tree = 0;
- atomic_set(&sbi->total_ext_node, 0);
-}
+ if (f2fs_has_inline_data(inode))
+ return 0;
-int __init create_extent_cache(void)
-{
- extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
- sizeof(struct extent_tree));
- if (!extent_tree_slab)
- return -ENOMEM;
- extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
- sizeof(struct extent_node));
- if (!extent_node_slab) {
- kmem_cache_destroy(extent_tree_slab);
- return -ENOMEM;
- }
- return 0;
-}
+ /* make sure allocating whole blocks */
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ filemap_write_and_wait(mapping);
-void destroy_extent_cache(void)
-{
- kmem_cache_destroy(extent_node_slab);
- kmem_cache_destroy(extent_tree_slab);
+ return generic_block_bmap(mapping, block, get_data_block_bmap);
}
const struct address_space_operations f2fs_dblock_aops = {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 75176e0..4fb6ef8 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,14 +33,20 @@ static void update_general_status(struct f2fs_sb_info *sbi)
int i;
/* validation check of the segment numbers */
- si->hit_ext = sbi->read_hit_ext;
- si->total_ext = sbi->total_hit_ext;
- si->ext_tree = sbi->total_ext_tree;
+ si->hit_largest = atomic64_read(&sbi->read_hit_largest);
+ si->hit_cached = atomic64_read(&sbi->read_hit_cached);
+ si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
+ si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
+ si->total_ext = atomic64_read(&sbi->total_hit_ext);
+ si->ext_tree = atomic_read(&sbi->total_ext_tree);
+ si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
si->ext_node = atomic_read(&sbi->total_ext_node);
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
- si->ndirty_dirs = sbi->n_dirty_dirs;
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+ si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
+ si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
@@ -49,6 +55,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->valid_count = valid_user_blocks(sbi);
si->valid_node_count = valid_node_count(sbi);
si->valid_inode_count = valid_inode_count(sbi);
+ si->inline_xattr = atomic_read(&sbi->inline_xattr);
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
si->utilization = utilization(sbi);
@@ -101,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+ blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
@@ -114,7 +121,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
}
}
dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
- si->bimodal = div_u64(bimodal, dist);
+ si->bimodal = div64_u64(bimodal, dist);
if (si->dirty_count)
si->avg_vblocks = div_u64(total_vblocks, ndirty);
else
@@ -185,18 +192,18 @@ get_cache:
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
- si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
for (i = 0; i <= UPDATE_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
- si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+ si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree);
si->cache_mem += atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node);
si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -207,12 +214,10 @@ static int stat_show(struct seq_file *s, void *v)
mutex_lock(&f2fs_stat_mutex);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
- char devname[BDEVNAME_SIZE];
-
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
- bdevname(si->sbi->sb->s_bdev, devname), i++);
+ seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n",
+ si->sbi->sb->s_bdev, i++);
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -226,6 +231,8 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "Other: %u)\n - Data: %u\n",
si->valid_node_count - si->valid_inode_count,
si->valid_count - si->valid_node_count);
+ seq_printf(s, " - Inline_xattr Inode: %u\n",
+ si->inline_xattr);
seq_printf(s, " - Inline_data Inode: %u\n",
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
@@ -263,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
- seq_printf(s, "CP calls: %d\n", si->cp_count);
+ seq_printf(s, "CP calls: %d (BG: %d)\n",
+ si->cp_count, si->bg_cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
@@ -276,10 +284,16 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_data_blks);
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
- seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
- si->hit_ext, si->total_ext);
- seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
- seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
+ seq_puts(s, "\nExtent Cache:\n");
+ seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
+ si->hit_largest, si->hit_cached,
+ si->hit_rbtree);
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
+ !si->total_ext ? 0 :
+ div64_u64(si->hit_total * 100, si->total_ext),
+ si->hit_total, si->total_ext);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - inmem: %4d, wb: %4d\n",
si->inmem_pages, si->wb_pages);
@@ -287,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d\n",
si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - datas: %4d in files:%4d\n",
+ si->ndirty_data, si->ndirty_files);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
@@ -321,13 +337,13 @@ static int stat_show(struct seq_file *s, void *v)
/* memory footprint */
update_mem_info(si->sbi);
- seq_printf(s, "\nMemory: %u KB\n",
+ seq_printf(s, "\nMemory: %llu KB\n",
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
- seq_printf(s, " - static: %u KB\n",
+ seq_printf(s, " - static: %llu KB\n",
si->base_mem >> 10);
- seq_printf(s, " - cached: %u KB\n",
+ seq_printf(s, " - cached: %llu KB\n",
si->cache_mem >> 10);
- seq_printf(s, " - paged : %u KB\n",
+ seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
mutex_unlock(&f2fs_stat_mutex);
@@ -366,6 +382,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
+ atomic64_set(&sbi->total_hit_ext, 0);
+ atomic64_set(&sbi->read_hit_rbtree, 0);
+ atomic64_set(&sbi->read_hit_largest, 0);
+ atomic64_set(&sbi->read_hit_cached, 0);
+
+ atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
atomic_set(&sbi->inplace_count, 0);
@@ -388,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kfree(si);
}
-void __init f2fs_create_root_stats(void)
+int __init f2fs_create_root_stats(void)
{
struct dentry *file;
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
if (!f2fs_debugfs_root)
- return;
+ return -ENOMEM;
file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
NULL, &stat_fops);
if (!file) {
debugfs_remove(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
+ return -ENOMEM;
}
+
+ return 0;
}
void f2fs_destroy_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a34ebd8..faa7495 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
namehash = f2fs_dentry_hash(&name);
- f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
-
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
goto out;
max_depth = F2FS_I(dir)->i_current_depth;
+ if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
+ f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING,
+ "Corrupted max_depth of %lu: %u",
+ dir->i_ino, max_depth);
+ max_depth = MAX_DIR_HASH_DEPTH;
+ F2FS_I(dir)->i_current_depth = max_depth;
+ mark_inode_dirty(dir);
+ }
for (level = 0; level < max_depth; level++) {
de = find_in_level(dir, level, &fname, res_page);
@@ -258,7 +264,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
if (f2fs_has_inline_dentry(dir))
return f2fs_parent_inline_dir(dir, p);
- page = get_lock_data_page(dir, 0);
+ page = get_lock_data_page(dir, 0, false);
if (IS_ERR(page))
return NULL;
@@ -444,7 +450,7 @@ error:
/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
truncate_inode_pages(&inode->i_data, 0);
truncate_blocks(inode, 0, false);
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
}
@@ -630,6 +636,7 @@ fail:
f2fs_put_page(dentry_page, 1);
out:
f2fs_fname_free_filename(&fname);
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
}
@@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
fail:
up_write(&F2FS_I(inode)->i_sem);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
@@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+
if (f2fs_has_inline_dentry(dir))
return f2fs_delete_inline_entry(dentry, page, dir, inode);
@@ -718,8 +728,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (inode)
f2fs_drop_nlink(dir, inode, NULL);
- if (bit_pos == NR_DENTRY_IN_BLOCK) {
- truncate_hole(dir, page->index, page->index + 1);
+ if (bit_pos == NR_DENTRY_IN_BLOCK &&
+ !truncate_hole(dir, page->index, page->index + 1)) {
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
@@ -740,7 +750,7 @@ bool f2fs_empty_dir(struct inode *dir)
return f2fs_empty_inline_dir(dir);
for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = get_lock_data_page(dir, bidx);
+ dentry_page = get_lock_data_page(dir, bidx, false);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT)
continue;
@@ -787,7 +797,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
else
d_type = DT_UNKNOWN;
- /* encrypted case */
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -795,12 +804,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
int save_len = fstr->len;
int ret;
+ de_name.name = kmalloc(de_name.len, GFP_NOFS);
+ if (!de_name.name)
+ return false;
+
+ memcpy(de_name.name, d->filename[bit_pos], de_name.len);
+
ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
&de_name, fstr);
- de_name = *fstr;
- fstr->len = save_len;
+ kfree(de_name.name);
if (ret < 0)
return true;
+
+ de_name = *fstr;
+ fstr->len = save_len;
}
if (!dir_emit(ctx, de_name.name, de_name.len,
@@ -847,26 +864,28 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
for (; n < npages; n++) {
- dentry_page = get_lock_data_page(inode, n);
- if (IS_ERR(dentry_page))
- continue;
+ dentry_page = get_lock_data_page(inode, n, false);
+ if (IS_ERR(dentry_page)) {
+ err = PTR_ERR(dentry_page);
+ if (err == -ENOENT)
+ continue;
+ else
+ goto out;
+ }
dentry_blk = kmap(dentry_page);
make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr))
- goto stop;
+ if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ break;
+ }
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
- dentry_page = NULL;
- }
-stop:
- if (dentry_page && !IS_ERR(dentry_page)) {
- kunmap(dentry_page);
- f2fs_put_page(dentry_page, 1);
}
out:
f2fs_fname_crypto_free_buffer(&fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
new file mode 100644
index 0000000..ccd5c63
--- /dev/null
+++ b/fs/f2fs/extent_cache.c
@@ -0,0 +1,762 @@
+/*
+ * f2fs extent cache support
+ *
+ * Copyright (c) 2015 Motorola Mobility
+ * Copyright (c) 2015 Samsung Electronics
+ * Authors: Jaegeuk Kim <jaegeuk@kernel.org>
+ * Chao Yu <chao2.yu@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include <trace/events/f2fs.h>
+
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei,
+ struct rb_node *parent, struct rb_node **p)
+{
+ struct extent_node *en;
+
+ en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->ei = *ei;
+ INIT_LIST_HEAD(&en->list);
+
+ rb_link_node(&en->rb_node, parent, p);
+ rb_insert_color(&en->rb_node, &et->root);
+ atomic_inc(&et->node_cnt);
+ atomic_inc(&sbi->total_ext_node);
+ return en;
+}
+
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ rb_erase(&en->rb_node, &et->root);
+ atomic_dec(&et->node_cnt);
+ atomic_dec(&sbi->total_ext_node);
+
+ if (et->cached_en == en)
+ et->cached_en = NULL;
+}
+
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ nid_t ino = inode->i_ino;
+
+ down_write(&sbi->extent_tree_lock);
+ et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ if (!et) {
+ et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+ f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+ memset(et, 0, sizeof(struct extent_tree));
+ et->ino = ino;
+ et->root = RB_ROOT;
+ et->cached_en = NULL;
+ rwlock_init(&et->lock);
+ INIT_LIST_HEAD(&et->list);
+ atomic_set(&et->node_cnt, 0);
+ atomic_inc(&sbi->total_ext_tree);
+ } else {
+ atomic_dec(&sbi->total_zombie_tree);
+ list_del_init(&et->list);
+ }
+ up_write(&sbi->extent_tree_lock);
+
+ /* never died until evict_inode */
+ F2FS_I(inode)->extent_tree = et;
+
+ return et;
+}
+
+static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, unsigned int fofs)
+{
+ struct rb_node *node = et->root.rb_node;
+ struct extent_node *en = et->cached_en;
+
+ if (en) {
+ struct extent_info *cei = &en->ei;
+
+ if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) {
+ stat_inc_cached_node_hit(sbi);
+ return en;
+ }
+ }
+
+ while (node) {
+ en = rb_entry(node, struct extent_node, rb_node);
+
+ if (fofs < en->ei.fofs) {
+ node = node->rb_left;
+ } else if (fofs >= en->ei.fofs + en->ei.len) {
+ node = node->rb_right;
+ } else {
+ stat_inc_rbtree_node_hit(sbi);
+ return en;
+ }
+ }
+ return NULL;
+}
+
+static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei)
+{
+ struct rb_node **p = &et->root.rb_node;
+ struct extent_node *en;
+
+ en = __attach_extent_node(sbi, et, ei, NULL, p);
+ if (!en)
+ return NULL;
+
+ et->largest = en->ei;
+ et->cached_en = en;
+ return en;
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, bool free_all)
+{
+ struct rb_node *node, *next;
+ struct extent_node *en;
+ unsigned int count = atomic_read(&et->node_cnt);
+
+ node = rb_first(&et->root);
+ while (node) {
+ next = rb_next(node);
+ en = rb_entry(node, struct extent_node, rb_node);
+
+ if (free_all) {
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list))
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
+ }
+
+ if (free_all || list_empty(&en->list)) {
+ __detach_extent_node(sbi, et, en);
+ kmem_cache_free(extent_node_slab, en);
+ }
+ node = next;
+ }
+
+ return count - atomic_read(&et->node_cnt);
+}
+
+static void __drop_largest_extent(struct inode *inode,
+ pgoff_t fofs, unsigned int len)
+{
+ struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
+
+ if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs)
+ largest->len = 0;
+}
+
+/* return true, if inode page is changed */
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ struct extent_node *en;
+ struct extent_info ei;
+
+ if (!f2fs_may_extent_tree(inode)) {
+ /* drop largest extent */
+ if (i_ext && i_ext->len) {
+ i_ext->len = 0;
+ return true;
+ }
+ return false;
+ }
+
+ et = __grab_extent_tree(inode);
+
+ if (!i_ext || !i_ext->len)
+ return false;
+
+ set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
+ le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+
+ write_lock(&et->lock);
+ if (atomic_read(&et->node_cnt))
+ goto out;
+
+ en = __init_extent_tree(sbi, et, &ei);
+ if (en) {
+ spin_lock(&sbi->extent_lock);
+ list_add_tail(&en->list, &sbi->extent_list);
+ spin_unlock(&sbi->extent_lock);
+ }
+out:
+ write_unlock(&et->lock);
+ return false;
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_node *en;
+ bool ret = false;
+
+ f2fs_bug_on(sbi, !et);
+
+ trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+ read_lock(&et->lock);
+
+ if (et->largest.fofs <= pgofs &&
+ et->largest.fofs + et->largest.len > pgofs) {
+ *ei = et->largest;
+ ret = true;
+ stat_inc_largest_node_hit(sbi);
+ goto out;
+ }
+
+ en = __lookup_extent_tree(sbi, et, pgofs);
+ if (en) {
+ *ei = en->ei;
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list))
+ list_move_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
+ spin_unlock(&sbi->extent_lock);
+ ret = true;
+ }
+out:
+ stat_inc_total_hit(sbi);
+ read_unlock(&et->lock);
+
+ trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+ return ret;
+}
+
+
+/*
+ * lookup extent at @fofs, if hit, return the extent
+ * if not, return NULL and
+ * @prev_ex: extent before fofs
+ * @next_ex: extent after fofs
+ * @insert_p: insert point for new extent at fofs
+ * in order to simpfy the insertion after.
+ * tree must stay unchanged between lookup and insertion.
+ */
+static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
+ unsigned int fofs,
+ struct extent_node **prev_ex,
+ struct extent_node **next_ex,
+ struct rb_node ***insert_p,
+ struct rb_node **insert_parent)
+{
+ struct rb_node **pnode = &et->root.rb_node;
+ struct rb_node *parent = NULL, *tmp_node;
+ struct extent_node *en = et->cached_en;
+
+ *insert_p = NULL;
+ *insert_parent = NULL;
+ *prev_ex = NULL;
+ *next_ex = NULL;
+
+ if (RB_EMPTY_ROOT(&et->root))
+ return NULL;
+
+ if (en) {
+ struct extent_info *cei = &en->ei;
+
+ if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+ goto lookup_neighbors;
+ }
+
+ while (*pnode) {
+ parent = *pnode;
+ en = rb_entry(*pnode, struct extent_node, rb_node);
+
+ if (fofs < en->ei.fofs)
+ pnode = &(*pnode)->rb_left;
+ else if (fofs >= en->ei.fofs + en->ei.len)
+ pnode = &(*pnode)->rb_right;
+ else
+ goto lookup_neighbors;
+ }
+
+ *insert_p = pnode;
+ *insert_parent = parent;
+
+ en = rb_entry(parent, struct extent_node, rb_node);
+ tmp_node = parent;
+ if (parent && fofs > en->ei.fofs)
+ tmp_node = rb_next(parent);
+ *next_ex = tmp_node ?
+ rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+
+ tmp_node = parent;
+ if (parent && fofs < en->ei.fofs)
+ tmp_node = rb_prev(parent);
+ *prev_ex = tmp_node ?
+ rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+ return NULL;
+
+lookup_neighbors:
+ if (fofs == en->ei.fofs) {
+ /* lookup prev node for merging backward later */
+ tmp_node = rb_prev(&en->rb_node);
+ *prev_ex = tmp_node ?
+ rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+ }
+ if (fofs == en->ei.fofs + en->ei.len - 1) {
+ /* lookup next node for merging frontward later */
+ tmp_node = rb_next(&en->rb_node);
+ *next_ex = tmp_node ?
+ rb_entry(tmp_node, struct extent_node, rb_node) : NULL;
+ }
+ return en;
+}
+
+static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei,
+ struct extent_node **den,
+ struct extent_node *prev_ex,
+ struct extent_node *next_ex)
+{
+ struct extent_node *en = NULL;
+
+ if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
+ prev_ex->ei.len += ei->len;
+ ei = &prev_ex->ei;
+ en = prev_ex;
+ }
+
+ if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
+ if (en) {
+ __detach_extent_node(sbi, et, prev_ex);
+ *den = prev_ex;
+ }
+ next_ex->ei.fofs = ei->fofs;
+ next_ex->ei.blk = ei->blk;
+ next_ex->ei.len += ei->len;
+ en = next_ex;
+ }
+
+ if (en) {
+ __try_update_largest_extent(et, en);
+ et->cached_en = en;
+ }
+ return en;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei,
+ struct rb_node **insert_p,
+ struct rb_node *insert_parent)
+{
+ struct rb_node **p = &et->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_node *en = NULL;
+
+ if (insert_p && insert_parent) {
+ parent = insert_parent;
+ p = insert_p;
+ goto do_insert;
+ }
+
+ while (*p) {
+ parent = *p;
+ en = rb_entry(parent, struct extent_node, rb_node);
+
+ if (ei->fofs < en->ei.fofs)
+ p = &(*p)->rb_left;
+ else if (ei->fofs >= en->ei.fofs + en->ei.len)
+ p = &(*p)->rb_right;
+ else
+ f2fs_bug_on(sbi, 1);
+ }
+do_insert:
+ en = __attach_extent_node(sbi, et, ei, parent, p);
+ if (!en)
+ return NULL;
+
+ __try_update_largest_extent(et, en);
+ et->cached_en = en;
+ return en;
+}
+
+static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_node *en = NULL, *en1 = NULL;
+ struct extent_node *prev_en = NULL, *next_en = NULL;
+ struct extent_info ei, dei, prev;
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ unsigned int end = fofs + len;
+ unsigned int pos = (unsigned int)fofs;
+
+ if (!et)
+ return false;
+
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+
+ write_lock(&et->lock);
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
+ write_unlock(&et->lock);
+ return false;
+ }
+
+ prev = et->largest;
+ dei.len = 0;
+
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(inode, fofs, len);
+
+ /* 1. lookup first extent node in range [fofs, fofs + len - 1] */
+ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
+ &insert_p, &insert_parent);
+ if (!en)
+ en = next_en;
+
+ /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */
+ while (en && en->ei.fofs < end) {
+ unsigned int org_end;
+ int parts = 0; /* # of parts current extent split into */
+
+ next_en = en1 = NULL;
+
+ dei = en->ei;
+ org_end = dei.fofs + dei.len;
+ f2fs_bug_on(sbi, pos >= org_end);
+
+ if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+ en->ei.len = pos - en->ei.fofs;
+ prev_en = en;
+ parts = 1;
+ }
+
+ if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (parts) {
+ set_extent_info(&ei, end,
+ end - dei.fofs + dei.blk,
+ org_end - end);
+ en1 = __insert_extent_tree(sbi, et, &ei,
+ NULL, NULL);
+ next_en = en1;
+ } else {
+ en->ei.fofs = end;
+ en->ei.blk += end - dei.fofs;
+ en->ei.len -= end - dei.fofs;
+ next_en = en;
+ }
+ parts++;
+ }
+
+ if (!next_en) {
+ struct rb_node *node = rb_next(&en->rb_node);
+
+ next_en = node ?
+ rb_entry(node, struct extent_node, rb_node)
+ : NULL;
+ }
+
+ if (parts)
+ __try_update_largest_extent(et, en);
+ else
+ __detach_extent_node(sbi, et, en);
+
+ /*
+ * if original extent is split into zero or two parts, extent
+ * tree has been altered by deletion or insertion, therefore
+ * invalidate pointers regard to tree.
+ */
+ if (parts != 1) {
+ insert_p = NULL;
+ insert_parent = NULL;
+ }
+
+ /* update in global extent list */
+ spin_lock(&sbi->extent_lock);
+ if (!parts && !list_empty(&en->list))
+ list_del(&en->list);
+ if (en1)
+ list_add_tail(&en1->list, &sbi->extent_list);
+ spin_unlock(&sbi->extent_lock);
+
+ /* release extent node */
+ if (!parts)
+ kmem_cache_free(extent_node_slab, en);
+
+ en = next_en;
+ }
+
+ /* 3. update extent in extent cache */
+ if (blkaddr) {
+ struct extent_node *den = NULL;
+
+ set_extent_info(&ei, fofs, blkaddr, len);
+ en1 = __try_merge_extent_node(sbi, et, &ei, &den,
+ prev_en, next_en);
+ if (!en1)
+ en1 = __insert_extent_tree(sbi, et, &ei,
+ insert_p, insert_parent);
+
+ /* give up extent_cache, if split and small updates happen */
+ if (dei.len >= 1 &&
+ prev.len < F2FS_MIN_EXTENT_LEN &&
+ et->largest.len < F2FS_MIN_EXTENT_LEN) {
+ et->largest.len = 0;
+ set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
+ }
+
+ spin_lock(&sbi->extent_lock);
+ if (en1) {
+ if (list_empty(&en1->list))
+ list_add_tail(&en1->list, &sbi->extent_list);
+ else
+ list_move_tail(&en1->list, &sbi->extent_list);
+ }
+ if (den && !list_empty(&den->list))
+ list_del(&den->list);
+ spin_unlock(&sbi->extent_lock);
+
+ if (den)
+ kmem_cache_free(extent_node_slab, den);
+ }
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ __free_extent_tree(sbi, et, true);
+
+ write_unlock(&et->lock);
+
+ return !__is_extent_same(&prev, &et->largest);
+}
+
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+ struct extent_tree *et, *next;
+ struct extent_node *en, *tmp;
+ unsigned long ino = F2FS_ROOT_INO(sbi);
+ unsigned int found;
+ unsigned int node_cnt = 0, tree_cnt = 0;
+ int remained;
+ bool do_free = false;
+
+ if (!test_opt(sbi, EXTENT_CACHE))
+ return 0;
+
+ if (!atomic_read(&sbi->total_zombie_tree))
+ goto free_node;
+
+ if (!down_write_trylock(&sbi->extent_tree_lock))
+ goto out;
+
+ /* 1. remove unreferenced extent tree */
+ list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ if (atomic_read(&et->node_cnt)) {
+ write_lock(&et->lock);
+ node_cnt += __free_extent_tree(sbi, et, true);
+ write_unlock(&et->lock);
+ }
+
+ list_del_init(&et->list);
+ radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ kmem_cache_free(extent_tree_slab, et);
+ atomic_dec(&sbi->total_ext_tree);
+ atomic_dec(&sbi->total_zombie_tree);
+ tree_cnt++;
+
+ if (node_cnt + tree_cnt >= nr_shrink)
+ goto unlock_out;
+ }
+ up_write(&sbi->extent_tree_lock);
+
+free_node:
+ /* 2. remove LRU extent entries */
+ if (!down_write_trylock(&sbi->extent_tree_lock))
+ goto out;
+
+ remained = nr_shrink - (node_cnt + tree_cnt);
+
+ spin_lock(&sbi->extent_lock);
+ list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
+ if (!remained--)
+ break;
+ list_del_init(&en->list);
+ do_free = true;
+ }
+ spin_unlock(&sbi->extent_lock);
+
+ if (do_free == false)
+ goto unlock_out;
+
+ /*
+ * reset ino for searching victims from beginning of global extent tree.
+ */
+ ino = F2FS_ROOT_INO(sbi);
+
+ while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
+ (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+ unsigned i;
+
+ ino = treevec[found - 1]->ino + 1;
+ for (i = 0; i < found; i++) {
+ struct extent_tree *et = treevec[i];
+
+ if (!atomic_read(&et->node_cnt))
+ continue;
+
+ if (write_trylock(&et->lock)) {
+ node_cnt += __free_extent_tree(sbi, et, false);
+ write_unlock(&et->lock);
+ }
+
+ if (node_cnt + tree_cnt >= nr_shrink)
+ goto unlock_out;
+ }
+ }
+unlock_out:
+ up_write(&sbi->extent_tree_lock);
+out:
+ trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+
+ return node_cnt + tree_cnt;
+}
+
+unsigned int f2fs_destroy_extent_node(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ unsigned int node_cnt = 0;
+
+ if (!et || !atomic_read(&et->node_cnt))
+ return 0;
+
+ write_lock(&et->lock);
+ node_cnt = __free_extent_tree(sbi, et, true);
+ write_unlock(&et->lock);
+
+ return node_cnt;
+}
+
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ unsigned int node_cnt = 0;
+
+ if (!et)
+ return;
+
+ if (inode->i_nlink && !is_bad_inode(inode) &&
+ atomic_read(&et->node_cnt)) {
+ down_write(&sbi->extent_tree_lock);
+ list_add_tail(&et->list, &sbi->zombie_list);
+ atomic_inc(&sbi->total_zombie_tree);
+ up_write(&sbi->extent_tree_lock);
+ return;
+ }
+
+ /* free all extent info belong to this extent tree */
+ node_cnt = f2fs_destroy_extent_node(inode);
+
+ /* delete extent tree entry in radix tree */
+ down_write(&sbi->extent_tree_lock);
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
+ radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+ kmem_cache_free(extent_tree_slab, et);
+ atomic_dec(&sbi->total_ext_tree);
+ up_write(&sbi->extent_tree_lock);
+
+ F2FS_I(inode)->extent_tree = NULL;
+
+ trace_f2fs_destroy_extent_tree(inode, node_cnt);
+}
+
+bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!f2fs_may_extent_tree(inode))
+ return false;
+
+ return f2fs_lookup_extent_tree(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+ pgoff_t fofs;
+
+ if (!f2fs_may_extent_tree(dn->inode))
+ return;
+
+ f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
+
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
+
+ if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1))
+ sync_inode_page(dn);
+}
+
+void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+
+{
+ if (!f2fs_may_extent_tree(dn->inode))
+ return;
+
+ if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len))
+ sync_inode_page(dn);
+}
+
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+ INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+ init_rwsem(&sbi->extent_tree_lock);
+ INIT_LIST_HEAD(&sbi->extent_list);
+ spin_lock_init(&sbi->extent_lock);
+ atomic_set(&sbi->total_ext_tree, 0);
+ INIT_LIST_HEAD(&sbi->zombie_list);
+ atomic_set(&sbi->total_zombie_tree, 0);
+ atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+ extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+ sizeof(struct extent_tree));
+ if (!extent_tree_slab)
+ return -ENOMEM;
+ extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+ sizeof(struct extent_node));
+ if (!extent_node_slab) {
+ kmem_cache_destroy(extent_tree_slab);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void destroy_extent_cache(void)
+{
+ kmem_cache_destroy(extent_node_slab);
+ kmem_cache_destroy(extent_tree_slab);
+}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a8327ed..ff79054 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -19,6 +19,9 @@
#include <linux/magic.h>
#include <linux/kobject.h>
#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
@@ -51,6 +54,8 @@
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
+#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -121,6 +126,8 @@ enum {
(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
+#define DEF_CP_INTERVAL 60 /* 60 secs */
+#define DEF_IDLE_INTERVAL 120 /* 2 mins */
struct cp_control {
int reason;
@@ -154,13 +161,7 @@ struct ino_entry {
nid_t ino; /* inode number */
};
-/*
- * for the list of directory inodes or gc inodes.
- * NOTE: there are two slab users for this structure, if we add/modify/delete
- * fields in structure for one of slab users, it may affect fields or size of
- * other one, in this condition, it's better to split both of slab and related
- * data structure.
- */
+/* for the list of inodes to be GCed */
struct inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
@@ -228,6 +229,9 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
+#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
+#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
#define F2FS_IOC_SET_ENCRYPTION_POLICY \
_IOR('f', 19, struct f2fs_encryption_policy)
@@ -244,15 +248,22 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
+#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
-#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
#endif
+struct f2fs_defragment {
+ u64 start;
+ u64 len;
+};
+
/*
* For INODE and NODE manager
*/
@@ -320,7 +331,7 @@ enum {
*/
};
-#define F2FS_LINK_MAX 32000 /* maximum link count per file */
+#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
@@ -349,9 +360,10 @@ struct extent_tree {
nid_t ino; /* inode number */
struct rb_root root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
+ struct extent_info largest; /* largested extent info */
+ struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
- atomic_t refcount; /* reference count of rb-tree */
- unsigned int count; /* # of extent node in rb-tree*/
+ atomic_t node_cnt; /* # of extent node in rb-tree*/
};
/*
@@ -372,6 +384,12 @@ struct f2fs_map_blocks {
unsigned int m_flags;
};
+/* for flag in get_data_block */
+#define F2FS_GET_BLOCK_READ 0
+#define F2FS_GET_BLOCK_DIO 1
+#define F2FS_GET_BLOCK_FIEMAP 2
+#define F2FS_GET_BLOCK_BMAP 3
+
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
*/
@@ -420,14 +438,13 @@ struct f2fs_inode_info {
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
- struct extent_info ext; /* in-memory extent cache entry */
- rwlock_t ext_lock; /* rwlock for single extent cache */
- struct inode_entry *dirty_dir; /* the pointer of dirty dir */
- struct radix_tree_root inmem_root; /* radix tree for inmem pages */
+ struct list_head dirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
+ struct extent_tree *extent_tree; /* cached extent_tree entry */
+
#ifdef CONFIG_F2FS_FS_ENCRYPTION
/* Encryption params */
struct f2fs_crypt_info *i_crypt_info;
@@ -484,12 +501,20 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
+static inline void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
+{
+ if (en->ei.len > et->largest.len)
+ et->largest = en->ei;
+}
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* maximum available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
+ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -523,6 +548,7 @@ struct dnode_of_data {
nid_t nid; /* node id of the direct node block */
unsigned int ofs_in_node; /* data offset in the node page */
bool inode_page_locked; /* inode page is locked or not */
+ bool node_changed; /* is node block changed */
block_t data_blkaddr; /* block address of the node block */
};
@@ -626,6 +652,7 @@ struct f2fs_sm_info {
enum count_type {
F2FS_WRITEBACK,
F2FS_DIRTY_DENTS,
+ F2FS_DIRTY_DATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
@@ -674,6 +701,12 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+enum inode_type {
+ DIR_INODE, /* for dirty dir inode */
+ FILE_INODE, /* for dirty regular/symlink inode */
+ NR_INODE_TYPE,
+};
+
/* for inner inode cache management */
struct inode_management {
struct radix_tree_root ino_root; /* ino entry array */
@@ -690,11 +723,17 @@ enum {
SBI_POR_DOING, /* recovery is doing or not */
};
+enum {
+ CP_TIME,
+ REQ_TIME,
+ MAX_TIME,
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
- struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
+ int valid_super_block; /* valid super block no */
int s_flag; /* flags for sbi */
/* for node-related operations */
@@ -716,22 +755,26 @@ struct f2fs_sb_info {
struct rw_semaphore node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
wait_queue_head_t cp_wait;
+ unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
+ long interval_time[MAX_TIME]; /* to store thresholds */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
/* for orphan inode, use 0'th array */
unsigned int max_orphans; /* max orphan inodes */
- /* for directory inode management */
- struct list_head dir_inode_list; /* dir inode list */
- spinlock_t dir_inode_lock; /* for dir inode list lock */
+ /* for inode management */
+ struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */
+ spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */
/* for extent tree cache */
struct radix_tree_root extent_tree_root;/* cache extent cache entries */
struct rw_semaphore extent_tree_lock; /* locking extent radix tree */
struct list_head extent_list; /* lru list for shrinker */
spinlock_t extent_lock; /* locking extent lru list */
- int total_ext_tree; /* extent tree count */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
atomic_t total_ext_node; /* extent info count */
/* basic filesystem units */
@@ -749,6 +792,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
unsigned int total_valid_inode_count; /* valid inode count */
+ loff_t max_file_blocks; /* max block index of file */
int active_logs; /* # of active logs */
int dir_level; /* directory level */
@@ -779,11 +823,15 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
- int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
+ atomic64_t total_hit_ext; /* # of lookup extent cache */
+ atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
+ atomic64_t read_hit_largest; /* # of hit largest extent node */
+ atomic64_t read_hit_cached; /* # of hit cached extent node */
+ atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
int bg_gc; /* background gc calls */
- unsigned int n_dirty_dirs; /* # of dir inodes */
+ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
@@ -791,8 +839,38 @@ struct f2fs_sb_info {
/* For sysfs suppport */
struct kobject s_kobj;
struct completion s_kobj_unregister;
+
+ /* For shrinker support */
+ struct list_head s_list;
+ struct mutex umount_mutex;
+ unsigned int shrinker_run_no;
};
+static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
+{
+ sbi->last_time[type] = jiffies;
+}
+
+static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
+{
+ struct timespec ts = {sbi->interval_time[type], 0};
+ unsigned long interval = timespec_to_jiffies(&ts);
+
+ return time_after(jiffies, sbi->last_time[type] + interval);
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct request_list *rl = &q->root_rl;
+
+ if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
+ return 0;
+
+ return f2fs_time_over(sbi, REQ_TIME);
+}
+
/*
* Inline functions
*/
@@ -1028,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_pages(struct inode *inode)
{
atomic_inc(&F2FS_I(inode)->dirty_pages);
- if (S_ISDIR(inode->i_mode))
- inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1039,13 +1117,13 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_dec_dirty_pages(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
return;
atomic_dec(&F2FS_I(inode)->dirty_pages);
-
- if (S_ISDIR(inode->i_mode))
- dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1060,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode)
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
- unsigned int pages_per_sec = sbi->segs_per_sec *
- (1 << sbi->log_blocks_per_seg);
+ unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
return ((get_pages(sbi, block_type) + pages_per_sec - 1)
>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
}
@@ -1202,6 +1279,24 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
return sbi->total_valid_inode_count;
}
+static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
+ pgoff_t index, bool for_write)
+{
+ if (!for_write)
+ return grab_cache_page(mapping, index);
+ return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+}
+
+static inline void f2fs_copy_page(struct page *src, struct page *dst)
+{
+ char *src_kaddr = kmap(src);
+ char *dst_kaddr = kmap(dst);
+
+ memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
+ kunmap(dst);
+ kunmap(src);
+}
+
static inline void f2fs_put_page(struct page *page, int unlock)
{
if (!page)
@@ -1234,16 +1329,24 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
gfp_t flags)
{
void *entry;
-retry:
- entry = kmem_cache_alloc(cachep, flags);
- if (!entry) {
- cond_resched();
- goto retry;
- }
+ entry = kmem_cache_alloc(cachep, flags);
+ if (!entry)
+ entry = kmem_cache_alloc(cachep, flags | __GFP_NOFAIL);
return entry;
}
+static inline struct bio *f2fs_bio_alloc(int npages)
+{
+ struct bio *bio;
+
+ /* No failure on bio allocation */
+ bio = bio_alloc(GFP_NOIO, npages);
+ if (!bio)
+ bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
+ return bio;
+}
+
static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
unsigned long index, void *item)
{
@@ -1342,6 +1445,7 @@ enum {
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_FREE_NID, /* free allocated nide */
FI_UPDATE_DIR, /* should update inode block for consistency */
FI_DELAY_IPUT, /* used for the recovery */
FI_NO_EXTENT, /* not to use the extent cache */
@@ -1357,6 +1461,8 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1541,6 +1647,35 @@ static inline bool is_dot_dotdot(const struct qstr *str)
return false;
}
+static inline bool f2fs_may_extent_tree(struct inode *inode)
+{
+ if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+ is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ return false;
+
+ return S_ISREG(inode->i_mode);
+}
+
+static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kmalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
+ return ret;
+}
+
+static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kzalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+ return ret;
+}
+
#define get_inode_mode(i) \
((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1557,7 +1692,7 @@ static inline bool is_dot_dotdot(const struct qstr *str)
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
int truncate_blocks(struct inode *, u64, bool);
-void f2fs_truncate(struct inode *);
+int f2fs_truncate(struct inode *, bool);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
int truncate_hole(struct inode *, pgoff_t, pgoff_t);
@@ -1571,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
int try_to_free_nats(struct f2fs_sb_info *, int);
-void update_inode(struct inode *, struct page *);
-void update_inode_page(struct inode *);
+int update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
void handle_failed_inode(struct inode *);
@@ -1649,7 +1784,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int truncate_xattr_node(struct inode *, struct page *);
int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
-void remove_inode_page(struct inode *);
+int remove_inode_page(struct inode *);
struct page *new_inode_page(struct inode *);
struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
@@ -1660,6 +1795,7 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
+int try_to_free_nids(struct f2fs_sb_info *, int);
void recover_inline_xattr(struct inode *, struct page *);
void recover_xattr_data(struct inode *, struct page *, block_t);
int recover_inode_page(struct f2fs_sb_info *, struct page *);
@@ -1675,17 +1811,18 @@ void destroy_node_manager_caches(void);
* segment.c
*/
void register_inmem_page(struct inode *, struct page *);
-void commit_inmem_pages(struct inode *, bool);
-void f2fs_balance_fs(struct f2fs_sb_info *);
+int commit_inmem_pages(struct inode *, bool);
+void f2fs_balance_fs(struct f2fs_sb_info *, bool);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
int create_flush_cmd_control(struct f2fs_sb_info *);
void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
+bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
void release_discard_addrs(struct f2fs_sb_info *);
-void discard_next_dnode(struct f2fs_sb_info *, block_t);
+bool discard_next_dnode(struct f2fs_sb_info *, block_t);
int npages_for_summary_flush(struct f2fs_sb_info *, bool);
void allocate_new_segments(struct f2fs_sb_info *);
int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
@@ -1700,6 +1837,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1715,25 +1853,26 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
-int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void release_dirty_inode(struct f2fs_sb_info *);
+void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void release_ino_entry(struct f2fs_sb_info *);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
-void recover_orphan_inodes(struct f2fs_sb_info *);
+int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void update_dirty_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
-void remove_dirty_dir_inode(struct inode *);
-void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
+void remove_dirty_inode(struct inode *);
+int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
+int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
void init_ino_entry_info(struct f2fs_sb_info *);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
@@ -1746,21 +1885,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
void set_data_blkaddr(struct dnode_of_data *);
int reserve_new_block(struct dnode_of_data *);
+int f2fs_get_block(struct dnode_of_data *, pgoff_t);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
-void f2fs_destroy_extent_tree(struct inode *);
-void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
-void f2fs_update_extent_cache(struct dnode_of_data *);
-void f2fs_preserve_extent_tree(struct inode *);
-struct page *get_read_data_page(struct inode *, pgoff_t, int);
+struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
-struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct f2fs_io_info *);
+int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
-void init_extent_cache_info(struct f2fs_sb_info *);
-int __init create_extent_cache(void);
-void destroy_extent_cache(void);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
@@ -1770,7 +1903,7 @@ int f2fs_release_page(struct page *, gfp_t);
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
-int f2fs_gc(struct f2fs_sb_info *);
+int f2fs_gc(struct f2fs_sb_info *, bool);
void build_gc_manager(struct f2fs_sb_info *);
/*
@@ -1788,17 +1921,21 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- int hit_ext, total_ext, ext_tree, ext_node;
- int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+ unsigned long long hit_largest, hit_cached, hit_rbtree;
+ unsigned long long hit_total, total_ext;
+ int ext_tree, zombie_tree, ext_node;
+ int ndirty_node, ndirty_meta;
+ int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
- int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
+ int bg_gc, inmem_pages, wb_pages;
+ int inline_xattr, inline_inode, inline_dir;
unsigned int valid_count, valid_node_count, valid_inode_count;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages;
- int prefree_count, call_count, cp_count;
+ int prefree_count, call_count, cp_count, bg_cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
@@ -1810,7 +1947,7 @@ struct f2fs_stat_info {
unsigned int segment_count[2];
unsigned int block_count[2];
unsigned int inplace_count;
- unsigned base_mem, cache_mem, page_mem;
+ unsigned long long base_mem, cache_mem, page_mem;
};
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1819,12 +1956,25 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
}
#define stat_inc_cp_count(si) ((si)->cp_count++)
+#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
-#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
-#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
-#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
-#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
+#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
+#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
+#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
+#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
+#define stat_inc_inline_xattr(inode) \
+ do { \
+ if (f2fs_has_inline_xattr(inode)) \
+ (atomic_inc(&F2FS_I_SB(inode)->inline_xattr)); \
+ } while (0)
+#define stat_dec_inline_xattr(inode) \
+ do { \
+ if (f2fs_has_inline_xattr(inode)) \
+ (atomic_dec(&F2FS_I_SB(inode)->inline_xattr)); \
+ } while (0)
#define stat_inc_inline_inode(inode) \
do { \
if (f2fs_has_inline_data(inode)) \
@@ -1885,16 +2035,21 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *);
void f2fs_destroy_stats(struct f2fs_sb_info *);
-void __init f2fs_create_root_stats(void);
+int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si)
+#define stat_inc_bg_cp_count(si)
#define stat_inc_call_count(si)
#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_dir(sbi)
-#define stat_dec_dirty_dir(sbi)
+#define stat_inc_dirty_inode(sbi, type)
+#define stat_dec_dirty_inode(sbi, type)
#define stat_inc_total_hit(sb)
-#define stat_inc_read_hit(sb)
+#define stat_inc_rbtree_node_hit(sb)
+#define stat_inc_largest_node_hit(sbi)
+#define stat_inc_cached_node_hit(sbi)
+#define stat_inc_inline_xattr(inode)
+#define stat_dec_inline_xattr(inode)
#define stat_inc_inline_inode(inode)
#define stat_dec_inline_inode(inode)
#define stat_inc_inline_dir(inode)
@@ -1909,7 +2064,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline void __init f2fs_create_root_stats(void) { }
+static inline int __init f2fs_create_root_stats(void) { return 0; }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -1948,6 +2103,31 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
bool f2fs_empty_inline_dir(struct inode *);
int f2fs_read_inline_dir(struct file *, struct dir_context *,
struct f2fs_str *);
+int f2fs_inline_data_fiemap(struct inode *,
+ struct fiemap_extent_info *, __u64, __u64);
+
+/*
+ * shrinker.c
+ */
+unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *);
+unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *);
+void f2fs_join_shrinker(struct f2fs_sb_info *);
+void f2fs_leave_shrinker(struct f2fs_sb_info *);
+
+/*
+ * extent_cache.c
+ */
+unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+unsigned int f2fs_destroy_extent_node(struct inode *);
+void f2fs_destroy_extent_tree(struct inode *);
+bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
+void f2fs_update_extent_cache(struct dnode_of_data *);
+void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t, block_t, unsigned int);
+void init_extent_cache_info(struct f2fs_sb_info *);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
/*
* crypto support
@@ -1989,7 +2169,7 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
static inline bool f2fs_may_encrypt(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#else
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b0f38c3..18ddb1e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -27,6 +27,7 @@
#include "segment.h"
#include "xattr.h"
#include "acl.h"
+#include "gc.h"
#include "trace.h"
#include <trace/events/f2fs.h>
@@ -39,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct dnode_of_data dn;
int err;
- f2fs_balance_fs(sbi);
-
sb_start_pagefault(inode->i_sb);
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -56,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+
file_update_time(vma->vm_file);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
@@ -73,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
goto mapped;
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+ if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+ i_size_read(inode)) {
unsigned offset;
offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
@@ -85,8 +87,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
mapped:
/* fill the page */
f2fs_wait_on_page_writeback(page, DATA);
+
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
+ /* if gced page is attached, don't write to cold segment */
+ clear_cold_data(page);
out:
sb_end_pagefault(inode->i_sb);
+ f2fs_update_time(sbi, REQ_TIME);
return block_page_mkwrite_return(err);
}
@@ -192,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
+ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(fi, FI_NEED_IPU);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
clear_inode_flag(fi, FI_NEED_IPU);
@@ -203,8 +213,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) {
- update_inode_page(inode);
+ if (!datasync) {
+ f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -224,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
go_write:
- /* guarantee free sections for fsync */
- f2fs_balance_fs(sbi);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
@@ -252,8 +259,10 @@ sync_nodes:
sync_node_pages(sbi, ino, &wbc);
/* if cp_error was enabled, we should avoid infinite loop */
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto out;
+ }
if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
@@ -266,12 +275,13 @@ sync_nodes:
goto out;
/* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, ino, APPEND_INO);
+ remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, ino, UPDATE_INO);
+ remove_ino_entry(sbi, ino, UPDATE_INO);
clear_inode_flag(fi, FI_UPDATE_WRITE);
ret = f2fs_issue_flush(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
f2fs_trace_ios(NULL, 1);
@@ -340,7 +350,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
- for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
if (err && err != -ENOENT) {
@@ -409,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ int err;
if (f2fs_encrypted_inode(inode)) {
- int err = f2fs_get_encryption_info(inode);
+ err = f2fs_get_encryption_info(inode);
if (err)
return 0;
}
/* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- int err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
@@ -442,9 +451,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
- int nr_free = 0, ofs = dn->ofs_in_node;
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_node *raw_node;
+ int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
raw_node = F2FS_NODE(dn->node_page);
@@ -457,20 +466,28 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
dn->data_blkaddr = NULL_ADDR;
set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
invalidate_blocks(sbi, blkaddr);
if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
clear_inode_flag(F2FS_I(dn->inode),
FI_FIRST_BLOCK_WRITTEN);
nr_free++;
}
+
if (nr_free) {
+ pgoff_t fofs;
+ /*
+ * once we invalidate valid blkaddr in range [ofs, ofs + count],
+ * we will invalidate all blkaddr in the whole range.
+ */
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
+ F2FS_I(dn->inode)) + ofs;
+ f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
- set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
+ f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
dn->ofs_in_node, nr_free);
return nr_free;
@@ -493,14 +510,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
if (cache_only) {
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (page && PageUptodate(page))
goto truncate_out;
f2fs_put_page(page, 1);
return 0;
}
- page = get_lock_data_page(inode, index);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
return 0;
truncate_out:
@@ -576,24 +593,30 @@ out:
return err;
}
-void f2fs_truncate(struct inode *inode)
+int f2fs_truncate(struct inode *inode, bool lock)
{
+ int err;
+
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
- return;
+ return 0;
trace_f2fs_truncate(inode);
/* we should check inline_data size */
- if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
- if (f2fs_convert_inline_inode(inode))
- return;
+ if (!f2fs_may_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
}
- if (!truncate_blocks(inode, i_size_read(inode), true)) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- mark_inode_dirty(inode);
- }
+ err = truncate_blocks(inode, i_size_read(inode), lock);
+ if (err)
+ return err;
+
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ return 0;
}
int f2fs_getattr(struct vfsmount *mnt,
@@ -653,14 +676,24 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size <= i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
- f2fs_truncate(inode);
- f2fs_balance_fs(F2FS_I_SB(inode));
+ err = f2fs_truncate(inode, true);
+ if (err)
+ return err;
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
} else {
/*
* do not trim all blocks after i_size if target size is
* larger than i_size.
*/
truncate_setsize(inode, attr->ia_size);
+
+ /* should convert inline inode here */
+ if (!f2fs_may_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
}
@@ -692,48 +725,58 @@ const struct inode_operations f2fs_file_inode_operations = {
.fiemap = f2fs_fiemap,
};
-static void fill_zero(struct inode *inode, pgoff_t index,
+static int fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page;
if (!len)
- return;
+ return 0;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
page = get_new_data_page(inode, NULL, index, false);
f2fs_unlock_op(sbi);
- if (!IS_ERR(page)) {
- f2fs_wait_on_page_writeback(page, DATA);
- zero_user(page, start, len);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
- }
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ f2fs_wait_on_page_writeback(page, DATA);
+ zero_user(page, start, len);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ return 0;
}
int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
{
- pgoff_t index;
int err;
- for (index = pg_start; index < pg_end; index++) {
+ while (pg_start < pg_end) {
struct dnode_of_data dn;
+ pgoff_t end_offset, count;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
if (err) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ pg_start++;
continue;
+ }
return err;
}
- if (dn.data_blkaddr != NULL_ADDR)
- truncate_data_blocks_range(&dn, 1);
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
+
+ f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
+
+ truncate_data_blocks_range(&dn, count);
f2fs_put_dnode(&dn);
+
+ pg_start += count;
}
return 0;
}
@@ -742,16 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
pgoff_t pg_start, pg_end;
loff_t off_start, off_end;
- int ret = 0;
-
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
+ int ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -760,24 +798,32 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
if (pg_start == pg_end) {
- fill_zero(inode, pg_start, off_start,
+ ret = fill_zero(inode, pg_start, off_start,
off_end - off_start);
+ if (ret)
+ return ret;
} else {
- if (off_start)
- fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
- if (off_end)
- fill_zero(inode, pg_end, 0, off_end);
+ if (off_start) {
+ ret = fill_zero(inode, pg_start++, off_start,
+ PAGE_CACHE_SIZE - off_start);
+ if (ret)
+ return ret;
+ }
+ if (off_end) {
+ ret = fill_zero(inode, pg_end, 0, off_end);
+ if (ret)
+ return ret;
+ }
if (pg_start < pg_end) {
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
- blk_start = pg_start << PAGE_CACHE_SHIFT;
- blk_end = pg_end << PAGE_CACHE_SHIFT;
+ blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
+ blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -790,82 +836,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+static int __exchange_data_block(struct inode *inode, pgoff_t src,
+ pgoff_t dst, bool full)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
- int ret = 0;
-
- f2fs_lock_op(sbi);
-
- for (; end < nrpages; start++, end++) {
- block_t new_addr, old_addr;
+ block_t new_addr;
+ bool do_replace = false;
+ int ret;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- new_addr = NULL_ADDR;
- } else {
- new_addr = dn.data_blkaddr;
- truncate_data_blocks_range(&dn, 1);
- f2fs_put_dnode(&dn);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ return ret;
+ } else if (ret == -ENOENT) {
+ new_addr = NULL_ADDR;
+ } else {
+ new_addr = dn.data_blkaddr;
+ if (!is_checkpointed_data(sbi, new_addr)) {
+ dn.data_blkaddr = NULL_ADDR;
+ /* do not invalidate this block address */
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ do_replace = true;
}
+ f2fs_put_dnode(&dn);
+ }
- if (new_addr == NULL_ADDR) {
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT)
- goto out;
- else if (ret == -ENOENT)
- continue;
+ if (new_addr == NULL_ADDR)
+ return full ? truncate_hole(inode, dst, dst + 1) : 0;
- if (dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- continue;
- } else {
- truncate_data_blocks_range(&dn, 1);
- }
+ if (do_replace) {
+ struct page *ipage = get_node_page(sbi, inode->i_ino);
+ struct node_info ni;
- f2fs_put_dnode(&dn);
- } else {
- struct page *ipage;
+ if (IS_ERR(ipage)) {
+ ret = PTR_ERR(ipage);
+ goto err_out;
+ }
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto out;
- }
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ ret = f2fs_reserve_block(&dn, dst);
+ if (ret)
+ goto err_out;
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, start);
- if (ret)
- goto out;
+ truncate_data_blocks_range(&dn, 1);
- old_addr = dn.data_blkaddr;
- if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) {
- dn.data_blkaddr = NULL_ADDR;
- f2fs_update_extent_cache(&dn);
- invalidate_blocks(sbi, old_addr);
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
+ ni.version, true);
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *psrc, *pdst;
+
+ psrc = get_lock_data_page(inode, src, true);
+ if (IS_ERR(psrc))
+ return PTR_ERR(psrc);
+ pdst = get_new_data_page(inode, NULL, dst, false);
+ if (IS_ERR(pdst)) {
+ f2fs_put_page(psrc, 1);
+ return PTR_ERR(pdst);
+ }
+ f2fs_copy_page(psrc, pdst);
+ set_page_dirty(pdst);
+ f2fs_put_page(pdst, 1);
+ f2fs_put_page(psrc, 1);
- dn.data_blkaddr = new_addr;
- set_data_blkaddr(&dn);
- } else if (new_addr != NEW_ADDR) {
- struct node_info ni;
+ return truncate_hole(inode, src, src + 1);
+ }
+ return 0;
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, old_addr, new_addr,
- ni.version, true);
- }
+err_out:
+ if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
+ dn.data_blkaddr = new_addr;
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ f2fs_put_dnode(&dn);
+ }
+ return ret;
+}
- f2fs_put_dnode(&dn);
- }
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int ret = 0;
+
+ for (; end < nrpages; start++, end++) {
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+ ret = __exchange_data_block(inode, end, start, true);
+ f2fs_unlock_op(sbi);
+ if (ret)
+ break;
}
- ret = 0;
-out:
- f2fs_unlock_op(sbi);
return ret;
}
@@ -875,9 +939,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
loff_t new_size;
int ret;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
if (offset + len >= i_size_read(inode))
return -EINVAL;
@@ -885,6 +946,10 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
return -EINVAL;
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
pg_start = offset >> PAGE_CACHE_SHIFT;
pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
@@ -899,7 +964,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
new_size = i_size_read(inode) - len;
+ truncate_pagecache(inode, new_size);
ret = truncate_blocks(inode, new_size, true);
if (!ret)
@@ -918,20 +988,13 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
loff_t off_start, off_end;
int ret = 0;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
- f2fs_balance_fs(sbi);
-
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
if (ret)
@@ -946,16 +1009,23 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
if (pg_start == pg_end) {
- fill_zero(inode, pg_start, off_start, off_end - off_start);
+ ret = fill_zero(inode, pg_start, off_start,
+ off_end - off_start);
+ if (ret)
+ return ret;
+
if (offset + len > new_size)
new_size = offset + len;
new_size = max_t(loff_t, new_size, offset + len);
} else {
if (off_start) {
- fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
+ ret = fill_zero(inode, pg_start++, off_start,
+ PAGE_CACHE_SIZE - off_start);
+ if (ret)
+ return ret;
+
new_size = max_t(loff_t, new_size,
- pg_start << PAGE_CACHE_SHIFT);
+ (loff_t)pg_start << PAGE_CACHE_SHIFT);
}
for (index = pg_start; index < pg_end; index++) {
@@ -991,11 +1061,14 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
new_size = max_t(loff_t, new_size,
- (index + 1) << PAGE_CACHE_SHIFT);
+ (loff_t)(index + 1) << PAGE_CACHE_SHIFT);
}
if (off_end) {
- fill_zero(inode, pg_end, 0, off_end);
+ ret = fill_zero(inode, pg_end, 0, off_end);
+ if (ret)
+ goto out;
+
new_size = max_t(loff_t, new_size, offset + len);
}
}
@@ -1015,10 +1088,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t pg_start, pg_end, delta, nrpages, idx;
loff_t new_size;
- int ret;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
+ int ret = 0;
new_size = i_size_read(inode) + len;
if (new_size > inode->i_sb->s_maxbytes)
@@ -1031,7 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
return -EINVAL;
- f2fs_balance_fs(sbi);
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(sbi, true);
ret = truncate_blocks(inode, i_size_read(inode), true);
if (ret)
@@ -1050,57 +1124,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
- struct dnode_of_data dn;
- struct page *ipage;
- block_t new_addr, old_addr;
-
f2fs_lock_op(sbi);
-
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- goto next;
- } else if (dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- goto next;
- } else {
- new_addr = dn.data_blkaddr;
- truncate_data_blocks_range(&dn, 1);
- f2fs_put_dnode(&dn);
- }
-
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto out;
- }
-
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, idx + delta);
- if (ret)
- goto out;
-
- old_addr = dn.data_blkaddr;
- f2fs_bug_on(sbi, old_addr != NEW_ADDR);
-
- if (new_addr != NEW_ADDR) {
- struct node_info ni;
-
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, old_addr, new_addr,
- ni.version, true);
- }
- f2fs_put_dnode(&dn);
-next:
+ ret = __exchange_data_block(inode, idx, idx + delta, false);
f2fs_unlock_op(sbi);
+ if (ret)
+ break;
}
- i_size_write(inode, new_size);
- return 0;
-out:
- f2fs_unlock_op(sbi);
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
+ if (!ret)
+ i_size_write(inode, new_size);
return ret;
}
@@ -1113,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t off_start, off_end;
int ret = 0;
- f2fs_balance_fs(sbi);
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(sbi, true);
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -1147,9 +1181,10 @@ noalloc:
if (pg_start == pg_end)
new_size = offset + len;
else if (index == pg_start && off_start)
- new_size = (index + 1) << PAGE_CACHE_SHIFT;
+ new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
else if (index == pg_end)
- new_size = (index << PAGE_CACHE_SHIFT) + off_end;
+ new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+ off_end;
else
new_size += PAGE_CACHE_SIZE;
}
@@ -1171,6 +1206,10 @@ static long f2fs_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
long ret = 0;
+ /* f2fs only support ->fallocate for regular file */
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
if (f2fs_encrypted_inode(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1200,6 +1239,7 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
out:
@@ -1302,18 +1342,22 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!inode_owner_or_capable(inode))
return -EACCES;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
if (f2fs_is_atomic_file(inode))
return 0;
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return f2fs_convert_inline_inode(inode);
+ return 0;
}
static int f2fs_ioc_commit_atomic_write(struct file *filp)
@@ -1333,10 +1377,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- commit_inmem_pages(inode, false);
+ ret = commit_inmem_pages(inode, false);
+ if (ret) {
+ set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ goto err_out;
+ }
}
- ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
+ ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+err_out:
mnt_drop_write_file(filp);
return ret;
}
@@ -1344,6 +1393,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
static int f2fs_ioc_start_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ int ret;
if (!inode_owner_or_capable(inode))
return -EACCES;
@@ -1351,9 +1401,13 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (f2fs_is_volatile_file(inode))
return 0;
- set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
- return f2fs_convert_inline_inode(inode);
+ set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ return 0;
}
static int f2fs_ioc_release_volatile_write(struct file *filp)
@@ -1369,8 +1423,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
if (!f2fs_is_first_block_written(inode))
return truncate_partial_data_page(inode, 0, true);
- punch_hole(inode, 0, F2FS_BLKSIZE);
- return 0;
+ return punch_hole(inode, 0, F2FS_BLKSIZE);
}
static int f2fs_ioc_abort_volatile_write(struct file *filp)
@@ -1385,17 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (ret)
return ret;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- commit_inmem_pages(inode, false);
+ commit_inmem_pages(inode, true);
}
-
- if (f2fs_is_volatile_file(inode))
+ if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ }
mnt_drop_write_file(filp);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
}
@@ -1428,9 +1481,14 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
case F2FS_GOING_DOWN_NOSYNC:
f2fs_stop_checkpoint(sbi);
break;
+ case F2FS_GOING_DOWN_METAFLUSH:
+ sync_meta_pages(sbi, META, LONG_MAX);
+ f2fs_stop_checkpoint(sbi);
+ break;
default:
return -EINVAL;
}
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
}
@@ -1461,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return 0;
}
@@ -1484,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
sizeof(policy)))
return -EFAULT;
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return f2fs_process_policy(&policy, inode);
#else
return -EOPNOTSUPP;
@@ -1530,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
err = f2fs_commit_super(sbi, false);
-
- mnt_drop_write_file(filp);
if (err) {
/* undo new data */
memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+ mnt_drop_write_file(filp);
return err;
}
+ mnt_drop_write_file(filp);
got_it:
if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
16))
@@ -1544,6 +1604,234 @@ got_it:
return 0;
}
+static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ __u32 sync;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(sync, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ if (!sync) {
+ if (!mutex_trylock(&sbi->gc_mutex))
+ return -EBUSY;
+ } else {
+ mutex_lock(&sbi->gc_mutex);
+ }
+
+ return f2fs_gc(sbi, sync);
+}
+
+static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ return f2fs_sync_fs(sbi->sb, 1);
+}
+
+static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
+ struct file *filp,
+ struct f2fs_defragment *range)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_map_blocks map;
+ struct extent_info ei;
+ pgoff_t pg_start, pg_end;
+ unsigned int blk_per_seg = sbi->blocks_per_seg;
+ unsigned int total = 0, sec_num;
+ unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
+ block_t blk_end = 0;
+ bool fragmented = false;
+ int err;
+
+ /* if in-place-update policy is enabled, don't waste time here */
+ if (need_inplace_update(inode))
+ return -EINVAL;
+
+ pg_start = range->start >> PAGE_CACHE_SHIFT;
+ pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
+
+ f2fs_balance_fs(sbi, true);
+
+ mutex_lock(&inode->i_mutex);
+
+ /* writeback all dirty pages in the range */
+ err = filemap_write_and_wait_range(inode->i_mapping, range->start,
+ range->start + range->len - 1);
+ if (err)
+ goto out;
+
+ /*
+ * lookup mapping info in extent cache, skip defragmenting if physical
+ * block addresses are continuous.
+ */
+ if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (ei.fofs + ei.len >= pg_end)
+ goto out;
+ }
+
+ map.m_lblk = pg_start;
+
+ /*
+ * lookup mapping info in dnode page cache, skip defragmenting if all
+ * physical block addresses are continuous even if there are hole(s)
+ * in logical blocks.
+ */
+ while (map.m_lblk < pg_end) {
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ if (blk_end && blk_end != map.m_pblk) {
+ fragmented = true;
+ break;
+ }
+ blk_end = map.m_pblk + map.m_len;
+
+ map.m_lblk += map.m_len;
+ }
+
+ if (!fragmented)
+ goto out;
+
+ map.m_lblk = pg_start;
+ map.m_len = pg_end - pg_start;
+
+ sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+
+ /*
+ * make sure there are enough free section for LFS allocation, this can
+ * avoid defragment running in SSR mode when free section are allocated
+ * intensively
+ */
+ if (has_not_enough_free_secs(sbi, sec_num)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ while (map.m_lblk < pg_end) {
+ pgoff_t idx;
+ int cnt = 0;
+
+do_map:
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto clear_out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+ idx = map.m_lblk;
+ while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+ struct page *page;
+
+ page = get_lock_data_page(inode, idx, true);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto clear_out;
+ }
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ idx++;
+ cnt++;
+ total++;
+ }
+
+ map.m_lblk = idx;
+
+ if (idx < pg_end && cnt < blk_per_seg)
+ goto do_map;
+
+ clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+ err = filemap_fdatawrite(inode->i_mapping);
+ if (err)
+ goto out;
+ }
+clear_out:
+ clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+out:
+ mutex_unlock(&inode->i_mutex);
+ if (!err)
+ range->len = (u64)total << PAGE_CACHE_SHIFT;
+ return err;
+}
+
+static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_defragment range;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
+ goto out;
+ }
+
+ if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
+ sizeof(range))) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* verify alignment of offset & size */
+ if (range.start & (F2FS_BLKSIZE - 1) ||
+ range.len & (F2FS_BLKSIZE - 1)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = f2fs_defragment_range(sbi, filp, &range);
+ f2fs_update_time(sbi, REQ_TIME);
+ if (err < 0)
+ goto out;
+
+ if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
+ sizeof(range)))
+ err = -EFAULT;
+out:
+ mnt_drop_write_file(filp);
+ return err;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1573,6 +1861,12 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_get_encryption_policy(filp, arg);
case F2FS_IOC_GET_ENCRYPTION_PWSALT:
return f2fs_ioc_get_encryption_pwsalt(filp, arg);
+ case F2FS_IOC_GARBAGE_COLLECT:
+ return f2fs_ioc_gc(filp, arg);
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ return f2fs_ioc_write_checkpoint(filp, arg);
+ case F2FS_IOC_DEFRAGMENT:
+ return f2fs_ioc_defragment(filp, arg);
default:
return -ENOTTY;
}
@@ -1600,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC32_SETFLAGS:
cmd = F2FS_IOC_SETFLAGS;
break;
+ case F2FS_IOC32_GETVERSION:
+ cmd = F2FS_IOC_GETVERSION;
+ break;
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+ case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ case F2FS_IOC_SHUTDOWN:
+ case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GARBAGE_COLLECT:
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ case F2FS_IOC_DEFRAGMENT:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 22fb5ef..f610c2a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
-#include <linux/blkdev.h>
#include "f2fs.h"
#include "node.h"
@@ -78,9 +77,12 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
wait_ms = gc_th->no_gc_sleep_time;
+ trace_f2fs_background_gc(sbi->sb, wait_ms,
+ prefree_segments(sbi), free_segments(sbi));
+
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
@@ -170,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
- return 1 << sbi->log_blocks_per_seg;
+ return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+ return sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -257,6 +259,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
unsigned int secno, max_cost;
+ unsigned int last_segment = MAIN_SEGS(sbi);
int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
@@ -267,6 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
+ if (p.max_search == 0)
+ goto out;
+
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -277,9 +283,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
- if (segno >= MAIN_SEGS(sbi)) {
+ segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
+ if (segno >= last_segment) {
if (sbi->last_victim[p.gc_mode]) {
+ last_segment = sbi->last_victim[p.gc_mode];
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
@@ -327,6 +334,7 @@ got_it:
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
}
+out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -391,23 +399,27 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
-static void gc_node_segment(struct f2fs_sb_info *sbi,
+static int gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
bool initial = true;
struct f2fs_summary *entry;
+ block_t start_addr;
int off;
+ start_addr = START_BLOCK(sbi, segno);
+
next_step:
entry = sum;
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
+ struct node_info ni;
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
- return;
+ return 0;
if (check_valid_map(sbi, segno, off) == 0)
continue;
@@ -426,6 +438,12 @@ next_step:
continue;
}
+ get_node_info(sbi, nid, &ni);
+ if (ni.blk_addr != start_addr + off) {
+ f2fs_put_page(node_page, 1);
+ continue;
+ }
+
/* set page dirty and write it */
if (gc_type == FG_GC) {
f2fs_wait_on_page_writeback(node_page, NODE);
@@ -451,13 +469,11 @@ next_step:
};
sync_node_pages(sbi, 0, &wbc);
- /*
- * In the case of FG_GC, it'd be better to reclaim this victim
- * completely.
- */
- if (get_valid_blocks(sbi, segno, 1) != 0)
- goto next_step;
+ /* return 1 only if FG_GC succefully reclaimed one */
+ if (get_valid_blocks(sbi, segno, 1) == 0)
+ return 1;
}
+ return 0;
}
/*
@@ -487,7 +503,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
}
-static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
struct page *node_page;
@@ -500,13 +516,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
- return 0;
+ return false;
get_node_info(sbi, nid, dni);
if (sum->version != dni->version) {
f2fs_put_page(node_page, 1);
- return 0;
+ return false;
}
*nofs = ofs_of_node(node_page);
@@ -514,8 +530,8 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
- return 0;
- return 1;
+ return false;
+ return true;
}
static void move_encrypted_block(struct inode *inode, block_t bidx)
@@ -533,7 +549,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
int err;
/* do not read out */
- page = grab_cache_page(inode->i_mapping, bidx);
+ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
return;
@@ -542,8 +558,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
if (err)
goto out;
- if (unlikely(dn.data_blkaddr == NULL_ADDR))
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+ ClearPageUptodate(page);
goto put_out;
+ }
+
+ /*
+ * don't cache encrypted data into meta inode until previous dirty
+ * data were writebacked to avoid racing between GC and flush.
+ */
+ f2fs_wait_on_page_writeback(page, DATA);
get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
@@ -552,7 +576,10 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
fio.page = page;
fio.blk_addr = dn.data_blkaddr;
- fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr);
+ fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi),
+ fio.blk_addr,
+ FGP_LOCK|FGP_CREAT,
+ GFP_NOFS);
if (!fio.encrypted_page)
goto put_out;
@@ -569,7 +596,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
goto put_page_out;
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
@@ -600,7 +627,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
{
struct page *page;
- page = get_lock_data_page(inode, bidx);
+ page = get_lock_data_page(inode, bidx, true);
if (IS_ERR(page))
return;
@@ -636,7 +663,7 @@ out:
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
-static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
@@ -659,7 +686,7 @@ next_step:
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
- return;
+ return 0;
if (check_valid_map(sbi, segno, off) == 0)
continue;
@@ -670,7 +697,7 @@ next_step:
}
/* Get an inode by ino with checking validity */
- if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
+ if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
continue;
if (phase == 1) {
@@ -694,7 +721,7 @@ next_step:
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
data_page = get_read_data_page(inode,
- start_bidx + ofs_in_node, READA);
+ start_bidx + ofs_in_node, READA, true);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -724,15 +751,11 @@ next_step:
if (gc_type == FG_GC) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
- /*
- * In the case of FG_GC, it'd be better to reclaim this victim
- * completely.
- */
- if (get_valid_blocks(sbi, segno, 1) != 0) {
- phase = 2;
- goto next_step;
- }
+ /* return 1 only if FG_GC succefully reclaimed one */
+ if (get_valid_blocks(sbi, segno, 1) == 0)
+ return 1;
}
+ return 0;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -748,12 +771,13 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
return ret;
}
-static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
struct blk_plug plug;
+ int nfree = 0;
/* read segment summary of victim */
sum_page = get_sum_page(sbi, segno);
@@ -773,10 +797,11 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
switch (GET_SUM_TYPE((&sum->footer))) {
case SUM_TYPE_NODE:
- gc_node_segment(sbi, sum->entries, segno, gc_type);
+ nfree = gc_node_segment(sbi, sum->entries, segno, gc_type);
break;
case SUM_TYPE_DATA:
- gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type);
+ nfree = gc_data_segment(sbi, sum->entries, gc_list,
+ segno, gc_type);
break;
}
blk_finish_plug(&plug);
@@ -785,14 +810,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
stat_inc_call_count(sbi->stat_info);
f2fs_put_page(sum_page, 0);
+ return nfree;
}
-int f2fs_gc(struct f2fs_sb_info *sbi)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
unsigned int segno, i;
- int gc_type = BG_GC;
- int nfree = 0;
- int ret = -1;
+ int gc_type = sync ? FG_GC : BG_GC;
+ int sec_freed = 0;
+ int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
@@ -801,43 +827,60 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
cpc.reason = __get_cp_reason(sbi);
gc_more:
+ segno = NULL_SEGNO;
+
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto stop;
+ }
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
gc_type = FG_GC;
- write_checkpoint(sbi, &cpc);
+ if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
+ write_checkpoint(sbi, &cpc);
}
- if (!__get_victim(sbi, &segno, gc_type))
+ if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
goto stop;
ret = 0;
/* readahead multi ssa blocks those have contiguous address */
if (sbi->segs_per_sec > 1)
ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
- META_SSA);
-
- for (i = 0; i < sbi->segs_per_sec; i++)
- do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
+ META_SSA, true);
- if (gc_type == FG_GC) {
- sbi->cur_victim_sec = NULL_SEGNO;
- nfree++;
- WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
+ for (i = 0; i < sbi->segs_per_sec; i++) {
+ /*
+ * for FG_GC case, halt gcing left segments once failed one
+ * of segments in selected section to avoid long latency.
+ */
+ if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
+ gc_type == FG_GC)
+ break;
}
- if (has_not_enough_free_secs(sbi, nfree))
- goto gc_more;
+ if (i == sbi->segs_per_sec && gc_type == FG_GC)
+ sec_freed++;
if (gc_type == FG_GC)
- write_checkpoint(sbi, &cpc);
+ sbi->cur_victim_sec = NULL_SEGNO;
+
+ if (!sync) {
+ if (has_not_enough_free_secs(sbi, sec_freed))
+ goto gc_more;
+
+ if (gc_type == FG_GC)
+ write_checkpoint(sbi, &cpc);
+ }
stop:
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
+
+ if (sync)
+ ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index b4a65be..a993967 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
return true;
return false;
}
-
-static inline int is_idle(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
- return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
-}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index a13ffcc..c3f0b7d 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -12,12 +12,10 @@
#include <linux/f2fs_fs.h>
#include "f2fs.h"
+#include "node.h"
bool f2fs_may_inline_data(struct inode *inode)
{
- if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
- return false;
-
if (f2fs_is_atomic_file(inode))
return false;
@@ -176,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode)
struct page *ipage, *page;
int err = 0;
+ if (!f2fs_has_inline_data(inode))
+ return 0;
+
page = grab_cache_page(inode->i_mapping, 0);
if (!page)
return -ENOMEM;
@@ -198,6 +199,9 @@ out:
f2fs_unlock_op(sbi);
f2fs_put_page(page, 1);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
return err;
}
@@ -274,12 +278,14 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- truncate_inline_inode(ipage, 0);
+ if (!truncate_inline_inode(ipage, 0))
+ return false;
f2fs_clear_inline_inode(inode);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- truncate_blocks(inode, 0, false);
+ if (truncate_blocks(inode, 0, false))
+ return false;
goto process_inline;
}
return false;
@@ -360,6 +366,10 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
return 0;
}
+/*
+ * NOTE: ipage is grabbed by caller, but if any error occurs, we should
+ * release ipage in this function.
+ */
static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
struct f2fs_inline_dentry *inline_dentry)
{
@@ -369,8 +379,10 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
int err;
page = grab_cache_page(dir->i_mapping, 0);
- if (!page)
+ if (!page) {
+ f2fs_put_page(ipage, 1);
return -ENOMEM;
+ }
set_new_dnode(&dn, dir, ipage, NULL, 0);
err = f2fs_reserve_block(&dn, 0);
@@ -378,13 +390,21 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
goto out;
f2fs_wait_on_page_writeback(page, DATA);
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
dentry_blk = kmap_atomic(page);
/* copy data from inline dentry block to new dentry block */
memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
INLINE_DENTRY_BITMAP_SIZE);
+ memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0,
+ SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE);
+ /*
+ * we do not need to zero out remainder part of dentry and filename
+ * field, since we have used bitmap for marking the usage status of
+ * them, besides, we can also ignore copying/zeroing reserved space
+ * of dentry block, because them haven't been used so far.
+ */
memcpy(dentry_blk->dentry, inline_dentry->dentry,
sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
memcpy(dentry_blk->filename, inline_dentry->filename,
@@ -434,8 +454,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
slots, NR_INLINE_DENTRY);
if (bit_pos >= NR_INLINE_DENTRY) {
err = f2fs_convert_inline_dir(dir, ipage, dentry_blk);
- if (!err)
- err = -EAGAIN;
+ if (err)
+ return err;
+ err = -EAGAIN;
goto out;
}
@@ -553,3 +574,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
f2fs_put_page(ipage, 1);
return 0;
}
+
+int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+{
+ __u64 byteaddr, ilen;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
+ struct node_info ni;
+ struct page *ipage;
+ int err = 0;
+
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ if (!f2fs_has_inline_data(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+ if (start >= ilen)
+ goto out;
+ if (start + len < ilen)
+ ilen = start + len;
+ ilen -= start;
+
+ get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+ byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+ err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+out:
+ f2fs_put_page(ipage, 1);
+ return err;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2550868..2adeff2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -12,7 +12,6 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
-#include <linux/bitops.h>
#include "f2fs.h"
#include "node.h"
@@ -34,8 +33,8 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- set_mask_bits(&inode->i_flags,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -139,7 +138,8 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_cache(inode, &ri->i_ext);
+ if (f2fs_init_extent_tree(inode, &ri->i_ext))
+ set_page_dirty(node_page);
get_inline_info(fi, ri);
@@ -155,6 +155,7 @@ static int do_read_inode(struct inode *inode)
f2fs_put_page(node_page, 1);
+ stat_inc_inline_xattr(inode);
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
@@ -202,6 +203,7 @@ make_now:
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -221,7 +223,7 @@ bad_inode:
return ERR_PTR(ret);
}
-void update_inode(struct inode *inode, struct page *node_page)
+int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
@@ -237,10 +239,11 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
- read_lock(&F2FS_I(inode)->ext_lock);
- set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
- read_unlock(&F2FS_I(inode)->ext_lock);
-
+ if (F2FS_I(inode)->extent_tree)
+ set_raw_extent(&F2FS_I(inode)->extent_tree->largest,
+ &ri->i_ext);
+ else
+ memset(&ri->i_ext, 0, sizeof(ri->i_ext));
set_raw_inline(F2FS_I(inode), ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -258,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page)
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
- set_page_dirty(node_page);
-
clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+
+ return set_page_dirty(node_page);
}
-void update_inode_page(struct inode *inode)
+int update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
+ int ret = 0;
retry:
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
@@ -277,10 +281,11 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi);
}
- return;
+ return 0;
}
- update_inode(inode, node_page);
+ ret = update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
+ return ret;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -295,16 +300,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
/*
- * We need to lock here to prevent from producing dirty node pages
+ * We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- f2fs_lock_op(sbi);
- update_inode_page(inode);
- f2fs_unlock_op(sbi);
-
- if (wbc)
- f2fs_balance_fs(sbi);
-
+ if (update_inode_page(inode))
+ f2fs_balance_fs(sbi, true);
return 0;
}
@@ -314,7 +314,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ nid_t xnid = fi->i_xattr_nid;
+ int err = 0;
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
@@ -328,43 +330,64 @@ void f2fs_evict_inode(struct inode *inode)
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
+
+ f2fs_destroy_extent_tree(inode);
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
sb_start_intwrite(inode->i_sb);
- set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+ set_inode_flag(fi, FI_NO_ALLOC);
i_size_write(inode, 0);
if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode);
+ err = f2fs_truncate(inode, true);
- f2fs_lock_op(sbi);
- remove_inode_page(inode);
- f2fs_unlock_op(sbi);
+ if (!err) {
+ f2fs_lock_op(sbi);
+ err = remove_inode_page(inode);
+ f2fs_unlock_op(sbi);
+ }
sb_end_intwrite(inode->i_sb);
no_delete:
+ stat_dec_inline_xattr(inode);
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
- /* update extent info in inode */
- if (inode->i_nlink)
- f2fs_preserve_extent_tree(inode);
- f2fs_destroy_extent_tree(inode);
-
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE))
- add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
- add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ if (is_inode_flag_set(fi, FI_APPEND_WRITE))
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ if (is_inode_flag_set(fi, FI_FREE_NID)) {
+ if (err && err != -ENOENT)
+ alloc_nid_done(sbi, inode->i_ino);
+ else
+ alloc_nid_failed(sbi, inode->i_ino);
+ clear_inode_flag(fi, FI_FREE_NID);
+ }
+
+ if (err && err != -ENOENT) {
+ if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) {
+ /*
+ * get here because we failed to release resource
+ * of inode previously, reminder our user to run fsck
+ * for fixing.
+ */
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "inode (ino:%lu) resource leak, run fsck "
+ "to fix this issue!", inode->i_ino);
+ }
+ }
out_clear:
#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (F2FS_I(inode)->i_crypt_info)
- f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info);
+ if (fi->i_crypt_info)
+ f2fs_free_encryption_info(inode, fi->i_crypt_info);
#endif
clear_inode(inode);
}
@@ -373,6 +396,7 @@ out_clear:
void handle_failed_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int err = 0;
clear_nlink(inode);
make_bad_inode(inode);
@@ -380,13 +404,29 @@ void handle_failed_inode(struct inode *inode)
i_size_write(inode, 0);
if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode);
+ err = f2fs_truncate(inode, false);
+
+ if (!err)
+ err = remove_inode_page(inode);
- remove_inode_page(inode);
+ /*
+ * if we skip truncate_node in remove_inode_page bacause we failed
+ * before, it's better to find another way to release resource of
+ * this inode (e.g. valid block count, node block or nid). Here we
+ * choose to add this inode to orphan list, so that we can call iput
+ * for releasing in orphan recovery flow.
+ *
+ * Note: we should add inode to orphan list before f2fs_unlock_op()
+ * so we can prevent losing this orphan when encoutering checkpoint
+ * and following suddenly power-off.
+ */
+ if (err && err != -ENOENT) {
+ err = acquire_orphan_inode(sbi);
+ if (!err)
+ add_orphan_inode(sbi, inode->i_ino);
+ }
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
- alloc_nid_failed(sbi, inode->i_ino);
+ set_inode_flag(F2FS_I(inode), FI_FREE_NID);
f2fs_unlock_op(sbi);
/* iput will drop the inode object */
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index fdbae21..6f944e5 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -53,18 +53,21 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (err) {
err = -EINVAL;
nid_free = true;
- goto out;
+ goto fail;
}
/* If the directory encrypted, then we should encrypt the inode. */
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
- if (f2fs_may_inline_data(inode))
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+ f2fs_init_extent_tree(inode, NULL);
+
+ stat_inc_inline_xattr(inode);
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
@@ -72,15 +75,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
mark_inode_dirty(inode);
return inode;
-out:
- clear_nlink(inode);
- unlock_new_inode(inode);
fail:
trace_f2fs_new_inode(inode, err);
make_bad_inode(inode);
- iput(inode);
if (nid_free)
- alloc_nid_failed(sbi, ino);
+ set_inode_flag(F2FS_I(inode), FI_FREE_NID);
+ iput(inode);
return ERR_PTR(err);
}
@@ -89,7 +89,14 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
size_t slen = strlen(s);
size_t sublen = strlen(sub);
- if (sublen > slen)
+ /*
+ * filename format of multimedia file should be defined as:
+ * "filename + '.' + extension".
+ */
+ if (slen < sublen + 2)
+ return 0;
+
+ if (s[slen - sublen - 1] != '.')
return 0;
return !strncasecmp(s + slen - sublen, sub, sublen);
@@ -121,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
nid_t ino = 0;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -135,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -165,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
!f2fs_is_child_context_consistent_with_parent(dir, inode))
return -EPERM;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
@@ -207,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
struct page *page;
int err = 0;
+ if (f2fs_readonly(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "skip recovering inline_dots inode (ino:%lu, pino:%u) "
+ "in readonly mountpoint", dir->i_ino, pino);
+ return 0;
+ }
+
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
de = f2fs_find_entry(dir, &dot, &page);
@@ -281,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
int err = -ENOENT;
trace_f2fs_unlink_enter(dir, dentry);
- f2fs_balance_fs(sbi);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de)
goto fail;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err) {
@@ -308,12 +325,15 @@ fail:
return err;
}
-static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- const char *link = page_follow_link_light(dentry, cookie);
+ const char *link = page_get_link(dentry, inode, done);
if (!IS_ERR(link) && !*link) {
/* this is broken symlink case */
- page_put_link(NULL, *cookie);
+ do_delayed_call(done);
+ clear_delayed_call(done);
link = ERR_PTR(-ENOENT);
}
return link;
@@ -334,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
if (len > dir->i_sb->s_blocksize)
return -ENAMETOOLONG;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -344,8 +362,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -403,11 +424,14 @@ err_out:
* If the symlink path is stored into inline_data, there is no
* performance regression.
*/
- if (!err)
+ if (!err) {
filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
- if (IS_DIRSYNC(dir))
- f2fs_sync_fs(sbi->sb, 1);
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+ } else {
+ f2fs_unlink(dir, dentry);
+ }
kfree(sd);
f2fs_fname_crypto_free_buffer(&disk_link);
@@ -423,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -434,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->a_ops = &f2fs_dblock_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+ f2fs_balance_fs(sbi, true);
+
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
@@ -471,11 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -483,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -509,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err;
- if (!whiteout)
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -525,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err)
@@ -597,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
}
- f2fs_balance_fs(sbi);
-
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry)
goto out;
@@ -628,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry)
goto out_whiteout;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
@@ -659,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
update_inode_page(old_inode);
update_inode_page(new_inode);
} else {
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(new_dentry, old_inode);
@@ -756,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode)))
return -EPERM;
- f2fs_balance_fs(sbi);
-
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry)
goto out;
@@ -800,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_new_dir;
}
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
@@ -916,18 +938,22 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
}
#ifdef CONFIG_F2FS_FS_ENCRYPTION
-static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
- struct f2fs_str cstr;
+ struct f2fs_str cstr = FSTR_INIT(NULL, 0);
struct f2fs_str pstr = FSTR_INIT(NULL, 0);
- struct inode *inode = d_inode(dentry);
struct f2fs_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
u32 max_size = inode->i_sb->s_blocksize;
int res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = f2fs_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -935,16 +961,27 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
/* Symlink is encrypted */
sd = (struct f2fs_encrypted_symlink_data *)caddr;
- cstr.name = sd->encrypted_path;
cstr.len = le16_to_cpu(sd->len);
/* this is broken symlink case */
- if (cstr.name[0] == 0 && cstr.len == 0) {
+ if (unlikely(cstr.len == 0)) {
+ res = -ENOENT;
+ goto errout;
+ }
+ cstr.name = kmalloc(cstr.len, GFP_NOFS);
+ if (!cstr.name) {
+ res = -ENOMEM;
+ goto errout;
+ }
+ memcpy(cstr.name, sd->encrypted_path, cstr.len);
+
+ /* this is broken symlink case */
+ if (unlikely(cstr.name[0] == 0)) {
res = -ENOENT;
goto errout;
}
@@ -963,31 +1000,34 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
if (res < 0)
goto errout;
+ kfree(cstr.name);
+
paddr = pstr.name;
/* Null-terminate the name */
paddr[res] = '\0';
- kunmap(cpage);
page_cache_release(cpage);
- return *cookie = paddr;
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
+ kfree(cstr.name);
f2fs_fname_crypto_free_buffer(&pstr);
- kunmap(cpage);
page_cache_release(cpage);
return ERR_PTR(res);
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
.removexattr = generic_removexattr,
+#endif
};
#endif
@@ -1016,8 +1056,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_follow_link,
- .put_link = page_put_link,
+ .get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7dd63b7..342597a 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
- mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+ mem_size = (atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree) +
atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
- if (sbi->sb->s_bdi->wb.dirty_exceeded)
- return false;
+ if (!sbi->sb->s_bdi->wb.dirty_exceeded)
+ return true;
}
return res;
}
@@ -159,7 +160,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
head = radix_tree_lookup(&nm_i->nat_set_root, set);
if (!head) {
- head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
INIT_LIST_HEAD(&head->entry_list);
INIT_LIST_HEAD(&head->set_list);
@@ -246,7 +247,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
{
struct nat_entry *new;
- new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+ new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
@@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
{
struct nat_entry *e;
- down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
e = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&e->ni, ne);
}
- up_write(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -306,6 +305,10 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
unsigned char version = nat_get_version(e);
nat_set_version(e, inc_node_version(version));
+
+ /* in order to reuse the nid */
+ if (nm_i->next_scan_nid > ni->nid)
+ nm_i->next_scan_nid = ni->nid;
}
/* change address */
@@ -328,11 +331,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
+ int nr = nr_shrink;
- if (available_free_memory(sbi, NAT_ENTRIES))
+ if (!down_write_trylock(&nm_i->nat_tree_lock))
return 0;
- down_write(&nm_i->nat_tree_lock);
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
ne = list_first_entry(&nm_i->nat_entries,
@@ -341,7 +344,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
nr_shrink--;
}
up_write(&nm_i->nat_tree_lock);
- return nr_shrink;
+ return nr - nr_shrink;
}
/*
@@ -375,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ down_write(&nm_i->nat_tree_lock);
+
/* Check current segment summary */
mutex_lock(&curseg->curseg_mutex);
i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -395,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
cache:
/* cache nat entry */
cache_nat_entry(NM_I(sbi), nid, &ne);
+ up_write(&nm_i->nat_tree_lock);
}
/*
@@ -672,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
ret = truncate_dnode(&rdn);
if (ret < 0)
goto out_err;
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
}
} else {
child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
@@ -685,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
rdn.nid = child_nid;
ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
if (ret == (NIDS_PER_BLOCK + 1)) {
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
child_nofs += ret;
} else if (ret < 0 && ret != -ENOENT) {
goto out_err;
@@ -746,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
err = truncate_dnode(dn);
if (err < 0)
goto fail;
- set_nid(pages[idx], i, 0, false);
+ if (set_nid(pages[idx], i, 0, false))
+ dn->node_changed = true;
}
if (offset[idx + 1] == 0) {
@@ -898,17 +907,20 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
* Caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
*/
-void remove_inode_page(struct inode *inode)
+int remove_inode_page(struct inode *inode)
{
struct dnode_of_data dn;
+ int err;
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
- return;
+ err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+ if (err)
+ return err;
- if (truncate_xattr_node(inode, dn.inode_page)) {
+ err = truncate_xattr_node(inode, dn.inode_page);
+ if (err) {
f2fs_put_dnode(&dn);
- return;
+ return err;
}
/* remove potential inline_data blocks */
@@ -922,6 +934,7 @@ void remove_inode_page(struct inode *inode)
/* will put inode & node pages */
truncate_node(&dn);
+ return 0;
}
struct page *new_inode_page(struct inode *inode)
@@ -967,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
SetPageUptodate(page);
- set_page_dirty(page);
+ if (set_page_dirty(page))
+ dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
@@ -991,8 +1005,7 @@ fail:
/*
* Caller should do after getting the following values.
* 0: f2fs_put_page(page, 0)
- * LOCKED_PAGE: f2fs_put_page(page, 1)
- * error: nothing
+ * LOCKED_PAGE or error: f2fs_put_page(page, 1)
*/
static int read_node_page(struct page *page, int rw)
{
@@ -1010,7 +1023,6 @@ static int read_node_page(struct page *page, int rw)
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
- f2fs_put_page(page, 1);
return -ENOENT;
}
@@ -1029,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
struct page *apage;
int err;
+ if (!nid)
+ return;
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+
apage = find_get_page(NODE_MAPPING(sbi), nid);
if (apage && PageUptodate(apage)) {
f2fs_put_page(apage, 0);
@@ -1041,106 +1057,101 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
return;
err = read_node_page(apage, READA);
- if (err == 0)
- f2fs_put_page(apage, 0);
- else if (err == LOCKED_PAGE)
- f2fs_put_page(apage, 1);
+ f2fs_put_page(apage, err ? 1 : 0);
}
-struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+/*
+ * readahead MAX_RA_NODE number of node pages.
+ */
+void ra_node_pages(struct page *parent, int start)
{
- struct page *page;
- int err;
-repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct blk_plug plug;
+ int i, end;
+ nid_t nid;
- err = read_node_page(page, READ_SYNC);
- if (err < 0)
- return ERR_PTR(err);
- else if (err != LOCKED_PAGE)
- lock_page(page);
+ blk_start_plug(&plug);
- if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
- ClearPageUptodate(page);
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto repeat;
+ /* Then, try readahead for siblings of the desired node */
+ end = start + MAX_RA_NODE;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ ra_node_page(sbi, nid);
}
- return page;
+
+ blk_finish_plug(&plug);
}
-/*
- * Return a locked page for the desired node page.
- * And, readahead MAX_RA_NODE number of node pages.
- */
-struct page *get_node_page_ra(struct page *parent, int start)
+struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+ struct page *parent, int start)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
- struct blk_plug plug;
struct page *page;
- int err, i, end;
- nid_t nid;
+ int err;
- /* First, try getting the desired direct node. */
- nid = get_nid(parent, start, false);
if (!nid)
return ERR_PTR(-ENOENT);
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
repeat:
page = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!page)
return ERR_PTR(-ENOMEM);
err = read_node_page(page, READ_SYNC);
- if (err < 0)
+ if (err < 0) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
- else if (err == LOCKED_PAGE)
+ } else if (err == LOCKED_PAGE) {
goto page_hit;
-
- blk_start_plug(&plug);
-
- /* Then, try readahead for siblings of the desired node */
- end = start + MAX_RA_NODE;
- end = min(end, NIDS_PER_BLOCK);
- for (i = start + 1; i < end; i++) {
- nid = get_nid(parent, i, false);
- if (!nid)
- continue;
- ra_node_page(sbi, nid);
}
- blk_finish_plug(&plug);
+ if (parent)
+ ra_node_pages(parent, start + 1);
lock_page(page);
+
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
page_hit:
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
+ f2fs_bug_on(sbi, nid != nid_of_node(page));
return page;
}
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ return __get_node_page(sbi, nid, NULL, 0);
+}
+
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ nid_t nid = get_nid(parent, start, false);
+
+ return __get_node_page(sbi, nid, parent, start);
+}
+
void sync_inode_page(struct dnode_of_data *dn)
{
+ int ret = 0;
+
if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
- update_inode(dn->inode, dn->node_page);
+ ret = update_inode(dn->inode, dn->node_page);
} else if (dn->inode_page) {
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
- update_inode(dn->inode, dn->inode_page);
+ ret = update_inode(dn->inode, dn->inode_page);
if (!dn->inode_page_locked)
unlock_page(dn->inode_page);
} else {
- update_inode_page(dn->inode);
+ ret = update_inode_page(dn->inode);
}
+ dn->node_changed = ret ? true: false;
}
int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
@@ -1168,6 +1179,11 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (unlikely(f2fs_cp_error(sbi))) {
+ pagevec_release(&pvec);
+ return -EIO;
+ }
+
/*
* flushing sequence with step:
* 0. indirect nodes
@@ -1316,23 +1332,24 @@ static int f2fs_write_node_page(struct page *page,
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
+ if (wbc->for_reclaim) {
+ if (!down_read_trylock(&sbi->node_write))
+ goto redirty_out;
+ } else {
+ down_read(&sbi->node_write);
+ }
+
get_node_info(sbi, nid, &ni);
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
- if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
- goto redirty_out;
- } else {
- down_read(&sbi->node_write);
- }
-
set_page_writeback(page);
fio.blk_addr = ni.blk_addr;
write_node_page(nid, &fio);
@@ -1341,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page,
up_read(&sbi->node_write);
unlock_page(page);
- if (wbc->for_reclaim)
+ if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, NODE, WRITE);
return 0;
@@ -1432,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
if (build) {
/* do not add allocated nids */
- down_read(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
- if (ne &&
- (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
- up_read(&nm_i->nat_tree_lock);
if (allocated)
return 0;
}
@@ -1521,7 +1535,10 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
return;
/* readahead nat pages to be scanned */
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
+ META_NAT, true);
+
+ down_read(&nm_i->nat_tree_lock);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1533,7 +1550,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
if (unlikely(nid >= nm_i->max_nid))
nid = 0;
- if (i++ == FREE_NID_PAGES)
+ if (++i >= FREE_NID_PAGES)
break;
}
@@ -1551,6 +1568,10 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
remove_free_nid(nm_i, nid);
}
mutex_unlock(&curseg->curseg_mutex);
+ up_read(&nm_i->nat_tree_lock);
+
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
+ nm_i->ra_nid_pages, META_NAT, false);
}
/*
@@ -1636,6 +1657,32 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
kmem_cache_free(free_nid_slab, i);
}
+int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i, *next;
+ int nr = nr_shrink;
+
+ if (!mutex_trylock(&nm_i->build_lock))
+ return 0;
+
+ spin_lock(&nm_i->free_nid_list_lock);
+ list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
+ if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK)
+ break;
+ if (i->state == NID_ALLOC)
+ continue;
+ __del_from_free_nid_list(nm_i, i);
+ kmem_cache_free(free_nid_slab, i);
+ nm_i->fcnt--;
+ nr_shrink--;
+ }
+ spin_unlock(&nm_i->free_nid_list_lock);
+ mutex_unlock(&nm_i->build_lock);
+
+ return nr - nr_shrink;
+}
+
void recover_inline_xattr(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
@@ -1761,10 +1808,10 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
nrpages = min(last_offset - i, bio_blocks);
/* readahead node pages */
- ra_meta_pages(sbi, addr, nrpages, META_POR);
+ ra_meta_pages(sbi, addr, nrpages, META_POR, true);
for (idx = addr; idx < addr + nrpages; idx++) {
- struct page *page = get_meta_page(sbi, idx);
+ struct page *page = get_tmp_page(sbi, idx);
rn = F2FS_NODE(page);
sum_entry->nid = rn->footer.nid;
@@ -1795,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
raw_ne = nat_in_journal(sum, i);
- down_write(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
ne = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&ne->ni, &raw_ne);
}
__set_nat_cache_dirty(nm_i, ne);
- up_write(&nm_i->nat_tree_lock);
}
update_nats_in_cursum(sum, -i);
mutex_unlock(&curseg->curseg_mutex);
@@ -1836,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
struct page *page = NULL;
- struct f2fs_nm_info *nm_i = NM_I(sbi);
/*
* there are two steps to flush nat entries:
@@ -1873,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
raw_ne = &nat_blk->entries[nid - start_nid];
}
raw_nat_from_node_info(raw_ne, &ne->ni);
-
- down_write(&NM_I(sbi)->nat_tree_lock);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- up_write(&NM_I(sbi)->nat_tree_lock);
-
if (nat_get_blkaddr(ne) == NULL_ADDR)
add_free_nid(sbi, nid, false);
}
@@ -1890,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, set->entry_cnt);
- down_write(&nm_i->nat_tree_lock);
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- up_write(&nm_i->nat_tree_lock);
kmem_cache_free(nat_entry_set_slab, set);
}
@@ -1912,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!nm_i->dirty_nat_cnt)
return;
+
+ down_write(&nm_i->nat_tree_lock);
+
/*
* if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
@@ -1920,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
- down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
@@ -1929,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
__adjust_nat_entry_set(setvec[idx], &sets,
MAX_NAT_JENTRIES(sum));
}
- up_write(&nm_i->nat_tree_lock);
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
+ up_write(&nm_i->nat_tree_lock);
+
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
@@ -1958,6 +1999,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+ nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e95..d4d1f63 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -14,9 +14,11 @@
/* node block offset on the NAT area dedicated to the given start node id */
#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
-/* # of pages to perform readahead before building free nids */
+/* # of pages to perform synchronous readahead before building free nids */
#define FREE_NID_PAGES 4
+#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */
+
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
@@ -181,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
(seg_off << sbi->log_blocks_per_seg << 1) +
- (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+ (block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
block_addr += sbi->blocks_per_seg;
@@ -315,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page)
return true;
}
-static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
@@ -325,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
else
rn->in.nid[off] = cpu_to_le32(nid);
- set_page_dirty(p);
+ return set_page_dirty(p);
}
static inline nid_t get_nid(struct page *p, int off, bool i)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 24a8c1d..589b20b 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
+static bool is_same_inode(struct inode *inode, struct page *ipage)
+{
+ struct f2fs_inode *ri = F2FS_INODE(ipage);
+ struct timespec disk;
+
+ if (!IS_INODE(ipage))
+ return true;
+
+ disk.tv_sec = le64_to_cpu(ri->i_ctime);
+ disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+ if (timespec_compare(&inode->i_ctime, &disk) > 0)
+ return false;
+
+ disk.tv_sec = le64_to_cpu(ri->i_atime);
+ disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+ if (timespec_compare(&inode->i_atime, &disk) > 0)
+ return false;
+
+ disk.tv_sec = le64_to_cpu(ri->i_mtime);
+ disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ if (timespec_compare(&inode->i_mtime, &disk) > 0)
+ return false;
+
+ return true;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
@@ -180,7 +206,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- ra_meta_pages(sbi, blkaddr, 1, META_POR);
+ ra_meta_pages(sbi, blkaddr, 1, META_POR, true);
while (1) {
struct fsync_inode_entry *entry;
@@ -188,7 +214,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
- page = get_meta_page(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
@@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (!entry) {
+ if (entry) {
+ if (!is_same_inode(entry->inode, page))
+ goto next;
+ } else {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
@@ -383,15 +412,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
start = start_bidx_of_node(ofs_of_node(page), fi);
end = start + ADDRS_PER_PAGE(page, fi);
- f2fs_lock_op(sbi);
-
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
- if (err) {
- f2fs_unlock_op(sbi);
+ if (err)
goto out;
- }
f2fs_wait_on_page_writeback(dn.node_page, NODE);
@@ -399,14 +424,35 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
- for (; start < end; start++) {
+ for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
src = datablock_addr(dn.node_page, dn.ofs_in_node);
dest = datablock_addr(page, dn.ofs_in_node);
- if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
- is_valid_blkaddr(sbi, dest, META_POR)) {
+ /* skip recovering if dest is the same as src */
+ if (src == dest)
+ continue;
+
+ /* dest is invalid, just invalidate src block */
+ if (dest == NULL_ADDR) {
+ truncate_data_blocks_range(&dn, 1);
+ continue;
+ }
+
+ /*
+ * dest is reserved block, invalidate src block
+ * and then reserve one new block in dnode page.
+ */
+ if (dest == NEW_ADDR) {
+ truncate_data_blocks_range(&dn, 1);
+ err = reserve_new_block(&dn);
+ f2fs_bug_on(sbi, err);
+ continue;
+ }
+
+ /* dest is valid block, try to recover from src to dest */
+ if (is_valid_blkaddr(sbi, dest, META_POR)) {
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
@@ -424,7 +470,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
ni.version, false);
recovered++;
}
- dn.ofs_in_node++;
}
if (IS_INODE(dn.node_page))
@@ -436,7 +481,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
err:
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
"recover_data: ino = %lx, recovered = %d blocks, err = %d",
@@ -444,8 +488,7 @@ out:
return err;
}
-static int recover_data(struct f2fs_sb_info *sbi,
- struct list_head *head, int type)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
@@ -454,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
block_t blkaddr;
/* get node pages in the current segment */
- curseg = CURSEG_I(sbi, type);
+ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (1) {
@@ -465,7 +508,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
ra_meta_pages_cond(sbi, blkaddr);
- page = get_meta_page(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page)) {
f2fs_put_page(page, 1);
@@ -525,14 +568,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&inode_list);
- /* step #1: find fsynced inode numbers */
- set_sbi_flag(sbi, SBI_POR_DOING);
-
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ /* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list);
if (err)
goto out;
@@ -543,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+ err = recover_data(sbi, &inode_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
@@ -552,7 +593,7 @@ out:
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
- MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+ (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
@@ -561,11 +602,20 @@ out:
clear_sbi_flag(sbi, SBI_POR_DOING);
if (err) {
- discard_next_dnode(sbi, blkaddr);
+ bool invalidate = false;
+
+ if (discard_next_dnode(sbi, blkaddr))
+ invalidate = true;
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
sync_meta_pages(sbi, META, LONG_MAX);
+
+ /* invalidate temporary meta page */
+ if (invalidate)
+ invalidate_mapping_pages(META_MAPPING(sbi),
+ blkaddr, blkaddr);
+
set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
@@ -573,7 +623,7 @@ out:
.reason = CP_RECOVERY,
};
mutex_unlock(&sbi->cp_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
} else {
mutex_unlock(&sbi->cp_mutex);
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 61b97f9..5904a41 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,8 +14,8 @@
#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/kthread.h>
-#include <linux/vmalloc.h>
#include <linux/swap.h>
+#include <linux/timer.h>
#include "f2fs.h"
#include "segment.h"
@@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab;
static struct kmem_cache *sit_entry_set_slab;
static struct kmem_cache *inmem_entry_slab;
+static unsigned long __reverse_ulong(unsigned char *str)
+{
+ unsigned long tmp = 0;
+ int shift = 24, idx = 0;
+
+#if BITS_PER_LONG == 64
+ shift = 56;
+#endif
+ while (shift >= 0) {
+ tmp |= (unsigned long)str[idx++] << shift;
+ shift -= BITS_PER_BYTE;
+ }
+ return tmp;
+}
+
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
* MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word)
int num = 0;
#if BITS_PER_LONG == 64
- if ((word & 0xffffffff) == 0) {
+ if ((word & 0xffffffff00000000UL) == 0)
num += 32;
+ else
word >>= 32;
- }
#endif
- if ((word & 0xffff) == 0) {
+ if ((word & 0xffff0000) == 0)
num += 16;
+ else
word >>= 16;
- }
- if ((word & 0xff) == 0) {
+
+ if ((word & 0xff00) == 0)
num += 8;
+ else
word >>= 8;
- }
+
if ((word & 0xf0) == 0)
num += 4;
else
word >>= 4;
+
if ((word & 0xc) == 0)
num += 2;
else
word >>= 2;
+
if ((word & 0x2) == 0)
num += 1;
return num;
@@ -67,158 +86,103 @@ static inline unsigned long __reverse_ffs(unsigned long word)
/*
* __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be integral times of unsigned long.
* Example:
- * LSB <--> MSB
- * f2fs_set_bit(0, bitmap) => 0000 0001
- * f2fs_set_bit(7, bitmap) => 1000 0000
+ * MSB <--> LSB
+ * f2fs_set_bit(0, bitmap) => 1000 0000
+ * f2fs_set_bit(7, bitmap) => 0000 0001
*/
static unsigned long __find_rev_next_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
- while (!f2fs_test_bit(offset, (unsigned char *)addr))
- offset++;
-
- if (offset > size)
- offset = size;
-
- return offset;
-#if 0
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~0UL << quot;
- submask = (unsigned char)(0xff << rest) >> rest;
- submask <<= quot;
- mask &= submask;
- tmp &= mask;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
-aligned:
- while (size & ~(BITS_PER_LONG-1)) {
- tmp = *(p++);
+
+ while (1) {
+ if (*p == 0)
+ goto pass;
+
+ tmp = __reverse_ulong((unsigned char *)p);
+
+ tmp &= ~0UL >> offset;
+ if (size < BITS_PER_LONG)
+ tmp &= (~0UL << (BITS_PER_LONG - size));
if (tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
+ p++;
}
- if (!size)
- return result;
- tmp = *p;
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffs(tmp);
-#endif
+ return result;
+found:
+ return result - size + __reverse_ffs(tmp);
}
static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
- while (f2fs_test_bit(offset, (unsigned char *)addr))
- offset++;
-
- if (offset > size)
- offset = size;
-
- return offset;
-#if 0
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~(~0UL << quot);
- submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
- submask <<= quot;
- mask += submask;
- tmp |= mask;
- if (size < BITS_PER_LONG)
- goto found_first;
- if (~tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
-aligned:
- while (size & ~(BITS_PER_LONG - 1)) {
- tmp = *(p++);
- if (~tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+
+ while (1) {
+ if (*p == ~0UL)
+ goto pass;
+
+ tmp = __reverse_ulong((unsigned char *)p);
+
+ if (offset)
+ tmp |= ~0UL << (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG)
+ tmp |= ~0UL >> size;
+ if (tmp != ~0UL)
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
+ p++;
}
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffz(tmp);
-#endif
+ return result;
+found:
+ return result - size + __reverse_ffz(tmp);
}
void register_inmem_page(struct inode *inode, struct page *page)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *new;
- int err;
- SetPagePrivate(page);
f2fs_trace_pid(page);
+ set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE);
+ SetPagePrivate(page);
+
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
/* add atomic page indices to the list */
new->page = page;
INIT_LIST_HEAD(&new->list);
-retry:
+
/* increase reference count with clean state */
mutex_lock(&fi->inmem_lock);
- err = radix_tree_insert(&fi->inmem_root, page->index, new);
- if (err == -EEXIST) {
- mutex_unlock(&fi->inmem_lock);
- kmem_cache_free(inmem_entry_slab, new);
- return;
- } else if (err) {
- mutex_unlock(&fi->inmem_lock);
- goto retry;
- }
get_page(page);
list_add_tail(&new->list, &fi->inmem_pages);
inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
@@ -227,7 +191,7 @@ retry:
trace_f2fs_register_inmem_page(page, INMEM);
}
-void commit_inmem_pages(struct inode *inode, bool abort)
+int commit_inmem_pages(struct inode *inode, bool abort)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -239,6 +203,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
.rw = WRITE_SYNC | REQ_PRIO,
.encrypted_page = NULL,
};
+ int err = 0;
/*
* The abort is true only when f2fs_evict_inode is called.
@@ -248,14 +213,14 @@ void commit_inmem_pages(struct inode *inode, bool abort)
* inode becomes free by iget_locked in f2fs_iget.
*/
if (!abort) {
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
}
mutex_lock(&fi->inmem_lock);
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
+ lock_page(cur->page);
if (!abort) {
- lock_page(cur->page);
if (cur->page->mapping == inode->i_mapping) {
set_page_dirty(cur->page);
f2fs_wait_on_page_writeback(cur->page, DATA);
@@ -263,15 +228,22 @@ void commit_inmem_pages(struct inode *inode, bool abort)
inode_dec_dirty_pages(inode);
trace_f2fs_commit_inmem_page(cur->page, INMEM);
fio.page = cur->page;
- do_write_data_page(&fio);
+ err = do_write_data_page(&fio);
+ if (err) {
+ unlock_page(cur->page);
+ break;
+ }
+ clear_cold_data(cur->page);
submit_bio = true;
}
- f2fs_put_page(cur->page, 1);
} else {
+ ClearPageUptodate(cur->page);
trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
- put_page(cur->page);
}
- radix_tree_delete(&fi->inmem_root, cur->page->index);
+ set_page_private(cur->page, 0);
+ ClearPagePrivate(cur->page);
+ f2fs_put_page(cur->page, 1);
+
list_del(&cur->list);
kmem_cache_free(inmem_entry_slab, cur);
dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
@@ -283,34 +255,50 @@ void commit_inmem_pages(struct inode *inode, bool abort)
if (submit_bio)
f2fs_submit_merged_bio(sbi, DATA, WRITE);
}
+ return err;
}
/*
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
*/
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+ if (!need)
+ return;
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
*/
if (has_not_enough_free_secs(sbi, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi);
+ f2fs_gc(sbi, false);
}
}
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
{
/* try to shrink extent cache when there is no enough memory */
- f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+ if (!available_free_memory(sbi, EXTENT_CACHE))
+ f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+
+ /* check the # of cached NAT entries */
+ if (!available_free_memory(sbi, NAT_ENTRIES))
+ try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
+
+ if (!available_free_memory(sbi, FREE_NIDS))
+ try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES);
- /* check the # of cached NAT entries and prefree segments */
- if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
+ /* checkpoint is the only way to shrink partial cached entries */
+ if (!available_free_memory(sbi, NAT_ENTRIES) ||
excess_prefree_segs(sbi) ||
- !available_free_memory(sbi, INO_ENTRIES))
+ !available_free_memory(sbi, INO_ENTRIES) ||
+ (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+ if (test_opt(sbi, DATA_FLUSH))
+ sync_dirty_inodes(sbi, FILE_INODE);
f2fs_sync_fs(sbi->sb, true);
+ stat_inc_bg_cp_count(sbi->stat_info);
+ }
}
static int issue_flush_thread(void *data)
@@ -323,10 +311,12 @@ repeat:
return 0;
if (!llist_empty(&fcc->issue_list)) {
- struct bio *bio = bio_alloc(GFP_NOIO, 0);
+ struct bio *bio;
struct flush_cmd *cmd, *next;
int ret;
+ bio = f2fs_bio_alloc(0);
+
fcc->dispatch_list = llist_del_all(&fcc->issue_list);
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
@@ -358,8 +348,15 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE))
- return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
+ if (!test_opt(sbi, FLUSH_MERGE)) {
+ struct bio *bio = f2fs_bio_alloc(0);
+ int ret;
+
+ bio->bi_bdev = sbi->sb->s_bdev;
+ ret = submit_bio_wait(WRITE_FLUSH, bio);
+ bio_put(bio);
+ return ret;
+ }
init_completion(&cmd.wait);
@@ -503,7 +500,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
}
-void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
+bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
{
int err = -ENOTSUPP;
@@ -513,13 +510,16 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
if (f2fs_test_bit(offset, se->discard_map))
- return;
+ return false;
err = f2fs_issue_discard(sbi, blkaddr, 1);
}
- if (err)
+ if (err) {
update_meta_page(sbi, NULL, blkaddr);
+ return true;
+ }
+ return false;
}
static void __add_discard_entry(struct f2fs_sb_info *sbi,
@@ -748,6 +748,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
mutex_unlock(&sit_i->sentry_lock);
}
+bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int segno, offset;
+ struct seg_entry *se;
+ bool is_cp = false;
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+ return true;
+
+ mutex_lock(&sit_i->sentry_lock);
+
+ segno = GET_SEGNO(sbi, blkaddr);
+ se = get_seg_entry(sbi, segno);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+ if (f2fs_test_bit(offset, se->ckpt_valid_map))
+ is_cp = true;
+
+ mutex_unlock(&sit_i->sentry_lock);
+
+ return is_cp;
+}
+
/*
* This function should be resided under the curseg_mutex lock
*/
@@ -1097,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
unsigned int start_segno, end_segno;
struct cp_control cpc;
+ int err = 0;
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -1127,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
sbi->segs_per_sec) - 1, end_segno);
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
}
out:
range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
- return 0;
+ return err;
}
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -1218,7 +1243,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_lock(&sit_i->sentry_lock);
/* direct_io'ed data is aligned to the segment for better performance */
- if (direct_io && curseg->next_blkoff)
+ if (direct_io && curseg->next_blkoff &&
+ !has_not_enough_free_secs(sbi, 0))
__allocate_new_segments(sbi, type);
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
@@ -1272,6 +1298,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.encrypted_page = NULL,
};
+ if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
+ fio.rw &= ~REQ_META;
+
set_page_writeback(page);
f2fs_submit_page_mbio(&fio);
}
@@ -1349,7 +1378,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+ if (!recover_curseg)
+ update_sit_entry(sbi, new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
+
locate_dirty_segment(sbi, old_cursegno);
if (recover_curseg) {
@@ -1429,6 +1465,23 @@ void f2fs_wait_on_page_writeback(struct page *page,
}
}
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ struct page *cpage;
+
+ if (blkaddr == NEW_ADDR)
+ return;
+
+ f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+ cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
+ if (cpage) {
+ f2fs_wait_on_page_writeback(cpage, DATA);
+ f2fs_put_page(cpage, 1);
+ }
+}
+
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1566,7 +1619,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
if (npages >= 2)
ra_meta_pages(sbi, start_sum_block(sbi), npages,
- META_CP);
+ META_CP, true);
/* restore for compacted data summary */
if (read_compacted_summaries(sbi))
@@ -1576,7 +1629,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
if (__exist_node_summaries(sbi))
ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
- NR_CURSEG_TYPE - type, META_CP);
+ NR_CURSEG_TYPE - type, META_CP, true);
for (; type <= CURSEG_COLD_NODE; type++) {
err = read_normal_summaries(sbi, type);
@@ -1684,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
if (le32_to_cpu(nid_in_journal(sum, i)) == val)
return i;
}
- if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
+ if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL))
return update_nats_in_cursum(sum, 1);
} else if (type == SIT_JOURNAL) {
for (i = 0; i < sits_in_cursum(sum); i++)
if (le32_to_cpu(segno_in_journal(sum, i)) == val)
return i;
- if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+ if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL))
return update_sits_in_cursum(sum, 1);
}
return -1;
@@ -1733,7 +1786,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
static struct sit_entry_set *grab_sit_entry_set(void)
{
struct sit_entry_set *ses =
- f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
+ f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS);
ses->entry_cnt = 0;
INIT_LIST_HEAD(&ses->set_list);
@@ -1935,12 +1988,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
+ sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -1962,8 +2016,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
- sizeof(struct sec_entry));
+ sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
}
@@ -2008,12 +2062,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -2062,7 +2116,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int nrpages = MAX_BIO_BLOCKS(sbi);
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+ readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -2154,7 +2208,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -2176,7 +2230,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2281,7 +2335,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
mutex_lock(&dirty_i->seglist_lock);
- kfree(dirty_i->dirty_segmap[dirty_type]);
+ kvfree(dirty_i->dirty_segmap[dirty_type]);
dirty_i->nr_dirty[dirty_type] = 0;
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -2289,7 +2343,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- kfree(dirty_i->victim_secmap);
+ kvfree(dirty_i->victim_secmap);
}
static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -2328,8 +2382,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
if (!free_i)
return;
SM_I(sbi)->free_info = NULL;
- kfree(free_i->free_segmap);
- kfree(free_i->free_secmap);
+ kvfree(free_i->free_segmap);
+ kvfree(free_i->free_secmap);
kfree(free_i);
}
@@ -2350,9 +2404,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
}
kfree(sit_i->tmp_map);
- vfree(sit_i->sentries);
- vfree(sit_i->sec_entries);
- kfree(sit_i->dirty_sentries_bitmap);
+ kvfree(sit_i->sentries);
+ kvfree(sit_i->sec_entries);
+ kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
kfree(sit_i->sit_bitmap);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 79e7b87..ee44d34 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -137,10 +137,12 @@ enum {
/*
* BG_GC means the background cleaning job.
* FG_GC means the on-demand cleaning job.
+ * FORCE_FG_GC means on-demand cleaning job in background.
*/
enum {
BG_GC = 0,
- FG_GC
+ FG_GC,
+ FORCE_FG_GC,
};
/* for a function parameter to select a victim segment */
@@ -177,6 +179,15 @@ struct segment_allocation {
void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
};
+/*
+ * this value is set in page as a private data which indicate that
+ * the page is atomically written, and it is in inmem_pages list.
+ */
+#define ATOMIC_WRITTEN_PAGE 0x0000ffff
+
+#define IS_ATOMIC_WRITTEN_PAGE(page) \
+ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+
struct inmem_pages {
struct list_head list;
struct page *page;
@@ -555,16 +566,15 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
return curseg->next_blkoff;
}
-#ifdef CONFIG_F2FS_CHECK_FS
static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
{
- BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
+ f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1);
}
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
- BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
- BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
+ f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi)
+ || blk_addr >= MAX_BLKADDR(sbi));
}
/*
@@ -573,16 +583,11 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
static inline void check_block_count(struct f2fs_sb_info *sbi,
int segno, struct f2fs_sit_entry *raw_sit)
{
+#ifdef CONFIG_F2FS_CHECK_FS
bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
int cur_pos = 0, next_pos;
- /* check segment usage */
- BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
-
- /* check boundary of a given segment number */
- BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
-
/* check bitmap with valid block count */
do {
if (is_valid) {
@@ -598,35 +603,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
is_valid = !is_valid;
} while (cur_pos < sbi->blocks_per_seg);
BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
-}
-#else
-static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
-{
- if (segno > TOTAL_SEGS(sbi) - 1)
- set_sbi_flag(sbi, SBI_NEED_FSCK);
-}
-
-static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
-{
- if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
- set_sbi_flag(sbi, SBI_NEED_FSCK);
-}
-
-/*
- * Summary block is always treated as an invalid block
- */
-static inline void check_block_count(struct f2fs_sb_info *sbi,
- int segno, struct f2fs_sit_entry *raw_sit)
-{
- /* check segment usage */
- if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
- set_sbi_flag(sbi, SBI_NEED_FSCK);
-
- /* check boundary of a given segment number */
- if (segno > TOTAL_SEGS(sbi) - 1)
- set_sbi_flag(sbi, SBI_NEED_FSCK);
-}
#endif
+ /* check segment usage, and check boundary of a given segment number */
+ f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg
+ || segno > TOTAL_SEGS(sbi) - 1);
+}
static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
unsigned int start)
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
new file mode 100644
index 0000000..93606f2
--- /dev/null
+++ b/fs/f2fs/shrinker.c
@@ -0,0 +1,140 @@
+/*
+ * f2fs shrinker support
+ * the basic infra was copied from fs/ubifs/shrinker.c
+ *
+ * Copyright (c) 2015 Motorola Mobility
+ * Copyright (c) 2015 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+
+static LIST_HEAD(f2fs_list);
+static DEFINE_SPINLOCK(f2fs_list_lock);
+static unsigned int shrinker_run_no;
+
+static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+}
+
+static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
+{
+ if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK)
+ return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK;
+ return 0;
+}
+
+static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
+{
+ return atomic_read(&sbi->total_zombie_tree) +
+ atomic_read(&sbi->total_ext_node);
+}
+
+unsigned long f2fs_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+ unsigned long count = 0;
+
+ spin_lock(&f2fs_list_lock);
+ p = f2fs_list.next;
+ while (p != &f2fs_list) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ /* count extent cache entries */
+ count += __count_extent_cache(sbi);
+
+ /* shrink clean nat cache entries */
+ count += __count_nat_entries(sbi);
+
+ /* count free nids cache entries */
+ count += __count_free_nids(sbi);
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ mutex_unlock(&sbi->umount_mutex);
+ }
+ spin_unlock(&f2fs_list_lock);
+ return count;
+}
+
+unsigned long f2fs_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ unsigned long nr = sc->nr_to_scan;
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+ unsigned int run_no;
+ unsigned long freed = 0;
+
+ spin_lock(&f2fs_list_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+ p = f2fs_list.next;
+ while (p != &f2fs_list) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ if (sbi->shrinker_run_no == run_no)
+ break;
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ sbi->shrinker_run_no = run_no;
+
+ /* shrink extent cache entries */
+ freed += f2fs_shrink_extent_tree(sbi, nr >> 1);
+
+ /* shrink clean nat cache entries */
+ if (freed < nr)
+ freed += try_to_free_nats(sbi, nr - freed);
+
+ /* shrink free nids cache entries */
+ if (freed < nr)
+ freed += try_to_free_nids(sbi, nr - freed);
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ list_move_tail(&sbi->s_list, &f2fs_list);
+ mutex_unlock(&sbi->umount_mutex);
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&f2fs_list_lock);
+ return freed;
+}
+
+void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
+{
+ spin_lock(&f2fs_list_lock);
+ list_add_tail(&sbi->s_list, &f2fs_list);
+ spin_unlock(&f2fs_list_lock);
+}
+
+void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
+{
+ f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
+
+ spin_lock(&f2fs_list_lock);
+ list_del(&sbi->s_list);
+ spin_unlock(&f2fs_list_lock);
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index a06b0b4..6134832 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -39,6 +39,13 @@ static struct proc_dir_entry *f2fs_proc_root;
static struct kmem_cache *f2fs_inode_cachep;
static struct kset *f2fs_kset;
+/* f2fs-wide shrinker description */
+static struct shrinker f2fs_shrinker_info = {
+ .scan_objects = f2fs_shrink_scan,
+ .count_objects = f2fs_shrink_count,
+ .seeks = DEFAULT_SEEKS,
+};
+
enum {
Opt_gc_background,
Opt_disable_roll_forward,
@@ -58,7 +65,9 @@ enum {
Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
+ Opt_noextent_cache,
Opt_noinline_data,
+ Opt_data_flush,
Opt_err,
};
@@ -81,7 +90,9 @@ static match_table_t f2fs_tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
{Opt_extent_cache, "extent_cache"},
+ {Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
+ {Opt_data_flush, "data_flush"},
{Opt_err, NULL},
};
@@ -204,8 +215,11 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -222,6 +236,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
+ ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(cp_interval),
+ ATTR_LIST(idle_interval),
NULL,
};
@@ -283,11 +300,16 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
- if (strlen(name) == 2 && !strncmp(name, "on", 2))
+ if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
set_opt(sbi, BG_GC);
- else if (strlen(name) == 3 && !strncmp(name, "off", 3))
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
clear_opt(sbi, BG_GC);
- else {
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
+ set_opt(sbi, BG_GC);
+ set_opt(sbi, FORCE_FG_GC);
+ } else {
kfree(name);
return -EINVAL;
}
@@ -382,9 +404,15 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_extent_cache:
set_opt(sbi, EXTENT_CACHE);
break;
+ case Opt_noextent_cache:
+ clear_opt(sbi, EXTENT_CACHE);
+ break;
case Opt_noinline_data:
clear_opt(sbi, INLINE_DATA);
break;
+ case Opt_data_flush:
+ set_opt(sbi, DATA_FLUSH);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -410,9 +438,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
- rwlock_init(&fi->ext_lock);
init_rwsem(&fi->i_sem);
- INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
+ INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
@@ -441,17 +468,22 @@ static int f2fs_drop_inode(struct inode *inode)
*/
if (!inode_unhashed(inode) && inode->i_state & I_SYNC) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
+ /* to avoid evict_inode call simultaneously */
+ atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
commit_inmem_pages(inode, true);
+ /* should remain fi->extent_tree for writepage */
+ f2fs_destroy_extent_node(inode);
+
sb_start_intwrite(inode->i_sb);
i_size_write(inode, 0);
if (F2FS_HAS_BLOCKS(inode))
- f2fs_truncate(inode);
+ f2fs_truncate(inode, true);
sb_end_intwrite(inode->i_sb);
@@ -461,6 +493,7 @@ static int f2fs_drop_inode(struct inode *inode)
F2FS_I(inode)->i_crypt_info);
#endif
spin_lock(&inode->i_lock);
+ atomic_dec(&inode->i_count);
}
return 0;
}
@@ -498,9 +531,11 @@ static void f2fs_put_super(struct super_block *sb)
}
kobject_del(&sbi->s_kobj);
- f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
+ /* prevent remaining shrinker jobs */
+ mutex_lock(&sbi->umount_mutex);
+
/*
* We don't need to do checkpoint when superblock is clean.
* But, the previous checkpoint was not done by umount, it needs to do
@@ -514,13 +549,19 @@ static void f2fs_put_super(struct super_block *sb)
write_checkpoint(sbi, &cpc);
}
+ /* write_checkpoint can update stat informaion */
+ f2fs_destroy_stats(sbi);
+
/*
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_dirty_inode(sbi);
+ release_ino_entry(sbi);
release_discard_addrs(sbi);
+ f2fs_leave_shrinker(sbi);
+ mutex_unlock(&sbi->umount_mutex);
+
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -533,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb)
wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
- brelse(sbi->raw_super_buf);
+ kfree(sbi->raw_super);
kfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err = 0;
trace_f2fs_sync_fs(sb, sync);
@@ -549,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
cpc.reason = __get_cp_reason(sbi);
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
- } else {
- f2fs_balance_fs(sbi);
}
f2fs_trace_ios(NULL, 1);
- return 0;
+ return err;
}
static int f2fs_freeze(struct super_block *sb)
@@ -607,10 +647,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
- seq_printf(seq, ",background_gc=%s", "on");
- else
+ if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
+ if (test_opt(sbi, FORCE_FG_GC))
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else
+ seq_printf(seq, ",background_gc=%s", "on");
+ } else {
seq_printf(seq, ",background_gc=%s", "off");
+ }
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -647,6 +691,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",fastboot");
if (test_opt(sbi, EXTENT_CACHE))
seq_puts(seq, ",extent_cache");
+ else
+ seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, DATA_FLUSH))
+ seq_puts(seq, ",data_flush");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -667,7 +715,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
struct seg_entry *se = get_seg_entry(sbi, i);
if ((i % 10) == 0)
- seq_printf(seq, "%-5d", i);
+ seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u", se->type,
get_valid_blocks(sbi, i, 1));
if ((i % 10) == 9 || i == (total_segs - 1))
@@ -699,6 +747,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_DATA);
+ set_opt(sbi, EXTENT_CACHE);
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -715,6 +764,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
int err, active_logs;
bool need_restart_gc = false;
bool need_stop_gc = false;
+ bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
sync_filesystem(sb);
@@ -740,6 +790,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
goto skip;
+ /* disallow enable/disable extent_cache dynamically */
+ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "switch extent_cache option is not allowed");
+ goto restore_opts;
+ }
+
/*
* We stop the GC thread if FS is mounted as RO
* or if background_gc = off is passed in mount
@@ -849,7 +907,7 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static loff_t max_file_size(unsigned bits)
+static loff_t max_file_blocks(void)
{
loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
loff_t leaf_count = ADDRS_PER_BLOCK;
@@ -865,10 +923,82 @@ static loff_t max_file_size(unsigned bits)
leaf_count *= NIDS_PER_BLOCK;
result += leaf_count;
- result <<= bits;
return result;
}
+static inline bool sanity_check_area_boundary(struct super_block *sb,
+ struct f2fs_super_block *raw_super)
+{
+ u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+ u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
+ u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
+ u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr);
+ u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+ u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+ u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt);
+ u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit);
+ u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat);
+ u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa);
+ u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main);
+ u32 segment_count = le32_to_cpu(raw_super->segment_count);
+ u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+
+ if (segment0_blkaddr != cp_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Mismatch start address, segment0(%u) cp_blkaddr(%u)",
+ segment0_blkaddr, cp_blkaddr);
+ return true;
+ }
+
+ if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) !=
+ sit_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong CP boundary, start(%u) end(%u) blocks(%u)",
+ cp_blkaddr, sit_blkaddr,
+ segment_count_ckpt << log_blocks_per_seg);
+ return true;
+ }
+
+ if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) !=
+ nat_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong SIT boundary, start(%u) end(%u) blocks(%u)",
+ sit_blkaddr, nat_blkaddr,
+ segment_count_sit << log_blocks_per_seg);
+ return true;
+ }
+
+ if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) !=
+ ssa_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong NAT boundary, start(%u) end(%u) blocks(%u)",
+ nat_blkaddr, ssa_blkaddr,
+ segment_count_nat << log_blocks_per_seg);
+ return true;
+ }
+
+ if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) !=
+ main_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong SSA boundary, start(%u) end(%u) blocks(%u)",
+ ssa_blkaddr, main_blkaddr,
+ segment_count_ssa << log_blocks_per_seg);
+ return true;
+ }
+
+ if (main_blkaddr + (segment_count_main << log_blocks_per_seg) !=
+ segment0_blkaddr + (segment_count << log_blocks_per_seg)) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)",
+ main_blkaddr,
+ segment0_blkaddr + (segment_count << log_blocks_per_seg),
+ segment_count_main << log_blocks_per_seg);
+ return true;
+ }
+
+ return false;
+}
+
static int sanity_check_raw_super(struct super_block *sb,
struct f2fs_super_block *raw_super)
{
@@ -898,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb,
return 1;
}
+ /* check log blocks per segment */
+ if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid log blocks per segment (%u)\n",
+ le32_to_cpu(raw_super->log_blocks_per_seg));
+ return 1;
+ }
+
/* Currently, support 512/1024/2048/4096 bytes sector size */
if (le32_to_cpu(raw_super->log_sectorsize) >
F2FS_MAX_LOG_SECTOR_SIZE ||
@@ -916,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb,
le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
+
+ /* check reserved ino info */
+ if (le32_to_cpu(raw_super->node_ino) != 1 ||
+ le32_to_cpu(raw_super->meta_ino) != 2 ||
+ le32_to_cpu(raw_super->root_ino) != 3) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)",
+ le32_to_cpu(raw_super->node_ino),
+ le32_to_cpu(raw_super->meta_ino),
+ le32_to_cpu(raw_super->root_ino));
+ return 1;
+ }
+
+ /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
+ if (sanity_check_area_boundary(sb, raw_super))
+ return 1;
+
return 0;
}
@@ -969,7 +1124,12 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
+ sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
+ sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
+
+ INIT_LIST_HEAD(&sbi->s_list);
+ mutex_init(&sbi->umount_mutex);
}
/*
@@ -979,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
*/
static int read_raw_super_block(struct super_block *sb,
struct f2fs_super_block **raw_super,
- struct buffer_head **raw_super_buf,
- int *recovery)
+ int *valid_super_block, int *recovery)
{
int block = 0;
- struct buffer_head *buffer;
- struct f2fs_super_block *super;
+ struct buffer_head *bh;
+ struct f2fs_super_block *super, *buf;
int err = 0;
+ super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
+ if (!super)
+ return -ENOMEM;
retry:
- buffer = sb_bread(sb, block);
- if (!buffer) {
+ bh = sb_bread(sb, block);
+ if (!bh) {
*recovery = 1;
f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
- err = -EIO;
- goto out;
- }
+ err = -EIO;
+ goto next;
}
- super = (struct f2fs_super_block *)
- ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET);
+ buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET);
/* sanity checking of raw super */
- if (sanity_check_raw_super(sb, super)) {
- brelse(buffer);
+ if (sanity_check_raw_super(sb, buf)) {
+ brelse(bh);
*recovery = 1;
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
- err = -EINVAL;
- goto out;
- }
+ err = -EINVAL;
+ goto next;
}
if (!*raw_super) {
- *raw_super_buf = buffer;
+ memcpy(super, buf, sizeof(*super));
+ *valid_super_block = block;
*raw_super = super;
- } else {
- /* already have a valid superblock */
- brelse(buffer);
}
+ brelse(bh);
+next:
/* check the validity of the second superblock */
if (block == 0) {
block++;
goto retry;
}
-out:
/* No valid superblock */
- if (!*raw_super)
+ if (!*raw_super) {
+ kfree(super);
return err;
+ }
return 0;
}
+static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
+{
+ struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi);
+ struct buffer_head *bh;
+ int err;
+
+ bh = sb_getblk(sbi->sb, block);
+ if (!bh)
+ return -EIO;
+
+ lock_buffer(bh);
+ memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ /* it's rare case, we can do fua all the time */
+ err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+ brelse(bh);
+
+ return err;
+}
+
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
{
- struct buffer_head *sbh = sbi->raw_super_buf;
- sector_t block = sbh->b_blocknr;
int err;
/* write back-up superblock first */
- sbh->b_blocknr = block ? 0 : 1;
- mark_buffer_dirty(sbh);
- err = sync_dirty_buffer(sbh);
-
- sbh->b_blocknr = block;
+ err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1);
/* if we are in recovery path, skip writing valid superblock */
if (recover || err)
- goto out;
+ return err;
/* write current valid superblock */
- mark_buffer_dirty(sbh);
- err = sync_dirty_buffer(sbh);
-out:
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
- return err;
+ return __f2fs_commit_super(sbi, sbi->valid_super_block);
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
- struct buffer_head *raw_super_buf;
struct inode *root;
long err;
bool retry = true, need_fsck = false;
char *options = NULL;
- int recovery, i;
+ int recovery, i, valid_super_block;
try_onemore:
err = -EINVAL;
raw_super = NULL;
- raw_super_buf = NULL;
+ valid_super_block = -1;
recovery = 0;
/* allocate memory for f2fs-specific super block info */
@@ -1097,7 +1260,8 @@ try_onemore:
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery);
+ err = read_raw_super_block(sb, &raw_super, &valid_super_block,
+ &recovery);
if (err)
goto free_sbi;
@@ -1114,7 +1278,9 @@ try_onemore:
if (err)
goto free_options;
- sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+ sbi->max_file_blocks = max_file_blocks();
+ sb->s_maxbytes = sbi->max_file_blocks <<
+ le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
@@ -1130,12 +1296,14 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->sb = sb;
sbi->raw_super = raw_super;
- sbi->raw_super_buf = raw_super_buf;
+ sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
- clear_sbi_flag(sbi, SBI_POR_DOING);
+
+ /* disallow all the data/node/meta page writes */
+ set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
init_rwsem(&sbi->read_io.io_rwsem);
@@ -1181,8 +1349,10 @@ try_onemore:
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
sbi->alloc_valid_block_count = 0;
- INIT_LIST_HEAD(&sbi->dir_inode_list);
- spin_lock_init(&sbi->dir_inode_lock);
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
init_extent_cache_info(sbi);
@@ -1212,8 +1382,12 @@ try_onemore:
goto free_nm;
}
+ f2fs_join_shrinker(sbi);
+
/* if there are nt orphan nodes free them */
- recover_orphan_inodes(sbi);
+ err = recover_orphan_inodes(sbi);
+ if (err)
+ goto free_node_inode;
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -1275,6 +1449,8 @@ try_onemore:
goto free_kobj;
}
}
+ /* recover_fsync_data() cleared this already */
+ clear_sbi_flag(sbi, SBI_POR_DOING);
/*
* If filesystem is not mounted as read-only then
@@ -1294,10 +1470,14 @@ try_onemore:
f2fs_commit_super(sbi, true);
}
+ f2fs_update_time(sbi, CP_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
free_kobj:
kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
@@ -1308,7 +1488,10 @@ free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
free_node_inode:
+ mutex_lock(&sbi->umount_mutex);
+ f2fs_leave_shrinker(sbi);
iput(sbi->node_inode);
+ mutex_unlock(&sbi->umount_mutex);
free_nm:
destroy_node_manager(sbi);
free_sm:
@@ -1321,7 +1504,7 @@ free_meta_inode:
free_options:
kfree(options);
free_sb_buf:
- brelse(raw_super_buf);
+ kfree(raw_super);
free_sbi:
kfree(sbi);
@@ -1358,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
- f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info));
+ f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+ sizeof(struct f2fs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
@@ -1404,13 +1588,24 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_crypto();
if (err)
goto free_kset;
- err = register_filesystem(&f2fs_fs_type);
+
+ err = register_shrinker(&f2fs_shrinker_info);
if (err)
goto free_crypto;
- f2fs_create_root_stats();
+
+ err = register_filesystem(&f2fs_fs_type);
+ if (err)
+ goto free_shrinker;
+ err = f2fs_create_root_stats();
+ if (err)
+ goto free_filesystem;
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
return 0;
+free_filesystem:
+ unregister_filesystem(&f2fs_fs_type);
+free_shrinker:
+ unregister_shrinker(&f2fs_shrinker_info);
free_crypto:
f2fs_exit_crypto();
free_kset:
@@ -1433,6 +1628,7 @@ static void __exit exit_f2fs_fs(void)
{
remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
+ unregister_shrinker(&f2fs_shrinker_info);
unregister_filesystem(&f2fs_fs_type);
f2fs_exit_crypto();
destroy_extent_cache();
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 07449b98..10f1e78 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -25,49 +25,37 @@
#include "f2fs.h"
#include "xattr.h"
-static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- int total_len, prefix_len = 0;
- const char *prefix = NULL;
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
- prefix = XATTR_USER_PREFIX;
- prefix_len = XATTR_USER_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_TRUSTED:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- prefix = XATTR_TRUSTED_PREFIX;
- prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_SECURITY:
- prefix = XATTR_SECURITY_PREFIX;
- prefix_len = XATTR_SECURITY_PREFIX_LEN;
break;
default:
return -EINVAL;
}
-
- total_len = prefix_len + len + 1;
- if (list && total_len <= list_size) {
- memcpy(list, prefix, prefix_len);
- memcpy(list + prefix_len, name, len);
- list[prefix_len + len] = '\0';
- }
- return total_len;
+ return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+ buffer, size, NULL);
}
-static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
@@ -81,72 +69,39 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
default:
return -EINVAL;
}
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
+ return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+ value, size, NULL, flags);
}
-static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static bool f2fs_xattr_user_list(struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
- case F2FS_XATTR_INDEX_USER:
- if (!test_opt(sbi, XATTR_USER))
- return -EOPNOTSUPP;
- break;
- case F2FS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- break;
- case F2FS_XATTR_INDEX_SECURITY:
- break;
- default:
- return -EINVAL;
- }
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- return f2fs_setxattr(d_inode(dentry), type, name,
- value, size, NULL, flags);
+ return test_opt(sbi, XATTR_USER);
}
-static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static bool f2fs_xattr_trusted_list(struct dentry *dentry)
{
- const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
- size_t size;
-
- if (type != F2FS_XATTR_INDEX_ADVISE)
- return 0;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return capable(CAP_SYS_ADMIN);
}
-static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
if (buffer)
*((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
}
-static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
@@ -185,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir,
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_user_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
@@ -193,15 +148,14 @@ const struct xattr_handler f2fs_xattr_user_handler = {
const struct xattr_handler f2fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.flags = F2FS_XATTR_INDEX_TRUSTED,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_trusted_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
const struct xattr_handler f2fs_xattr_advise_handler = {
- .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+ .name = F2FS_SYSTEM_ADVISE_NAME,
.flags = F2FS_XATTR_INDEX_ADVISE,
- .list = f2fs_xattr_advise_list,
.get = f2fs_xattr_advise_get,
.set = f2fs_xattr_advise_set,
};
@@ -209,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
const struct xattr_handler f2fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = F2FS_XATTR_INDEX_SECURITY,
- .list = f2fs_xattr_generic_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
@@ -457,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
+ const char *prefix;
+ size_t prefix_len;
size_t size;
- if (!handler)
+ if (!handler || (handler->list && !handler->list(dentry)))
continue;
- size = handler->list(dentry, buffer, rest, entry->e_name,
- entry->e_name_len, handler->flags);
- if (buffer && size > rest) {
- error = -ERANGE;
- goto cleanup;
+ prefix = handler->prefix ?: handler->name;
+ prefix_len = strlen(prefix);
+ size = prefix_len + entry->e_name_len + 1;
+ if (buffer) {
+ if (size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
-
- if (buffer)
- buffer += size;
rest -= size;
}
error = buffer_size - rest;
@@ -499,9 +459,12 @@ static int __f2fs_setxattr(struct inode *inode, int index,
len = strlen(name);
- if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode))
+ if (len > F2FS_NAME_LEN)
return -ERANGE;
+ if (size > MAX_VALUE_LEN(inode))
+ return -E2BIG;
+
base_addr = read_all_xattrs(inode, ipage);
if (!base_addr)
goto exit;
@@ -608,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (ipage)
return __f2fs_setxattr(inode, index, name, value,
size, ipage, flags);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
/* protect xattr_ver */
@@ -617,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
return err;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 71a7100..79dccc8 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -27,7 +27,7 @@
#define F2FS_XATTR_REFCOUNT_MAX 1024
/* Name indexes */
-#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
+#define F2FS_SYSTEM_ADVISE_NAME "system.advise"
#define F2FS_XATTR_INDEX_USER 1
#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9..8b2127f 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -610,9 +610,9 @@ parse_record:
int status = fat_parse_long(inode, &cpos, &bh, &de,
&unicode, &nr_slots);
if (status < 0) {
- ctx->pos = cpos;
+ bh = NULL;
ret = status;
- goto out;
+ goto end_of_dir;
} else if (status == PARSE_INVALID)
goto record_end;
else if (status == PARSE_NOT_LONGNAME)
@@ -654,8 +654,9 @@ parse_record:
fill_len = short_len;
start_filldir:
- if (!fake_offset)
- ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+ ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+ if (fake_offset && ctx->pos < 2)
+ ctx->pos = 2;
if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
if (!dir_emit_dot(file, ctx))
@@ -681,14 +682,19 @@ record_end:
fake_offset = 0;
ctx->pos = cpos;
goto get_new;
+
end_of_dir:
- ctx->pos = cpos;
+ if (fake_offset && cpos < 2)
+ ctx->pos = 2;
+ else
+ ctx->pos = cpos;
fill_failed:
brelse(bh);
if (unicode)
__putname(unicode);
out:
mutex_unlock(&sbi->s_lock);
+
return ret;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411d..6aece96 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void)
fat_inode_cachep = kmem_cache_create("fat_inode_cache",
sizeof(struct msdos_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (fat_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ee85cd4..350a2c8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -51,7 +51,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (arg & O_NDELAY)
arg |= O_NONBLOCK;
- if (arg & O_DIRECT) {
+ /* Pipe packetized mode is controlled by O_DIRECT flag */
+ if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) {
if (!filp->f_mapping || !filp->f_mapping->a_ops ||
!filp->f_mapping->a_ops->direct_IO)
return -EINVAL;
diff --git a/fs/file.c b/fs/file.c
index 6c672ad..1fbc5c0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,9 +25,9 @@
int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
-/* our max() is unusable in constant expressions ;-/ */
-#define __const_max(x, y) ((x) < (y) ? (x) : (y))
-int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+/* our min() is unusable in constant expressions ;-/ */
+#define __const_min(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
-BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
* vmalloc() if the allocation size will be considered "large" by the VM.
*/
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+ void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+ __GFP_NOWARN | __GFP_NORETRY);
if (data != NULL)
return data;
}
- return vmalloc(size);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
@@ -56,9 +57,35 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
__free_fdtable(container_of(rcu, struct fdtable, rcu));
}
+#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
+#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+
+/*
+ * Copy 'count' fd bits from the old table to the new table and clear the extra
+ * space if any. This does not copy the file pointers. Called with the files
+ * spinlock held for write.
+ */
+static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int count)
+{
+ unsigned int cpy, set;
+
+ cpy = count / BITS_PER_BYTE;
+ set = (nfdt->max_fds - count) / BITS_PER_BYTE;
+ memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+ memset((char *)nfdt->open_fds + cpy, 0, set);
+ memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+ memset((char *)nfdt->close_on_exec + cpy, 0, set);
+
+ cpy = BITBIT_SIZE(count);
+ set = BITBIT_SIZE(nfdt->max_fds) - cpy;
+ memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
+ memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+}
+
/*
- * Expand the fdset in the files_struct. Called with the files spinlock
- * held for write.
+ * Copy all file descriptors from the old table to the new, expanded table and
+ * clear the extra space. Called with the files spinlock held for write.
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
@@ -69,14 +96,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
cpy = ofdt->max_fds * sizeof(struct file *);
set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
memcpy(nfdt->fd, ofdt->fd, cpy);
- memset((char *)(nfdt->fd) + cpy, 0, set);
+ memset((char *)nfdt->fd + cpy, 0, set);
- cpy = ofdt->max_fds / BITS_PER_BYTE;
- set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)(nfdt->open_fds) + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+ copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
static struct fdtable * alloc_fdtable(unsigned int nr)
@@ -105,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
@@ -115,12 +137,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data;
data = alloc_fdmem(max_t(size_t,
- 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+ 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
if (!data)
goto out_arr;
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
+ data += nr / BITS_PER_BYTE;
+ fdt->full_fds_bits = data;
return fdt;
@@ -226,17 +250,22 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
- __clear_bit(fd, fdt->close_on_exec);
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
}
-static inline void __set_open_fd(int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->open_fds);
+ fd /= BITS_PER_LONG;
+ if (!~fdt->open_fds[fd])
+ __set_bit(fd, fdt->full_fds_bits);
}
-static inline void __clear_open_fd(int fd, struct fdtable *fdt)
+static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
+ __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
static int count_open_files(struct fdtable *fdt)
@@ -262,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, size, i;
+ int open_files, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -280,6 +309,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
+ new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
spin_lock(&oldf->file_lock);
@@ -318,12 +348,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
open_files = count_open_files(old_fdt);
}
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
- memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
- memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
-
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f) {
@@ -341,19 +370,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
}
spin_unlock(&oldf->file_lock);
- /* compute the remainder to be cleared */
- size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
- /* This is long word aligned thus could use a optimized version */
- memset(new_fds, 0, size);
-
- if (new_fdt->max_fds > open_files) {
- int left = (new_fdt->max_fds - open_files) / 8;
- int start = open_files / BITS_PER_LONG;
-
- memset(&new_fdt->open_fds[start], 0, left);
- memset(&new_fdt->close_on_exec[start], 0, left);
- }
+ /* clear the remainder */
+ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt);
@@ -454,10 +472,25 @@ struct files_struct init_files = {
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
+ .full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
+static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
+{
+ unsigned long maxfd = fdt->max_fds;
+ unsigned long maxbit = maxfd / BITS_PER_LONG;
+ unsigned long bitbit = start / BITS_PER_LONG;
+
+ bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
+ if (bitbit > maxfd)
+ return maxfd;
+ if (bitbit > start)
+ start = bitbit;
+ return find_next_zero_bit(fdt->open_fds, maxfd, start);
+}
+
/*
* allocate a file descriptor, mark it busy.
*/
@@ -476,7 +509,7 @@ repeat:
fd = files->next_fd;
if (fd < fdt->max_fds)
- fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
+ fd = find_next_fd(fdt, fd);
/*
* N.B. For clone tasks sharing a files structure, this test
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ef73ed6..3e2ccad 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
} else if (S_ISLNK(ip->i_mode)) {
if (!VXFS_ISIMMED(vip)) {
ip->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &vxfs_aops;
} else {
ip->i_op = &simple_symlink_inode_operations;
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 484b32d..1cff72d 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
* by @dp in @dip.
*
* Returns:
- * A NULL-pointer on success, else an negative error code encoded
+ * A NULL-pointer on success, else a negative error code encoded
* in the return pointer.
*/
static struct dentry *
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5fa588e..6915c95 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
- unsigned int single_wait:1;
- unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -88,7 +86,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60;
static inline struct inode *wb_inode(struct list_head *head)
{
- return list_entry(head, struct inode, i_wb_list);
+ return list_entry(head, struct inode, i_io_list);
}
/*
@@ -125,22 +123,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
}
/**
- * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
* @inode: inode to be moved
* @wb: target bdi_writeback
* @head: one of @wb->b_{dirty|io|more_io}
*
- * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
* Returns %true if @inode is the first occupant of the !dirty_time IO
* lists; otherwise, %false.
*/
-static bool inode_wb_list_move_locked(struct inode *inode,
+static bool inode_io_list_move_locked(struct inode *inode,
struct bdi_writeback *wb,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
- list_move(&inode->i_wb_list, head);
+ list_move(&inode->i_io_list, head);
/* dirty_time doesn't count as dirty_io until expiration */
if (head != &wb->b_dirty_time)
@@ -151,19 +149,19 @@ static bool inode_wb_list_move_locked(struct inode *inode,
}
/**
- * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
* @inode: inode to be removed
* @wb: bdi_writeback @inode is being removed from
*
* Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
* clear %WB_has_dirty_io if all are empty afterwards.
*/
-static void inode_wb_list_del_locked(struct inode *inode,
+static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
- list_del_init(&inode->i_wb_list);
+ list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- trace_writeback_queue(wb->bdi, work);
+ trace_writeback_queue(wb, work);
spin_lock_bh(&wb->work_lock);
- if (!test_bit(WB_registered, &wb->state)) {
- if (work->single_wait)
- work->single_done = 1;
+ if (!test_bit(WB_registered, &wb->state))
goto out_unlock;
- }
if (work->done)
atomic_inc(&work->done->cnt);
list_add_tail(&work->list, &wb->work_list);
@@ -351,7 +346,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
- * the inode and we shouldn't modify ->i_wb_list.
+ * the inode and we shouldn't modify ->i_io_list.
*/
if (unlikely(inode->i_state & I_FREEING))
goto skip_switch;
@@ -390,16 +385,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* is always correct including from ->b_dirty_time. The transfer
* preserves @inode->dirtied_when ordering.
*/
- if (!list_empty(&inode->i_wb_list)) {
+ if (!list_empty(&inode->i_io_list)) {
struct inode *pos;
- inode_wb_list_del_locked(inode, old_wb);
+ inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
- list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
if (time_after_eq(inode->dirtied_when,
pos->dirtied_when))
break;
- inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
} else {
inode->i_wb = new_wb;
}
@@ -682,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
if (!wbc->wb)
return;
- rcu_read_lock();
id = mem_cgroup_css_from_page(page)->id;
- rcu_read_unlock();
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
@@ -706,7 +699,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
/**
* inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
@@ -716,6 +709,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
@@ -738,32 +734,6 @@ int inode_congested(struct inode *inode, int cong_bits)
EXPORT_SYMBOL_GPL(inode_congested);
/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's. The caller must have set @work->single_wait before
- * issuing it. This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
-{
- if (WARN_ON_ONCE(!work->single_wait))
- return;
-
- wait_event(bdi->wb_waitq, work->single_done);
-
- /*
- * Paired with smp_wmb() in wb_do_writeback() and ensures that all
- * modifications to @work prior to assertion of ->single_done is
- * visible to the caller once this function returns.
- */
- smp_rmb();
-}
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -792,38 +762,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
}
/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb. If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned. In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion. @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
- struct wb_writeback_work *base_work)
-{
- struct wb_writeback_work *work;
-
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- *work = *base_work;
- work->auto_free = 1;
- work->single_wait = 0;
- } else {
- work = base_work;
- work->auto_free = 0;
- work->single_wait = 1;
- }
- work->single_done = 0;
- wb_queue_work(wb, work);
- return work != base_work;
-}
-
-/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
* @base_work: wb_writeback_work to issue
@@ -838,15 +776,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- long nr_pages = base_work->nr_pages;
- int next_blkcg_id = 0;
- struct bdi_writeback *wb;
- struct wb_iter iter;
+ struct bdi_writeback *last_wb = NULL;
+ struct bdi_writeback *wb = list_entry(&bdi->wb_list,
+ struct bdi_writeback, bdi_node);
might_sleep();
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
+ DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+ struct wb_writeback_work fallback_work;
+ struct wb_writeback_work *work;
+ long nr_pages;
+
+ if (last_wb) {
+ wb_put(last_wb);
+ last_wb = NULL;
+ }
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,15 +802,42 @@ restart:
if (skip_if_busy && writeback_in_progress(wb))
continue;
- base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
- if (!wb_clone_and_queue_work(wb, base_work)) {
- next_blkcg_id = wb->blkcg_css->id + 1;
- rcu_read_unlock();
- wb_wait_for_single_work(bdi, base_work);
- goto restart;
+ nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 1;
+ wb_queue_work(wb, work);
+ continue;
}
+
+ /* alloc failed, execute synchronously using on-stack fallback */
+ work = &fallback_work;
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 0;
+ work->done = &fallback_work_done;
+
+ wb_queue_work(wb, work);
+
+ /*
+ * Pin @wb so that it stays on @bdi->wb_list. This allows
+ * continuing iteration from @wb after dropping and
+ * regrabbing rcu read lock.
+ */
+ wb_get(wb);
+ last_wb = wb;
+
+ rcu_read_unlock();
+ wb_wait_for_completion(bdi, &fallback_work_done);
+ goto restart;
}
rcu_read_unlock();
+
+ if (last_wb)
+ wb_put(last_wb);
}
#else /* CONFIG_CGROUP_WRITEBACK */
@@ -902,8 +876,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
- base_work->single_wait = 0;
- base_work->single_done = 0;
wb_queue_work(&bdi->wb, base_work);
}
}
@@ -924,7 +896,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(wb->bdi);
+ trace_writeback_nowork(wb);
wb_wakeup(wb);
return;
}
@@ -954,19 +926,19 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(wb->bdi);
+ trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
/*
* Remove the inode from the writeback list it is on.
*/
-void inode_wb_list_del(struct inode *inode)
+void inode_io_list_del(struct inode *inode)
{
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
}
@@ -988,7 +960,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}
/*
@@ -996,7 +968,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
+ inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -1055,7 +1027,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
- list_move(&inode->i_wb_list, &tmp);
+ list_move(&inode->i_io_list, &tmp);
moved++;
if (flags & EXPIRE_DIRTY_ATIME)
set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
@@ -1078,7 +1050,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
list_for_each_prev_safe(pos, node, &tmp) {
inode = wb_inode(pos);
if (inode->i_sb == sb)
- list_move(&inode->i_wb_list, dispatch_queue);
+ list_move(&inode->i_io_list, dispatch_queue);
}
}
out:
@@ -1232,10 +1204,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
- inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
}
}
@@ -1378,7 +1350,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -1421,6 +1393,10 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
* Write a portion of b_io inodes which belong to @sb.
*
* Return the number of pages and/or inodes written.
+ *
+ * NOTE! This is called with wb->list_lock held, and will
+ * unlock and relock that for each inode it ends up doing
+ * IO for.
*/
static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
@@ -1518,6 +1494,21 @@ static long writeback_sb_inodes(struct super_block *sb,
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
+
+ if (need_resched()) {
+ /*
+ * We're trying to balance between building up a nice
+ * long list of IOs to improve our merge rate, and
+ * getting those IOs out quickly for anyone throttling
+ * in balance_dirty_pages(). cond_resched() doesn't
+ * unplug, so get our IOs out the door before we
+ * give up the CPU.
+ */
+ blk_flush_plug(current);
+ cond_resched();
+ }
+
+
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
@@ -1525,7 +1516,7 @@ static long writeback_sb_inodes(struct super_block *sb,
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
- cond_resched_lock(&wb->list_lock);
+
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
@@ -1583,12 +1574,15 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
.range_cyclic = 1,
.reason = reason,
};
+ struct blk_plug plug;
+ blk_start_plug(&plug);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
queue_io(wb, &work);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
+ blk_finish_plug(&plug);
return nr_pages - work.nr_pages;
}
@@ -1616,10 +1610,12 @@ static long wb_writeback(struct bdi_writeback *wb,
unsigned long oldest_jif;
struct inode *inode;
long progress;
+ struct blk_plug plug;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
+ blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
/*
@@ -1657,14 +1653,14 @@ static long wb_writeback(struct bdi_writeback *wb,
} else if (work->for_background)
oldest_jif = jiffies;
- trace_writeback_start(wb->bdi, work);
+ trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
- trace_writeback_written(wb->bdi, work);
+ trace_writeback_written(wb, work);
wb_update_bandwidth(wb, wb_start);
@@ -1689,7 +1685,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb->bdi, work);
+ trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
@@ -1699,6 +1695,7 @@ static long wb_writeback(struct bdi_writeback *wb,
}
}
spin_unlock(&wb->list_lock);
+ blk_finish_plug(&plug);
return nr_pages - work->nr_pages;
}
@@ -1794,26 +1791,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
set_bit(WB_writeback_running, &wb->state);
while ((work = get_next_work_item(wb)) != NULL) {
struct wb_completion *done = work->done;
- bool need_wake_up = false;
- trace_writeback_exec(wb->bdi, work);
+ trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
- if (work->single_wait) {
- WARN_ON_ONCE(work->auto_free);
- /* paired w/ rmb in wb_wait_for_single_work() */
- smp_wmb();
- work->single_done = 1;
- need_wake_up = true;
- } else if (work->auto_free) {
+ if (work->auto_free)
kfree(work);
- }
-
if (done && atomic_dec_and_test(&done->cnt))
- need_wake_up = true;
-
- if (need_wake_up)
wake_up_all(&wb->bdi->wb_waitq);
}
@@ -1885,12 +1870,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
- struct wb_iter iter;
if (!bdi_has_dirty_io(bdi))
continue;
- bdi_for_each_wb(wb, bdi, &iter, 0)
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
false, reason);
}
@@ -1922,11 +1906,10 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
- struct wb_iter iter;
- bdi_for_each_wb(wb, bdi, &iter, 0)
- if (!list_empty(&bdi->wb.b_dirty_time))
- wb_wakeup(&bdi->wb);
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+ if (!list_empty(&wb->b_dirty_time))
+ wb_wakeup(wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1996,9 +1979,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
struct super_block *sb = inode->i_sb;
int dirtytime;
@@ -2088,7 +2071,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
else
dirty_list = &wb->b_dirty_time;
- wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+ wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
spin_unlock(&wb->list_lock);
@@ -2108,9 +2091,19 @@ void __mark_inode_dirty(struct inode *inode, int flags)
out_unlock_inode:
spin_unlock(&inode->i_lock);
+#undef I_DIRTY_INODE
}
EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * The @s_sync_lock is used to serialise concurrent sync operations
+ * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
+ * Concurrent callers will block on the s_sync_lock rather than doing contending
+ * walks. The queueing maintains sync(2) required behaviour as all the IO that
+ * has been issued up to the time this function is enter is guaranteed to be
+ * completed by the time we have gained the lock and waited for all IO that is
+ * in progress regardless of the order callers are granted the lock.
+ */
static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
@@ -2121,7 +2114,8 @@ static void wait_sb_inodes(struct super_block *sb)
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- spin_lock(&inode_sb_list_lock);
+ mutex_lock(&sb->s_sync_lock);
+ spin_lock(&sb->s_inode_list_lock);
/*
* Data integrity sync. Must wait for all pages under writeback,
@@ -2141,27 +2135,33 @@ static void wait_sb_inodes(struct super_block *sb)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock. We cannot iput the inode now as we can
+ * s_inode_list_lock. We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
old_inode = inode;
- filemap_fdatawait(mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(mapping);
cond_resched();
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
+ mutex_unlock(&sb->s_sync_lock);
}
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d403c69..43040721 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
switch (cookie->def->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 6d941f5..9b28649 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -22,6 +22,7 @@ static LIST_HEAD(fscache_netfs_list);
int __fscache_register_netfs(struct fscache_netfs *netfs)
{
struct fscache_netfs *ptr;
+ struct fscache_cookie *cookie;
int ret;
_enter("{%s}", netfs->name);
@@ -29,29 +30,25 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
INIT_LIST_HEAD(&netfs->link);
/* allocate a cookie for the primary index */
- netfs->primary_index =
- kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+ cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
- if (!netfs->primary_index) {
+ if (!cookie) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
/* initialise the primary index cookie */
- atomic_set(&netfs->primary_index->usage, 1);
- atomic_set(&netfs->primary_index->n_children, 0);
- atomic_set(&netfs->primary_index->n_active, 1);
+ atomic_set(&cookie->usage, 1);
+ atomic_set(&cookie->n_children, 0);
+ atomic_set(&cookie->n_active, 1);
- netfs->primary_index->def = &fscache_fsdef_netfs_def;
- netfs->primary_index->parent = &fscache_fsdef_index;
- netfs->primary_index->netfs_data = netfs;
- netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED;
+ cookie->def = &fscache_fsdef_netfs_def;
+ cookie->parent = &fscache_fsdef_index;
+ cookie->netfs_data = netfs;
+ cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
- atomic_inc(&netfs->primary_index->parent->usage);
- atomic_inc(&netfs->primary_index->parent->n_children);
-
- spin_lock_init(&netfs->primary_index->lock);
- INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+ spin_lock_init(&cookie->lock);
+ INIT_HLIST_HEAD(&cookie->backing_objects);
/* check the netfs type is not already present */
down_write(&fscache_addremove_sem);
@@ -62,6 +59,10 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
goto already_registered;
}
+ atomic_inc(&cookie->parent->usage);
+ atomic_inc(&cookie->parent->n_children);
+
+ netfs->primary_index = cookie;
list_add(&netfs->link, &fscache_netfs_list);
ret = 0;
@@ -70,11 +71,8 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
already_registered:
up_write(&fscache_addremove_sem);
- if (ret < 0) {
- netfs->primary_index->parent = NULL;
- __fscache_cookie_put(netfs->primary_index);
- netfs->primary_index = NULL;
- }
+ if (ret < 0)
+ kmem_cache_free(fscache_cookie_jar, cookie);
_leave(" = %d", ret);
return ret;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 51dde81..6b028b7 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -316,7 +316,7 @@ static const struct seq_operations fscache_objlist_ops = {
static void fscache_objlist_config(struct fscache_objlist_data *data)
{
#ifdef CONFIG_KEYS
- struct user_key_payload *confkey;
+ const struct user_key_payload *confkey;
unsigned long config;
struct key *key;
const char *buf;
@@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
config = 0;
rcu_read_lock();
- confkey = key->payload.data;
+ confkey = user_key_payload(key);
buf = confkey->data;
for (len = confkey->datalen - 1; len >= 0; len--) {
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 483bbc6..6b35fc4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
/*
* decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
*/
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
struct page *page,
@@ -122,7 +122,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -132,7 +132,7 @@ page_busy:
_debug("fscache writeout timeout page: %p{%lx}",
page, page->index);
- gfp &= ~__GFP_WAIT;
+ gfp &= ~__GFP_DIRECT_RECLAIM;
goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -816,7 +816,7 @@ static void fscache_write_op(struct fscache_operation *_op)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index > op->store_limit) {
+ if (page->index >= op->store_limit) {
fscache_stat(&fscache_n_store_pages_over_limit);
goto superseded;
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index eae2c11..8e3ee19 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -549,6 +549,8 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
unregister_chrdev_region(cc->cdev->dev, 1);
cdev_del(cc->cdev);
}
+ /* Base reference is now owned by "fud" */
+ fuse_conn_put(&cc->fc);
rc = fuse_dev_release(inode, file); /* puts the base reference */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5e2e087..712601f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1365,15 +1365,19 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
return err;
}
-static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
+static const char *fuse_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
char *link;
ssize_t ret;
- link = (char *) __get_free_page(GFP_KERNEL);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ link = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!link)
return ERR_PTR(-ENOMEM);
@@ -1385,11 +1389,11 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
args.out.args[0].value = link;
ret = fuse_simple_request(fc, &args);
if (ret < 0) {
- free_page((unsigned long) link);
+ kfree(link);
link = ERR_PTR(ret);
} else {
link[ret] = '\0';
- *cookie = link;
+ set_delayed_call(done, kfree_link, link);
}
fuse_invalidate_atime(inode);
return link;
@@ -1909,8 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = {
static const struct inode_operations fuse_symlink_inode_operations = {
.setattr = fuse_setattr,
- .follow_link = fuse_follow_link,
- .put_link = free_page_put_link,
+ .get_link = fuse_get_link,
.readlink = generic_readlink,
.getattr = fuse_getattr,
.setxattr = fuse_setxattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f523f2f..aa03aab 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1049,6 +1049,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
flush_dcache_page(page);
+ iov_iter_advance(ii, tmp);
if (!tmp) {
unlock_page(page);
page_cache_release(page);
@@ -1061,7 +1062,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
req->page_descs[req->num_pages].length = tmp;
req->num_pages++;
- iov_iter_advance(ii, tmp);
count += tmp;
pos += tmp;
offset += tmp;
@@ -2189,7 +2189,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
int err;
if (fc->no_flock) {
- err = flock_lock_file_wait(file, fl);
+ err = locks_lock_file_wait(file, fl);
} else {
struct fuse_file *ff = file->private_data;
@@ -2231,20 +2231,77 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
return err ? 0 : outarg.block;
}
+static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ FUSE_ARGS(args);
+ struct fuse_lseek_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .whence = whence
+ };
+ struct fuse_lseek_out outarg;
+ int err;
+
+ if (fc->no_lseek)
+ goto fallback;
+
+ args.in.h.opcode = FUSE_LSEEK;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (err) {
+ if (err == -ENOSYS) {
+ fc->no_lseek = 1;
+ goto fallback;
+ }
+ return err;
+ }
+
+ return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
+
+fallback:
+ err = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!err)
+ return generic_file_llseek(file, offset, whence);
+ else
+ return err;
+}
+
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
loff_t retval;
struct inode *inode = file_inode(file);
- /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
- if (whence == SEEK_CUR || whence == SEEK_SET)
- return generic_file_llseek(file, offset, whence);
-
- mutex_lock(&inode->i_mutex);
- retval = fuse_update_attributes(inode, NULL, file, NULL);
- if (!retval)
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
retval = generic_file_llseek(file, offset, whence);
- mutex_unlock(&inode->i_mutex);
+ break;
+ case SEEK_END:
+ mutex_lock(&inode->i_mutex);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, whence);
+ mutex_unlock(&inode->i_mutex);
+ break;
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ mutex_lock(&inode->i_mutex);
+ retval = fuse_lseek(file, offset, whence);
+ mutex_unlock(&inode->i_mutex);
+ break;
+ default:
+ retval = -EINVAL;
+ }
return retval;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4051131..ce394b5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -605,6 +605,9 @@ struct fuse_conn {
/** Does the filesystem support asynchronous direct-IO submission? */
unsigned async_dio:1;
+ /** Is lseek not implemented by fs? */
+ unsigned no_lseek:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2..4d69d5c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
int err;
fuse_inode_cachep = kmem_cache_create("fuse_inode",
- sizeof(struct fuse_inode),
- 0, SLAB_HWCACHE_ALIGN,
+ sizeof(struct fuse_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
fuse_inode_init_once);
err = -ENOMEM;
if (!fuse_inode_cachep)
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1be3b06..7919326 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type)
{
switch (type) {
case ACL_TYPE_ACCESS:
- return GFS2_POSIX_ACL_ACCESS;
+ return XATTR_POSIX_ACL_ACCESS;
case ACL_TYPE_DEFAULT:
- return GFS2_POSIX_ACL_DEFAULT;
+ return XATTR_POSIX_ACL_DEFAULT;
}
return NULL;
}
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 2d65ec4..3af4f40 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -12,8 +12,6 @@
#include "incore.h"
-#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1caee05..93f0746 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -914,7 +914,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
failed:
gfs2_trans_end(sdp);
gfs2_inplace_release(ip);
- if (ip->i_res->rs_qa_qd_num)
+ if (ip->i_qadata && ip->i_qadata->qa_qd_num)
gfs2_quota_unlock(ip);
if (inode == sdp->sd_rindex) {
gfs2_glock_dq(&m_ip->i_gh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 61296ec..0860f0b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -787,8 +787,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out_rlist;
- if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(ip->i_res);
+ if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
+ gfs2_rs_deltree(&ip->i_res);
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
@@ -1291,13 +1291,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
if (ret)
return ret;
- ret = get_write_access(inode);
- if (ret)
- return ret;
-
inode_dio_wait(inode);
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
@@ -1307,10 +1303,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
goto out;
}
- gfs2_rs_deltree(ip->i_res);
ret = do_shrink(inode, oldsize, newsize);
out:
- put_write_access(inode);
+ gfs2_rsqa_delete(ip, NULL);
return ret;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b..6a92592 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000
+#define GFS2_USE_HASH_FLAG 0x2000
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;
@@ -108,7 +110,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head *bh;
int error;
- error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh);
if (error)
return error;
if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -305,7 +307,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
BUG_ON(extlen < 1);
bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
} else {
- error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh);
if (error)
goto fail;
}
@@ -388,8 +390,13 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
+ __be64 *hc;
+
+ spin_lock(&ip->i_inode.i_lock);
+ hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
+ spin_unlock(&ip->i_inode.i_lock);
+
kvfree(hc);
}
@@ -438,6 +445,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent,
return 0;
}
+/* Look for the dirent that contains the offset specified in data. Once we
+ * find that dirent, there must be space available there for the new dirent */
+static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *ptr)
+{
+ unsigned required = GFS2_DIRENT_SIZE(name->len);
+ unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+ if (ptr < (void *)dent || ptr >= (void *)dent + totlen)
+ return 0;
+ if (gfs2_dirent_sentinel(dent))
+ actual = 0;
+ if (ptr < (void *)dent + actual)
+ return -1;
+ if ((void *)dent + totlen >= ptr + required)
+ return 1;
+ return -1;
+}
+
static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
const struct qstr *name,
void *opaque)
@@ -677,6 +705,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
prev->de_rec_len = cpu_to_be16(prev_rec_len);
}
+
+static struct gfs2_dirent *do_init_dirent(struct inode *inode,
+ struct gfs2_dirent *dent,
+ const struct qstr *name,
+ struct buffer_head *bh,
+ unsigned offset)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_dirent *ndent;
+ unsigned totlen;
+
+ totlen = be16_to_cpu(dent->de_rec_len);
+ BUG_ON(offset + name->len > totlen);
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ ndent = (struct gfs2_dirent *)((char *)dent + offset);
+ dent->de_rec_len = cpu_to_be16(offset);
+ gfs2_qstr2dirent(name, totlen - offset, ndent);
+ return ndent;
+}
+
+
/*
* Takes a dent from which to grab space as an argument. Returns the
* newly created dent.
@@ -686,31 +735,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
const struct qstr *name,
struct buffer_head *bh)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_dirent *ndent;
- unsigned offset = 0, totlen;
+ unsigned offset = 0;
if (!gfs2_dirent_sentinel(dent))
offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
- totlen = be16_to_cpu(dent->de_rec_len);
- BUG_ON(offset + name->len > totlen);
- gfs2_trans_add_meta(ip->i_gl, bh);
- ndent = (struct gfs2_dirent *)((char *)dent + offset);
- dent->de_rec_len = cpu_to_be16(offset);
- gfs2_qstr2dirent(name, totlen - offset, ndent);
- return ndent;
+ return do_init_dirent(inode, dent, name, bh, offset);
}
-static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
- struct buffer_head *bh,
- const struct qstr *name)
+static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode,
+ struct buffer_head *bh,
+ const struct qstr *name,
+ void *ptr)
{
struct gfs2_dirent *dent;
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
- gfs2_dirent_find_space, name, NULL);
+ gfs2_dirent_find_offset, name, ptr);
if (!dent || IS_ERR(dent))
return dent;
- return gfs2_init_dirent(inode, dent, name, bh);
+ return do_init_dirent(inode, dent, name, bh,
+ (unsigned)(ptr - (void *)dent));
}
static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
@@ -718,7 +761,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
{
int error;
- error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+ error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp);
if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
/* pr_info("block num=%llu\n", leaf_no); */
error = -EIO;
@@ -1046,10 +1089,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
if (!gfs2_dirent_sentinel(dent) &&
be32_to_cpu(dent->de_hash) < divider) {
struct qstr str;
+ void *ptr = ((char *)dent - obh->b_data) + nbh->b_data;
str.name = (char*)(dent+1);
str.len = be16_to_cpu(dent->de_name_len);
str.hash = be32_to_cpu(dent->de_hash);
- new = gfs2_dirent_alloc(inode, nbh, &str);
+ new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr);
if (IS_ERR(new)) {
error = PTR_ERR(new);
break;
@@ -1181,10 +1225,10 @@ static int compare_dents(const void *a, const void *b)
int ret = 0;
dent_a = *(const struct gfs2_dirent **)a;
- hash_a = be32_to_cpu(dent_a->de_hash);
+ hash_a = dent_a->de_cookie;
dent_b = *(const struct gfs2_dirent **)b;
- hash_b = be32_to_cpu(dent_b->de_hash);
+ hash_b = dent_b->de_cookie;
if (hash_a > hash_b)
ret = 1;
@@ -1222,19 +1266,20 @@ static int compare_dents(const void *a, const void *b)
*/
static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
- const struct gfs2_dirent **darr, u32 entries,
- int *copied)
+ struct gfs2_dirent **darr, u32 entries,
+ u32 sort_start, int *copied)
{
const struct gfs2_dirent *dent, *dent_next;
u64 off, off_next;
unsigned int x, y;
int run = 0;
- sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+ if (sort_start < entries)
+ sort(&darr[sort_start], entries - sort_start,
+ sizeof(struct gfs2_dirent *), compare_dents, NULL);
dent_next = darr[0];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
+ off_next = dent_next->de_cookie;
for (x = 0, y = 1; x < entries; x++, y++) {
dent = dent_next;
@@ -1242,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
if (y < entries) {
dent_next = darr[y];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
+ off_next = dent_next->de_cookie;
if (off < ctx->pos)
continue;
@@ -1290,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
return ptr;
}
+
+static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ unsigned leaf_nr, struct gfs2_dirent **darr,
+ unsigned entries)
+{
+ int sort_id = -1;
+ int i;
+
+ for (i = 0; i < entries; i++) {
+ unsigned offset;
+
+ darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
+ darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
+
+ if (!sdp->sd_args.ar_loccookie)
+ continue;
+ offset = (char *)(darr[i]) -
+ (bh->b_data + gfs2_dirent_offset(bh->b_data));
+ offset /= GFS2_MIN_DIRENT_SIZE;
+ offset += leaf_nr * sdp->sd_max_dents_per_leaf;
+ if (offset >= GFS2_USE_HASH_FLAG ||
+ leaf_nr >= GFS2_USE_HASH_FLAG) {
+ darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
+ if (sort_id < 0)
+ sort_id = i;
+ continue;
+ }
+ darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
+ darr[i]->de_cookie |= offset;
+ }
+ return sort_id;
+}
+
+
static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
int *copied, unsigned *depth,
u64 leaf_no)
@@ -1299,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
struct buffer_head *bh;
struct gfs2_leaf *lf;
unsigned entries = 0, entries2 = 0;
- unsigned leaves = 0;
- const struct gfs2_dirent **darr, *dent;
+ unsigned leaves = 0, leaf = 0, offset, sort_offset;
+ struct gfs2_dirent **darr, *dent;
struct dirent_gather g;
struct buffer_head **larr;
- int leaf = 0;
- int error, i;
+ int error, i, need_sort = 0, sort_id;
u64 lfn = leaf_no;
do {
@@ -1320,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
brelse(bh);
} while(lfn);
+ if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+ need_sort = 1;
+ sort_offset = 0;
+ }
+
if (!entries)
return 0;
@@ -1333,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
if (!larr)
goto out;
- darr = (const struct gfs2_dirent **)(larr + leaves);
- g.pdent = darr;
+ darr = (struct gfs2_dirent **)(larr + leaves);
+ g.pdent = (const struct gfs2_dirent **)darr;
g.offset = 0;
lfn = leaf_no;
@@ -1345,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
lf = (struct gfs2_leaf *)bh->b_data;
lfn = be64_to_cpu(lf->lf_next);
if (lf->lf_entries) {
+ offset = g.offset;
entries2 += be16_to_cpu(lf->lf_entries);
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1362,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
goto out_free;
}
error = 0;
+ sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+ be16_to_cpu(lf->lf_entries));
+ if (!need_sort && sort_id >= 0) {
+ need_sort = 1;
+ sort_offset = offset + sort_id;
+ }
larr[leaf++] = bh;
} else {
+ larr[leaf++] = NULL;
brelse(bh);
}
} while(lfn);
BUG_ON(entries2 != entries);
- error = do_filldir_main(ip, ctx, darr, entries, copied);
+ error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
+ sort_offset : entries, copied);
out_free:
for(i = 0; i < leaf; i++)
- brelse(larr[i]);
+ if (larr[i])
+ brelse(larr[i]);
kvfree(larr);
out:
return error;
@@ -1478,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct dirent_gather g;
- const struct gfs2_dirent **darr, *dent;
+ struct gfs2_dirent **darr, *dent;
struct buffer_head *dibh;
int copied = 0;
int error;
@@ -1502,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
/* 96 is max number of dirents which can be stuffed into an inode */
darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
if (darr) {
- g.pdent = darr;
+ g.pdent = (const struct gfs2_dirent **)darr;
g.offset = 0;
dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1519,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
error = -EIO;
goto out;
}
+ gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
error = do_filldir_main(dip, ctx, darr,
- dip->i_entries, &copied);
+ dip->i_entries, 0, &copied);
out:
kfree(darr);
}
@@ -1555,15 +1648,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
+ struct inode *inode;
+ u16 rahead;
+
if (IS_ERR(dent))
return ERR_CAST(dent);
dtype = be16_to_cpu(dent->de_type);
+ rahead = be16_to_cpu(dent->de_rahead);
addr = be64_to_cpu(dent->de_inum.no_addr);
formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
if (fail_on_exist)
return ERR_PTR(-EEXIST);
- return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+ if (!IS_ERR(inode))
+ GFS2_I(inode)->i_rahead = rahead;
+ return inode;
}
return ERR_PTR(-ENOENT);
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cf4ab89..7412863 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
gfsflags &= ~GFS2_DIF_TOPDIR;
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- return do_gfs2_set_flags(filp, gfsflags, ~0);
+ return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM);
}
- return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
+ return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA));
}
static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -336,8 +336,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
int hint = min_t(size_t, INT_MAX, blks);
- if (hint > atomic_read(&ip->i_res->rs_sizehint))
- atomic_set(&ip->i_res->rs_sizehint, hint);
+ if (hint > atomic_read(&ip->i_res.rs_sizehint))
+ atomic_set(&ip->i_res.rs_sizehint, hint);
}
/**
@@ -397,14 +397,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Update file times before taking page lock */
file_update_time(vma->vm_file);
- ret = get_write_access(inode);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
- ret = gfs2_rs_alloc(ip);
- if (ret)
- goto out_write_access;
-
gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -486,8 +482,6 @@ out_uninit:
set_page_dirty(page);
wait_for_stable_page(page);
}
-out_write_access:
- put_write_access(inode);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
@@ -623,7 +617,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
if (!(file->f_mode & FMODE_WRITE))
return 0;
- gfs2_rs_delete(ip, &inode->i_writecount);
+ gfs2_rsqa_delete(ip, &inode->i_writecount);
return 0;
}
@@ -703,7 +697,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
return ret;
@@ -897,8 +891,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
i_size_write(inode, pos + count);
- /* Marks the inode as dirty */
file_update_time(file);
+ mark_inode_dirty(inode);
}
return generic_write_sync(file, pos, count);
@@ -938,13 +932,14 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if (ret)
goto out_unlock;
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out_putw;
ret = __gfs2_fallocate(file, mode, offset, len);
if (ret)
- gfs2_rs_deltree(ip->i_res);
+ gfs2_rs_deltree(&ip->i_res);
+
out_putw:
put_write_access(inode);
out_unlock:
@@ -962,7 +957,7 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
int error;
struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return (ssize_t)error;
@@ -1000,7 +995,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
}
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
if (fl->fl_type == F_UNLCK)
- posix_lock_file_wait(file, fl);
+ locks_lock_file_wait(file, fl);
return -EIO;
}
if (IS_GETLK(cmd))
@@ -1018,7 +1013,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_inode *ip = GFS2_I(file_inode(file));
struct gfs2_glock *gl;
unsigned int state;
- int flags;
+ u16 flags;
int error = 0;
int sleeptime;
@@ -1031,8 +1026,11 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (gl) {
if (fl_gh->gh_state == state)
goto out;
- flock_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ locks_lock_file_wait(file,
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
@@ -1056,7 +1054,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (error == GLR_TRYFAILED)
error = -EAGAIN;
} else {
- error = flock_lock_file_wait(file, fl);
+ error = locks_lock_file_wait(file, fl);
gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
}
@@ -1071,7 +1069,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
struct gfs2_holder *fl_gh = &fp->f_fl_gh;
mutex_lock(&fp->f_fl_mutex);
- flock_lock_file_wait(file, fl);
+ locks_lock_file_wait(file, fl);
if (fl_gh->gh_gl) {
gfs2_glock_dq(fl_gh);
gfs2_holder_uninit(fl_gh);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a38e38f..a4ff7b5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,6 +34,7 @@
#include <linux/percpu.h>
#include <linux/list_sort.h>
#include <linux/lockref.h>
+#include <linux/rhashtable.h>
#include "gfs2.h"
#include "incore.h"
@@ -50,9 +51,8 @@
#include "trace_gfs2.h"
struct gfs2_glock_iter {
- int hash; /* hash bucket index */
- unsigned nhash; /* Index within current bucket */
struct gfs2_sbd *sdp; /* incore superblock */
+ struct rhashtable_iter hti; /* rhashtable iterator */
struct gfs2_glock *gl; /* current glock struct */
loff_t last_pos; /* last position */
};
@@ -70,44 +70,19 @@ static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
-#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
-static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
-static struct dentry *gfs2_root;
-
-/**
- * gl_hash() - Turn glock number into hash bucket number
- * @lock: The glock number
- *
- * Returns: The number of the corresponding hash bucket
- */
-
-static unsigned int gl_hash(const struct gfs2_sbd *sdp,
- const struct lm_lockname *name)
-{
- unsigned int h;
-
- h = jhash(&name->ln_number, sizeof(u64), 0);
- h = jhash(&name->ln_type, sizeof(unsigned int), h);
- h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
- h &= GFS2_GL_HASH_MASK;
-
- return h;
-}
-
-static inline void spin_lock_bucket(unsigned int hash)
-{
- hlist_bl_lock(&gl_hash_table[hash]);
-}
+static struct rhashtable_params ht_parms = {
+ .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
+ .key_len = sizeof(struct lm_lockname),
+ .key_offset = offsetof(struct gfs2_glock, gl_name),
+ .head_offset = offsetof(struct gfs2_glock, gl_node),
+};
-static inline void spin_unlock_bucket(unsigned int hash)
-{
- hlist_bl_unlock(&gl_hash_table[hash]);
-}
+static struct rhashtable gl_hash_table;
-static void gfs2_glock_dealloc(struct rcu_head *rcu)
+void gfs2_glock_free(struct gfs2_glock *gl)
{
- struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -115,13 +90,6 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(gfs2_glock_cachep, gl);
}
-}
-
-void gfs2_glock_free(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
}
@@ -192,7 +160,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
if (lockref_put_or_lock(&gl->gl_lockref))
@@ -202,9 +170,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
- spin_lock_bucket(gl->gl_hash);
- hlist_bl_del_rcu(&gl->gl_list);
- spin_unlock_bucket(gl->gl_hash);
+ rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
@@ -212,33 +178,6 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
- * search_bucket() - Find struct gfs2_glock by lock number
- * @bucket: the bucket to search
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *search_bucket(unsigned int hash,
- const struct gfs2_sbd *sdp,
- const struct lm_lockname *name)
-{
- struct gfs2_glock *gl;
- struct hlist_bl_node *h;
-
- hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
- if (!lm_name_equal(&gl->gl_name, name))
- continue;
- if (gl->gl_sbd != sdp)
- continue;
- if (lockref_get_not_dead(&gl->gl_lockref))
- return gl;
- }
-
- return NULL;
-}
-
-/**
* may_grant - check if its ok to grant a new lock
* @gl: The glock
* @gh: The lock request which we wish to grant
@@ -307,8 +246,8 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh, *tmp;
@@ -321,10 +260,10 @@ restart:
if (may_grant(gl, gh)) {
if (gh->gh_list.prev == &gl->gl_holders &&
glops->go_lock) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
/* FIXME: eliminate this eventually */
ret = glops->go_lock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (ret) {
if (ret == 1)
return 2;
@@ -422,7 +361,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
unsigned state = ret & LM_OUT_ST_MASK;
int rv;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
state_change(gl, state);
gh = find_first_waiter(gl);
@@ -466,7 +405,7 @@ retry:
pr_err("wanted %u got %u\n", gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
@@ -475,9 +414,9 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl, gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (rv) {
do_error(gl, rv);
goto out;
@@ -490,7 +429,7 @@ retry:
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
out_locked:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -502,12 +441,12 @@ out_locked:
*/
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_sbd *sdp = gl->gl_sbd;
- unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
@@ -525,7 +464,7 @@ __acquires(&gl->gl_spin)
(gl->gl_state == LM_ST_EXCLUSIVE) ||
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (glops->go_sync)
glops->go_sync(gl);
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -546,7 +485,7 @@ __acquires(&gl->gl_spin)
gfs2_glock_put(gl);
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
/**
@@ -574,8 +513,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
*/
static void run_queue(struct gfs2_glock *gl, const int nonblock)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
int ret;
@@ -628,7 +567,7 @@ out_unlock:
static void delete_work_func(struct work_struct *work)
{
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip;
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
@@ -657,7 +596,7 @@ static void glock_work_func(struct work_struct *work)
finish_xmote(gl, gl->gl_reply);
drop_ref = 1;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
gl->gl_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != LM_ST_EXCLUSIVE) {
@@ -673,7 +612,7 @@ static void glock_work_func(struct work_struct *work)
}
}
run_queue(gl, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!delay)
gfs2_glock_put(gl);
else {
@@ -704,15 +643,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct gfs2_glock **glp)
{
struct super_block *s = sdp->sd_vfs;
- struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
- struct gfs2_glock *gl, *tmp;
- unsigned int hash = gl_hash(sdp, &name);
+ struct lm_lockname name = { .ln_number = number,
+ .ln_type = glops->go_type,
+ .ln_sbd = sdp };
+ struct gfs2_glock *gl, *tmp = NULL;
struct address_space *mapping;
struct kmem_cache *cachep;
+ int ret, tries = 0;
- rcu_read_lock();
- gl = search_bucket(hash, sdp, &name);
- rcu_read_unlock();
+ gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+ if (gl && !lockref_get_not_dead(&gl->gl_lockref))
+ gl = NULL;
*glp = gl;
if (gl)
@@ -739,14 +680,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
atomic_inc(&sdp->sd_glock_disposal);
- gl->gl_sbd = sdp;
+ gl->gl_node.next = NULL;
gl->gl_flags = 0;
gl->gl_name = name;
gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
- gl->gl_hash = hash;
gl->gl_ops = glops;
gl->gl_dstamp = ktime_set(0, 0);
preempt_disable();
@@ -771,22 +711,34 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->writeback_index = 0;
}
- spin_lock_bucket(hash);
- tmp = search_bucket(hash, sdp, &name);
- if (tmp) {
- spin_unlock_bucket(hash);
- kfree(gl->gl_lksb.sb_lvbptr);
- kmem_cache_free(cachep, gl);
- atomic_dec(&sdp->sd_glock_disposal);
- gl = tmp;
- } else {
- hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
- spin_unlock_bucket(hash);
+again:
+ ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
+ ht_parms);
+ if (ret == 0) {
+ *glp = gl;
+ return 0;
}
- *glp = gl;
+ if (ret == -EEXIST) {
+ ret = 0;
+ tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+ if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
+ if (++tries < 100) {
+ cond_resched();
+ goto again;
+ }
+ tmp = NULL;
+ ret = -ENOMEM;
+ }
+ } else {
+ WARN_ON_ONCE(ret);
+ }
+ kfree(gl->gl_lksb.sb_lvbptr);
+ kmem_cache_free(cachep, gl);
+ atomic_dec(&sdp->sd_glock_disposal);
+ *glp = tmp;
- return 0;
+ return ret;
}
/**
@@ -798,7 +750,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
*
*/
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh)
{
INIT_LIST_HEAD(&gh->gh_list);
@@ -822,7 +774,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
*
*/
-void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh)
{
gh->gh_state = state;
gh->gh_flags = flags;
@@ -924,11 +876,11 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *insert_pt = NULL;
struct gfs2_holder *gh2;
int try_futile = 0;
@@ -974,10 +926,10 @@ fail:
do_cancel:
gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
return;
@@ -1006,7 +958,7 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error = 0;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -1015,7 +967,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
@@ -1025,7 +977,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
gl->gl_lockref.count--;
}
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
@@ -1058,7 +1010,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1066,9 +1018,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
glops->go_unlock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
}
if (list_empty(&gl->gl_holders) &&
@@ -1081,7 +1033,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
gfs2_glock_add_to_lru(gl);
trace_gfs2_glock_queue(gh, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (likely(fast_path))
return;
@@ -1128,7 +1080,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
- unsigned int state, int flags, struct gfs2_holder *gh)
+ unsigned int state, u16 flags, struct gfs2_holder *gh)
{
struct gfs2_glock *gl;
int error;
@@ -1265,9 +1217,9 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
delay = gl->gl_hold_time;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
handle_callback(gl, state, delay, true);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1307,28 +1259,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
- * The gl_reply field is under the gl_spin lock so that it is ok
+ * The gl_reply field is under the gl_lockref.lock lock so that it is ok
* to use a bitfield shared with other glock state fields.
*/
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
}
gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
@@ -1374,14 +1326,14 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_entry(list->next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
- if (!spin_trylock(&gl->gl_spin)) {
+ if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
atomic_inc(&lru_count);
continue;
}
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
goto add_back_to_lru;
}
clear_bit(GLF_LRU, &gl->gl_flags);
@@ -1391,7 +1343,7 @@ add_back_to_lru:
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gl->gl_lockref.count--;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
}
}
@@ -1462,31 +1414,26 @@ static struct shrinker glock_shrinker = {
*
*/
-static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
- unsigned int hash)
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
- struct hlist_bl_head *head = &gl_hash_table[hash];
- struct hlist_bl_node *pos;
+ struct rhash_head *pos;
+ const struct bucket_table *tbl;
+ int i;
rcu_read_lock();
- hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
- if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
- examiner(gl);
+ tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
+ for (i = 0; i < tbl->size; i++) {
+ rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) {
+ if ((gl->gl_name.ln_sbd == sdp) &&
+ lockref_get_not_dead(&gl->gl_lockref))
+ examiner(gl);
+ }
}
rcu_read_unlock();
cond_resched();
}
-static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
-{
- unsigned x;
-
- for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
- examine_bucket(examiner, sdp, x);
-}
-
-
/**
* thaw_glock - thaw out a glock which has an unprocessed reply waiting
* @gl: The glock to thaw
@@ -1514,10 +1461,10 @@ static void clear_glock(struct gfs2_glock *gl)
{
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
}
@@ -1535,9 +1482,9 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gfs2_dump_glock(seq, gl);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static void dump_glock_func(struct gfs2_glock *gl)
@@ -1559,7 +1506,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
- wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
+ wait_event_timeout(sdp->sd_glock_wait,
+ atomic_read(&sdp->sd_glock_disposal) == 0,
+ HZ * 600);
glock_hash_walk(dump_glock_func, sdp);
}
@@ -1569,12 +1518,12 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
int ret;
ret = gfs2_truncatei_resume(ip);
- gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+ gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static const char *state2str(unsigned state)
@@ -1592,7 +1541,7 @@ static const char *state2str(unsigned state)
return "??";
}
-static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
{
char *p = buf;
if (flags & LM_FLAG_TRY)
@@ -1733,17 +1682,17 @@ static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock *gl = iter_ptr;
- seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+ seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number,
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
- (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
- (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
- (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
- (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
return 0;
}
@@ -1776,11 +1725,10 @@ static const char *gfs2_stype[] = {
static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct gfs2_glock_iter *gi = seq->private;
- struct gfs2_sbd *sdp = gi->sdp;
- unsigned index = gi->hash >> 3;
- unsigned subindex = gi->hash & 0x07;
- s64 value;
+ struct gfs2_sbd *sdp = seq->private;
+ loff_t pos = *(loff_t *)iter_ptr;
+ unsigned index = pos >> 3;
+ unsigned subindex = pos & 0x07;
int i;
if (index == 0 && subindex != 0)
@@ -1791,12 +1739,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
for_each_possible_cpu(i) {
const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
- if (index == 0) {
- value = i;
- } else {
- value = lkstats->lkstats[index - 1].stats[subindex];
- }
- seq_printf(seq, " %15lld", (long long)value);
+
+ if (index == 0)
+ seq_printf(seq, " %15u", i);
+ else
+ seq_printf(seq, " %15llu", (unsigned long long)lkstats->
+ lkstats[index - 1].stats[subindex]);
}
seq_putc(seq, '\n');
return 0;
@@ -1804,20 +1752,24 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
int __init gfs2_glock_init(void)
{
- unsigned i;
- for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
- INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
- }
+ int ret;
+
+ ret = rhashtable_init(&gl_hash_table, &ht_parms);
+ if (ret < 0)
+ return ret;
glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZABLE, 0);
- if (!glock_workqueue)
+ if (!glock_workqueue) {
+ rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
+ }
gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
WQ_MEM_RECLAIM | WQ_FREEZABLE,
0);
if (!gfs2_delete_workqueue) {
destroy_workqueue(glock_workqueue);
+ rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
@@ -1829,72 +1781,41 @@ int __init gfs2_glock_init(void)
void gfs2_glock_exit(void)
{
unregister_shrinker(&glock_shrinker);
+ rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
destroy_workqueue(gfs2_delete_workqueue);
}
-static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
-{
- return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
- struct gfs2_glock, gl_list);
-}
-
-static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
+static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
- return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
- struct gfs2_glock, gl_list);
-}
-
-static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
-{
- struct gfs2_glock *gl;
-
do {
- gl = gi->gl;
- if (gl) {
- gi->gl = glock_hash_next(gl);
- gi->nhash++;
- } else {
- if (gi->hash >= GFS2_GL_HASH_SIZE) {
- rcu_read_unlock();
- return 1;
- }
- gi->gl = glock_hash_chain(gi->hash);
- gi->nhash = 0;
- }
- while (gi->gl == NULL) {
- gi->hash++;
- if (gi->hash >= GFS2_GL_HASH_SIZE) {
- rcu_read_unlock();
- return 1;
- }
- gi->gl = glock_hash_chain(gi->hash);
- gi->nhash = 0;
+ gi->gl = rhashtable_walk_next(&gi->hti);
+ if (IS_ERR(gi->gl)) {
+ if (PTR_ERR(gi->gl) == -EAGAIN)
+ continue;
+ gi->gl = NULL;
}
/* Skip entries for other sb and dead entries */
- } while (gi->sdp != gi->gl->gl_sbd ||
- __lockref_is_dead(&gi->gl->gl_lockref));
-
- return 0;
+ } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) ||
+ __lockref_is_dead(&gi->gl->gl_lockref)));
}
static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
+ int ret;
if (gi->last_pos <= *pos)
- n = gi->nhash + (*pos - gi->last_pos);
- else
- gi->hash = 0;
+ n = (*pos - gi->last_pos);
- gi->nhash = 0;
- rcu_read_lock();
+ ret = rhashtable_walk_start(&gi->hti);
+ if (ret)
+ return NULL;
do {
- if (gfs2_glock_iter_next(gi))
- return NULL;
- } while (n--);
+ gfs2_glock_iter_next(gi);
+ } while (gi->gl && n--);
gi->last_pos = *pos;
return gi->gl;
@@ -1907,9 +1828,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
(*pos)++;
gi->last_pos = *pos;
- if (gfs2_glock_iter_next(gi))
- return NULL;
-
+ gfs2_glock_iter_next(gi);
return gi->gl;
}
@@ -1917,9 +1836,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock_iter *gi = seq->private;
- if (gi->gl)
- rcu_read_unlock();
gi->gl = NULL;
+ rhashtable_walk_stop(&gi->hti);
}
static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1930,26 +1848,19 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct gfs2_glock_iter *gi = seq->private;
-
- gi->hash = *pos;
+ preempt_disable();
if (*pos >= GFS2_NR_SBSTATS)
return NULL;
- preempt_disable();
- return SEQ_START_TOKEN;
+ return pos;
}
static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
- gi->hash++;
- if (gi->hash >= GFS2_NR_SBSTATS) {
- preempt_enable();
+ if (*pos >= GFS2_NR_SBSTATS)
return NULL;
- }
- return SEQ_START_TOKEN;
+ return pos;
}
static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
@@ -1987,14 +1898,28 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
+
gi->sdp = inode->i_private;
+ gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
+ gi->gl = NULL;
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
}
return ret;
}
+static int gfs2_glocks_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+
+ gi->gl = NULL;
+ rhashtable_walk_exit(&gi->hti);
+ return seq_release_private(inode, file);
+}
+
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
@@ -2003,21 +1928,22 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
gi->sdp = inode->i_private;
+ gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
+ gi->gl = NULL;
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
}
return ret;
}
static int gfs2_sbstats_open(struct inode *inode, struct file *file)
{
- int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
- sizeof(struct gfs2_glock_iter));
+ int ret = seq_open(file, &gfs2_sbstats_seq_ops);
if (ret == 0) {
struct seq_file *seq = file->private_data;
- struct gfs2_glock_iter *gi = seq->private;
- gi->sdp = inode->i_private;
+ seq->private = inode->i_private; /* sdp */
}
return ret;
}
@@ -2027,7 +1953,7 @@ static const struct file_operations gfs2_glocks_fops = {
.open = gfs2_glocks_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = gfs2_glocks_release,
};
static const struct file_operations gfs2_glstats_fops = {
@@ -2035,7 +1961,7 @@ static const struct file_operations gfs2_glstats_fops = {
.open = gfs2_glstats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = gfs2_glocks_release,
};
static const struct file_operations gfs2_sbstats_fops = {
@@ -2043,7 +1969,7 @@ static const struct file_operations gfs2_sbstats_fops = {
.open = gfs2_sbstats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 32572f7..46ab67f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -79,15 +79,15 @@ enum {
* requested had acquired and released the lock.
*/
-#define LM_FLAG_TRY 0x00000001
-#define LM_FLAG_TRY_1CB 0x00000002
-#define LM_FLAG_NOEXP 0x00000004
-#define LM_FLAG_ANY 0x00000008
-#define LM_FLAG_PRIORITY 0x00000010
-#define GL_ASYNC 0x00000040
-#define GL_EXACT 0x00000080
-#define GL_SKIP 0x00000100
-#define GL_NOCACHE 0x00000400
+#define LM_FLAG_TRY 0x0001
+#define LM_FLAG_TRY_1CB 0x0002
+#define LM_FLAG_NOEXP 0x0004
+#define LM_FLAG_ANY 0x0008
+#define LM_FLAG_PRIORITY 0x0010
+#define GL_ASYNC 0x0040
+#define GL_EXACT 0x0080
+#define GL_SKIP 0x0100
+#define GL_NOCACHE 0x0400
/*
* lm_async_cb return flags
@@ -141,7 +141,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
struct pid *pid;
/* Look in glock's list of holders for one with current task as owner */
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
@@ -151,7 +151,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
}
gh = NULL;
out:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return gh;
}
@@ -183,8 +183,8 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
int create, struct gfs2_glock **glp);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- unsigned flags, struct gfs2_holder *gh);
-extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
+ u16 flags, struct gfs2_holder *gh);
+extern void gfs2_holder_reinit(unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
@@ -195,7 +195,7 @@ extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
- unsigned int state, int flags,
+ unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
@@ -215,7 +215,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
*/
static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
- unsigned int state, int flags,
+ unsigned int state, u16 flags,
struct gfs2_holder *gh)
{
int error;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fa3fa5e..437fd73 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/posix_acl.h>
+#include <linux/security.h>
#include "gfs2.h"
#include "incore.h"
@@ -32,13 +33,15 @@ struct workqueue_struct *gfs2_freeze_wq;
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
- fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+ fs_err(gl->gl_name.ln_sbd,
+ "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
+ "state 0x%lx\n",
bh, (unsigned long long)bh->b_blocknr, bh->b_state,
bh->b_page->mapping, bh->b_page->flags);
- fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+ fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
- gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+ gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
}
/**
@@ -52,7 +55,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
unsigned int nr_revokes)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *head = &gl->gl_ail_list;
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
@@ -80,7 +83,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_trans tr;
memset(&tr, 0, sizeof(tr));
@@ -109,7 +112,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
int ret;
@@ -139,16 +142,16 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static void rgrp_go_sync(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd;
int error;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_rgrp_brelse(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return;
@@ -160,11 +163,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
mapping_set_error(mapping, error);
gfs2_ail_empty_gl(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_free_clones(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -179,7 +182,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gl->gl_object;
@@ -218,7 +221,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
- gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
+ gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
filemap_fdatawrite(metamapping);
if (ip) {
struct address_space *mapping = ip->i_inode.i_mapping;
@@ -252,7 +255,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_inode *ip = gl->gl_object;
- gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+ gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
if (flags & DIO_METADATA) {
struct address_space *mapping = gfs2_glock2aspace(gl);
@@ -260,13 +263,14 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
if (ip) {
set_bit(GIF_INVALID, &ip->i_flags);
forget_all_cached_acls(&ip->i_inode);
+ security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
}
}
- if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
- gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
- gl->gl_sbd->sd_rindex_uptodate = 0;
+ if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
+ gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH);
+ gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
}
if (ip && S_ISREG(ip->i_inode.i_mode))
truncate_inode_pages(ip->i_inode.i_mapping, 0);
@@ -281,7 +285,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
static int inode_go_demote_ok(const struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_holder *gh;
if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
@@ -416,7 +420,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
static int inode_go_lock(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
@@ -477,7 +481,7 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
static void freeze_go_sync(struct gfs2_glock *gl)
{
int error = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (gl->gl_state == LM_ST_SHARED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
@@ -500,7 +504,7 @@ static void freeze_go_sync(struct gfs2_glock *gl)
static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
struct gfs2_glock *j_gl = ip->i_gl;
struct gfs2_log_header_host head;
@@ -540,12 +544,12 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
*
- * gl_spin lock is held while calling this
+ * gl_lockref.lock lock is held while calling this
*/
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
return;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a1ec7c2..845fb09 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -22,6 +22,7 @@
#include <linux/ktime.h>
#include <linux/percpu.h>
#include <linux/lockref.h>
+#include <linux/rhashtable.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -203,13 +204,15 @@ enum {
};
struct lm_lockname {
+ struct gfs2_sbd *ln_sbd;
u64 ln_number;
unsigned int ln_type;
};
#define lm_name_equal(name1, name2) \
- (((name1)->ln_number == (name2)->ln_number) && \
- ((name1)->ln_type == (name2)->ln_type))
+ (((name1)->ln_number == (name2)->ln_number) && \
+ ((name1)->ln_type == (name2)->ln_type) && \
+ ((name1)->ln_sbd == (name2)->ln_sbd))
struct gfs2_glock_operations {
@@ -241,7 +244,7 @@ enum {
};
struct gfs2_lkstats {
- s64 stats[GFS2_NR_LKSTATS];
+ u64 stats[GFS2_NR_LKSTATS];
};
enum {
@@ -256,8 +259,8 @@ struct gfs2_holder {
struct gfs2_glock *gh_gl;
struct pid *gh_owner_pid;
- unsigned int gh_state;
- unsigned gh_flags;
+ u16 gh_flags;
+ u16 gh_state;
int gh_error;
unsigned long gh_iflags; /* HIF_... */
@@ -267,6 +270,13 @@ struct gfs2_holder {
/* Number of quota types we support */
#define GFS2_MAXQUOTAS 2
+struct gfs2_qadata { /* quota allocation data */
+ /* Quota stuff */
+ struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS];
+ struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS];
+ unsigned int qa_qd_num;
+};
+
/* Resource group multi-block reservation, in order of appearance:
Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -285,11 +295,6 @@ struct gfs2_blkreserv {
struct gfs2_rbm rs_rbm; /* Start of reservation */
u32 rs_free; /* how many blocks are still free */
u64 rs_inum; /* Inode number for reservation */
-
- /* ancillary quota stuff */
- struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
- struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
- unsigned int rs_qa_qd_num;
};
/*
@@ -327,21 +332,18 @@ enum {
struct gfs2_glock {
struct hlist_bl_node gl_list;
- struct gfs2_sbd *gl_sbd;
unsigned long gl_flags; /* GLF_... */
struct lm_lockname gl_name;
struct lockref gl_lockref;
-#define gl_spin gl_lockref.lock
- /* State fields protected by gl_spin */
+ /* State fields protected by gl_lockref.lock */
unsigned int gl_state:2, /* Current state */
gl_target:2, /* Target state */
gl_demote_state:2, /* State requested by remote node */
gl_req:2, /* State in last dlm request */
gl_reply:8; /* Last reply from the dlm */
- unsigned int gl_hash;
unsigned long gl_demote_time; /* time of first demote request */
long gl_hold_time;
struct list_head gl_holders;
@@ -367,7 +369,7 @@ struct gfs2_glock {
loff_t end;
} gl_vm;
};
- struct rcu_head gl_rcu;
+ struct rhash_head gl_node;
};
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -391,7 +393,8 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
- struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
+ struct gfs2_qadata *i_qadata; /* quota allocation data */
+ struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
@@ -402,6 +405,7 @@ struct gfs2_inode {
u32 i_diskflags;
u8 i_height;
u8 i_depth;
+ u16 i_rahead;
};
/*
@@ -558,6 +562,8 @@ struct gfs2_args {
unsigned int ar_errors:2; /* errors=withdraw | panic */
unsigned int ar_nobarrier:1; /* do not send barriers */
unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */
+ unsigned int ar_loccookie:1; /* use location based readdir
+ cookies */
int ar_commit; /* Commit interval */
int ar_statfs_quantum; /* The fast statfs interval */
int ar_quota_quantum; /* The quota interval */
@@ -685,6 +691,7 @@ struct gfs2_sbd {
u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
u32 sd_max_jheight; /* Max height of journaled file's meta tree */
u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+ u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
struct gfs2_args sd_args; /* Mount arguments */
struct gfs2_tune sd_tune; /* Filesystem tuning structure */
@@ -835,7 +842,7 @@ static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
{
- const struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
preempt_disable();
this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
preempt_enable();
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 063fdfc..3e94400 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -191,13 +191,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
fail_iopen:
if (io_gl)
gfs2_glock_put(io_gl);
fail_put:
ip->i_gl->gl_object = NULL;
- gfs2_glock_put(ip->i_gl);
fail:
iget_failed(inode);
return ERR_PTR(error);
@@ -593,7 +593,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- int error, free_vfs_inode = 0;
+ int error, free_vfs_inode = 1;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -601,7 +601,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
- error = gfs2_rs_alloc(dip);
+ error = gfs2_rsqa_alloc(dip);
if (error)
return error;
@@ -650,10 +650,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
- goto fail_free_vfs_inode;
+ goto fail_gunlock;
ip = GFS2_I(inode);
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto fail_free_acls;
@@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
ip->i_entries = 2;
break;
}
+
+ /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */
+ if (dip->i_diskflags & GFS2_DIF_SYSTEM)
+ ip->i_diskflags |= GFS2_DIF_SYSTEM;
+
gfs2_set_inode_flags(inode);
if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) ||
@@ -733,6 +738,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
gfs2_set_iop(inode);
insert_inode_hash(inode);
+ free_vfs_inode = 0; /* After this point, the inode is no longer
+ considered free. Any failures need to undo
+ the gfs2 structures. */
if (default_acl) {
error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
@@ -766,24 +774,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
return error;
fail_gunlock3:
- gfs2_glock_dq_uninit(ghs + 1);
- if (ip->i_gl)
- gfs2_glock_put(ip->i_gl);
- goto fail_gunlock;
-
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_put(io_gl);
fail_gunlock2:
gfs2_glock_dq_uninit(ghs + 1);
fail_free_inode:
if (ip->i_gl)
gfs2_glock_put(ip->i_gl);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rsqa_delete(ip, NULL);
fail_free_acls:
if (default_acl)
posix_acl_release(default_acl);
if (acl)
posix_acl_release(acl);
-fail_free_vfs_inode:
- free_vfs_inode = 1;
fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(ghs);
@@ -898,7 +901,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- error = gfs2_rs_alloc(dip);
+ error = gfs2_rsqa_alloc(dip);
if (error)
return error;
@@ -1371,7 +1374,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
return error;
- error = gfs2_rs_alloc(ndip);
+ error = gfs2_rsqa_alloc(ndip);
if (error)
return error;
@@ -1712,24 +1715,30 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
}
/**
- * gfs2_follow_link - Follow a symbolic link
+ * gfs2_get_link - Follow a symbolic link
* @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
+ * @inode: The inode of the link
+ * @done: destructor for return value
*
* This can handle symlinks of any size.
*
* Returns: 0 on success or error code
*/
-static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
+static const char *gfs2_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
struct buffer_head *dibh;
unsigned int size;
char *buf;
int error;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
error = gfs2_glock_nq(&i_gh);
if (error) {
@@ -1759,7 +1768,7 @@ static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
out:
gfs2_glock_dq_uninit(&i_gh);
if (!IS_ERR(buf))
- *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
return buf;
}
@@ -1854,11 +1863,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
ogid = ngid = NO_GID_QUOTA_CHANGE;
- error = get_write_access(inode);
- if (error)
- return error;
-
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto out;
@@ -1898,7 +1903,6 @@ out_end_trans:
out_gunlock_q:
gfs2_quota_unlock(ip);
out:
- put_write_access(inode);
return error;
}
@@ -1920,7 +1924,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct gfs2_holder i_gh;
int error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
@@ -2002,7 +2006,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret == 0)
ret = generic_setxattr(dentry, name, data, size, flags);
gfs2_glock_dq(&gh);
@@ -2043,7 +2047,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret == 0)
ret = generic_removexattr(dentry, name);
gfs2_glock_dq(&gh);
@@ -2132,8 +2136,7 @@ const struct inode_operations gfs2_dir_iops = {
const struct inode_operations gfs2_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = gfs2_follow_link,
- .put_link = kfree_put_link,
+ .get_link = gfs2_get_link,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 641383a..8b907c5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -31,7 +31,7 @@ extern struct workqueue_struct *gfs2_control_wq;
*
* @delta is the difference between the current rtt sample and the
* running average srtt. We add 1/8 of that to the srtt in order to
- * update the current srtt estimate. The varience estimate is a bit
+ * update the current srtt estimate. The variance estimate is a bit
* more complicated. We subtract the abs value of the @delta from
* the current variance estimate and add 1/4 of that to the running
* total.
@@ -50,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
s64 delta = sample - s->stats[index];
s->stats[index] += (delta >> 3);
index++;
- s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+ s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
}
/**
@@ -80,7 +80,7 @@ static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
preempt_disable();
rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
- lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */
gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */
preempt_enable();
@@ -108,7 +108,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
dstamp = gl->gl_dstamp;
gl->gl_dstamp = ktime_get_real();
irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
- lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */
gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */
preempt_enable();
@@ -253,7 +253,7 @@ static void gfs2_reverse_hex(char *c, u64 value)
static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
unsigned int flags)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
int req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
@@ -281,7 +281,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
static void gdlm_put_lock(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int lvb_needs_unlock = 0;
int error;
@@ -319,7 +319,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
static void gdlm_cancel(struct gfs2_glock *gl)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 536e7a6..0ff028c 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -716,6 +716,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
}
trace_gfs2_log_flush(sdp, 1);
+ if (type == SHUTDOWN_FLUSH)
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
sdp->sd_log_flush_head = sdp->sd_log_head;
sdp->sd_log_flush_wrapped = 0;
tr = sdp->sd_log_tr;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 2c1ae86..d5369a1 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -70,7 +70,7 @@ static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
static void maybe_release_space(struct gfs2_bufdata *bd)
{
struct gfs2_glock *gl = bd->bd_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_rgrpd *rgd = gl->gl_object;
unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
struct gfs2_bitmap *bi = rgd->rd_bits + index;
@@ -202,22 +202,22 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
*
*/
-static void gfs2_end_log_write(struct bio *bio, int error)
+static void gfs2_end_log_write(struct bio *bio)
{
struct gfs2_sbd *sdp = bio->bi_private;
struct bio_vec *bvec;
struct page *page;
int i;
- if (error) {
- sdp->sd_log_error = error;
- fs_err(sdp, "Error %d writing to log\n", error);
+ if (bio->bi_error) {
+ sdp->sd_log_error = bio->bi_error;
+ fs_err(sdp, "Error %d writing to log\n", bio->bi_error);
}
bio_for_each_segment_all(bvec, bio, i) {
page = bvec->bv_page;
if (page_has_buffers(page))
- gfs2_end_log_write_bh(sdp, bvec, error);
+ gfs2_end_log_write_bh(sdp, bvec, bio->bi_error);
else
mempool_free(page, gfs2_page_pool);
}
@@ -261,18 +261,11 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw)
static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
{
struct super_block *sb = sdp->sd_vfs;
- unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev);
struct bio *bio;
BUG_ON(sdp->sd_log_bio);
- while (1) {
- bio = bio_alloc(GFP_NOIO, nrvecs);
- if (likely(bio))
- break;
- nrvecs = max(nrvecs/2, 1U);
- }
-
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_end_io = gfs2_end_log_write;
@@ -585,7 +578,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
static void gfs2_meta_sync(struct gfs2_glock *gl)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error;
if (mapping == NULL)
@@ -595,7 +588,7 @@ static void gfs2_meta_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(mapping);
if (error)
- gfs2_io_error(gl->gl_sbd);
+ gfs2_io_error(gl->gl_name.ln_sbd);
}
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 241a399..f99f8e9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,9 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
- ip->i_res = NULL;
+ ip->i_qadata = NULL;
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_hash_cache = NULL;
}
@@ -50,7 +52,7 @@ static void gfs2_init_glock_once(void *foo)
struct gfs2_glock *gl = foo;
INIT_HLIST_BL_NODE(&gl->gl_list);
- spin_lock_init(&gl->gl_spin);
+ spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
@@ -112,7 +114,8 @@ static int __init init_gfs2_fs(void)
gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
sizeof(struct gfs2_inode),
0, SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT,
gfs2_init_inode_once);
if (!gfs2_inode_cachep)
goto fail;
@@ -135,10 +138,10 @@ static int __init init_gfs2_fs(void)
if (!gfs2_quotad_cachep)
goto fail;
- gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk",
- sizeof(struct gfs2_blkreserv),
+ gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata",
+ sizeof(struct gfs2_qadata),
0, 0, NULL);
- if (!gfs2_rsrv_cachep)
+ if (!gfs2_qadata_cachep)
goto fail;
register_shrinker(&gfs2_qd_shrinker);
@@ -193,8 +196,8 @@ fail_lru:
unregister_shrinker(&gfs2_qd_shrinker);
gfs2_glock_exit();
- if (gfs2_rsrv_cachep)
- kmem_cache_destroy(gfs2_rsrv_cachep);
+ if (gfs2_qadata_cachep)
+ kmem_cache_destroy(gfs2_qadata_cachep);
if (gfs2_quotad_cachep)
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -238,7 +241,7 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
mempool_destroy(gfs2_page_pool);
- kmem_cache_destroy(gfs2_rsrv_cachep);
+ kmem_cache_destroy(gfs2_qadata_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index b984a6e..e137d96 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -114,7 +114,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct page *page;
struct buffer_head *bh;
unsigned int shift;
@@ -187,6 +187,52 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
return bh;
}
+static void gfs2_meta_read_endio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh = page_buffers(page);
+ unsigned int len = bvec->bv_len;
+
+ while (bh_offset(bh) < bvec->bv_offset)
+ bh = bh->b_this_page;
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ len -= bh->b_size;
+ bh->b_end_io(bh, !bio->bi_error);
+ bh = next;
+ } while (bh && len);
+ }
+ bio_put(bio);
+}
+
+/*
+ * Submit several consecutive buffer head I/O requests as a single bio I/O
+ * request. (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+ struct buffer_head *bh = bhs[0];
+ struct bio *bio;
+ int i;
+
+ if (!num)
+ return;
+
+ bio = bio_alloc(GFP_NOIO, num);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ for (i = 0; i < num; i++) {
+ bh = bhs[i];
+ bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ }
+ bio->bi_end_io = gfs2_meta_read_endio;
+ submit_bio(rw, bio);
+}
+
/**
* gfs2_meta_read - Read a block from disk
* @gl: The glock covering the block
@@ -198,10 +244,11 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
*/
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- struct buffer_head **bhp)
+ int rahead, struct buffer_head **bhp)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- struct buffer_head *bh;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct buffer_head *bh, *bhs[2];
+ int num = 0;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
*bhp = NULL;
@@ -213,14 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
- return 0;
+ flags &= ~DIO_WAIT;
+ } else {
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ bhs[num++] = bh;
}
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
+
+ if (rahead) {
+ bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ } else {
+ bh->b_end_io = end_buffer_read_sync;
+ bhs[num++] = bh;
+ }
+ }
+
+ gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
+ bh = *bhp;
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh))) {
struct gfs2_trans *tr = current->journal_info;
@@ -341,8 +405,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head *bh;
int ret = 0;
u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+ int rahead = 0;
+
+ if (num == ip->i_no_addr)
+ rahead = ip->i_rahead;
- ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+ ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
brelse(bh);
ret = -EIO;
@@ -362,7 +430,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct buffer_head *first_bh, *bh;
u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
sdp->sd_sb.sb_bsize_shift;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index ac5d802..c5086c8 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -44,7 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
if (mapping->a_ops == &gfs2_meta_aops)
- return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+ return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
else if (mapping->a_ops == &gfs2_rgrp_aops)
return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
@@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- struct buffer_head **bhp);
+ int rahead, struct buffer_head **bhp);
extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
int create);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1e3a93f..dbed9e2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -171,14 +171,14 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
return -EINVAL;
}
-static void end_bio_io_page(struct bio *bio, int error)
+static void end_bio_io_page(struct bio *bio)
{
struct page *page = bio->bi_private;
- if (!error)
+ if (!bio->bi_error)
SetPageUptodate(page);
else
- pr_warn("error %d reading superblock\n", error);
+ pr_warn("error %d reading superblock\n", bio->bi_error);
unlock_page(page);
}
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_jheightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+ sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_leaf)) /
+ GFS2_MIN_DIRENT_SIZE;
return 0;
}
@@ -910,8 +913,7 @@ fail_qc_i:
fail_ut_i:
iput(sdp->sd_sc_inode);
fail:
- if (pn)
- iput(pn);
+ iput(pn);
return error;
}
@@ -1291,6 +1293,9 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
+ } else {
+ /* s_mode must be set before deactivate_locked_super calls */
+ s->s_mode = mode;
}
memset(&args, 0, sizeof(args));
@@ -1312,10 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
if ((flags ^ s->s_flags) & MS_RDONLY)
goto error_super;
} else {
- char b[BDEVNAME_SIZE];
-
- s->s_mode = mode;
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 9b61f92..be6d9c4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -119,7 +119,7 @@ static void gfs2_qd_dispose(struct list_head *list)
while (!list_empty(list)) {
qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
- sdp = qd->qd_gl->gl_sbd;
+ sdp = qd->qd_gl->gl_name.ln_sbd;
list_del(&qd->qd_lru);
@@ -302,7 +302,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
static void qd_hold(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
lockref_get(&qd->qd_lockref);
}
@@ -367,7 +367,7 @@ static void slot_put(struct gfs2_quota_data *qd)
static int bh_get(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
unsigned int block, offset;
struct buffer_head *bh;
@@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd)
error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
if (error)
goto fail;
- error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
if (error)
goto fail;
error = -EIO;
@@ -414,7 +414,7 @@ fail:
static void bh_put(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
mutex_lock(&sdp->sd_quota_mutex);
gfs2_assert(sdp, qd->qd_bh_count);
@@ -486,7 +486,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
static void qd_unlock(struct gfs2_quota_data *qd)
{
- gfs2_assert_warn(qd->qd_gl->gl_sbd,
+ gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd,
test_bit(QDF_LOCKED, &qd->qd_flags));
clear_bit(QDF_LOCKED, &qd->qd_flags);
bh_put(qd);
@@ -527,37 +527,70 @@ static void qdsb_put(struct gfs2_quota_data *qd)
qd_put(qd);
}
+/**
+ * gfs2_qa_alloc - make sure we have a quota allocations data structure,
+ * if necessary
+ * @ip: the inode for this reservation
+ */
+int gfs2_qa_alloc(struct gfs2_inode *ip)
+{
+ int error = 0;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ down_write(&ip->i_rw_mutex);
+ if (ip->i_qadata == NULL) {
+ ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+ if (!ip->i_qadata)
+ error = -ENOMEM;
+ }
+ up_write(&ip->i_rw_mutex);
+ return error;
+}
+
+void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+{
+ down_write(&ip->i_rw_mutex);
+ if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+ kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
+ ip->i_qadata = NULL;
+ }
+ up_write(&ip->i_rw_mutex);
+}
+
int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data **qd;
int error;
- if (ip->i_res == NULL) {
- error = gfs2_rs_alloc(ip);
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ if (ip->i_qadata == NULL) {
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
}
- qd = ip->i_res->rs_qa_qd;
+ qd = ip->i_qadata->qa_qd;
- if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) ||
+ if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) ||
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
return -EIO;
- if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
- return 0;
-
error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
@@ -565,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -574,7 +607,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -587,17 +620,17 @@ out:
void gfs2_quota_unhold(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int x;
+ u32 x;
- if (ip->i_res == NULL)
+ if (ip->i_qadata == NULL)
return;
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qdsb_put(ip->i_res->rs_qa_qd[x]);
- ip->i_res->rs_qa_qd[x] = NULL;
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qdsb_put(ip->i_qadata->qa_qd[x]);
+ ip->i_qadata->qa_qd[x] = NULL;
}
- ip->i_res->rs_qa_qd_num = 0;
+ ip->i_qadata->qa_qd_num = 0;
}
static int sort_qd(const void *a, const void *b)
@@ -614,7 +647,7 @@ static int sort_qd(const void *a, const void *b)
static void do_qc(struct gfs2_quota_data *qd, s64 change)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
struct gfs2_quota_change *qc = qd->qd_bh_qc;
s64 x;
@@ -831,7 +864,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
- struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks, ind_blocks;
@@ -843,7 +876,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int nalloc = 0, blocks;
int error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
@@ -922,7 +955,7 @@ out:
gfs2_glock_dq_uninit(&ghs[qx]);
mutex_unlock(&ip->i_inode.i_mutex);
kfree(ghs);
- gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
+ gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
return error;
}
@@ -954,7 +987,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder *q_gh)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_holder i_gh;
int error;
@@ -1003,23 +1036,23 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
- unsigned int x;
+ u32 x;
int error = 0;
- error = gfs2_quota_hold(ip, uid, gid);
- if (error)
- return error;
-
if (capable(CAP_SYS_RESOURCE) ||
sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num,
+ error = gfs2_quota_hold(ip, uid, gid);
+ if (error)
+ return error;
+
+ sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num,
sizeof(struct gfs2_quota_data *), sort_qd, NULL);
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
- error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
+ error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]);
if (error)
break;
}
@@ -1028,7 +1061,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
set_bit(GIF_QD_LOCKED, &ip->i_flags);
else {
while (x--)
- gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
gfs2_quota_unhold(ip);
}
@@ -1037,7 +1070,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
static int need_sync(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_tune *gt = &sdp->sd_tune;
s64 value;
unsigned int num, den;
@@ -1076,20 +1109,20 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qda[4];
unsigned int count = 0;
- unsigned int x;
+ u32 x;
int found;
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
goto out;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
int sync;
- qd = ip->i_res->rs_qa_qd[x];
+ qd = ip->i_qadata->qa_qd[x];
sync = need_sync(qd);
- gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
if (!sync)
continue;
@@ -1125,7 +1158,7 @@ out:
static int print_message(struct gfs2_quota_data *qd, char *type)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
fs_info(sdp, "quota %s for %s %u\n",
type,
@@ -1158,7 +1191,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
s64 value, warn, limit;
- unsigned int x;
+ u32 x;
int error = 0;
ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
@@ -1168,8 +1201,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))))
@@ -1216,15 +1249,17 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
kuid_t uid, kgid_t gid)
{
struct gfs2_quota_data *qd;
- unsigned int x;
+ u32 x;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON ||
+ gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))) {
@@ -1635,7 +1670,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (error)
return error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto out_put;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index ad04b3ac..5e47c93 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -18,6 +18,8 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
+extern int gfs2_qa_alloc(struct gfs2_inode *ip);
+extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c6c6232..07c0265 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -596,27 +596,13 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_rs_alloc - make sure we have a reservation assigned to the inode
+ * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
+ * plus a quota allocations data structure, if necessary
* @ip: the inode for this reservation
*/
-int gfs2_rs_alloc(struct gfs2_inode *ip)
+int gfs2_rsqa_alloc(struct gfs2_inode *ip)
{
- int error = 0;
-
- down_write(&ip->i_rw_mutex);
- if (ip->i_res)
- goto out;
-
- ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
- if (!ip->i_res) {
- error = -ENOMEM;
- goto out;
- }
-
- RB_CLEAR_NODE(&ip->i_res->rs_node);
-out:
- up_write(&ip->i_rw_mutex);
- return error;
+ return gfs2_qa_alloc(ip);
}
static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -678,21 +664,20 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
}
/**
- * gfs2_rs_delete - delete a multi-block reservation
+ * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
* @ip: The inode for this reservation
* @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
down_write(&ip->i_rw_mutex);
- if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
- gfs2_rs_deltree(ip->i_res);
- BUG_ON(ip->i_res->rs_free);
- kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
- ip->i_res = NULL;
+ if ((wcount == NULL) || (atomic_read(wcount) <= 1)) {
+ gfs2_rs_deltree(&ip->i_res);
+ BUG_ON(ip->i_res.rs_free);
}
up_write(&ip->i_rw_mutex);
+ gfs2_qa_delete(ip, wcount);
}
/**
@@ -729,9 +714,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_object = NULL;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
@@ -933,8 +918,9 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_gl->gl_object = rgd;
- rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
- rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
+ rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK;
+ rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr +
+ rgd->rd_length) * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1157,7 +1143,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
for (x = 0; x < length; x++) {
bi = rgd->rd_bits + x;
- error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
+ error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
if (error)
goto fail;
}
@@ -1455,7 +1441,7 @@ static void rs_insert(struct gfs2_inode *ip)
{
struct rb_node **newn, *parent = NULL;
int rc;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
@@ -1502,7 +1488,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
{
struct gfs2_rbm rbm = { .rgd = rgd, };
u64 goal;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
u32 extlen;
u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
int ret;
@@ -1573,7 +1559,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
}
if (n) {
- while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
+ while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
n = n->rb_right;
if (n == NULL)
@@ -1803,7 +1789,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
continue;
*last_unlinked = block;
- error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
+ error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
if (error)
continue;
@@ -1860,13 +1846,13 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
{
const struct gfs2_glock *gl = rgd->rd_gl;
- const struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_lkstats *st;
- s64 r_dcount, l_dcount;
- s64 l_srttb, a_srttb = 0;
+ u64 r_dcount, l_dcount;
+ u64 l_srttb, a_srttb = 0;
s64 srttb_diff;
- s64 sqr_diff;
- s64 var;
+ u64 sqr_diff;
+ u64 var;
int cpu, nonzero = 0;
preempt_disable();
@@ -1983,7 +1969,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *begin = NULL;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
int error = 0, rg_locked, flags = 0;
u64 last_unlinked = NO_BLOCK;
int loops = 0;
@@ -2112,7 +2098,7 @@ next_rgrp:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
if (rs->rs_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2266,7 +2252,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
static void gfs2_adjust_reservation(struct gfs2_inode *ip,
const struct gfs2_rbm *rbm, unsigned len)
{
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
struct gfs2_rgrpd *rgd = rbm->rgd;
unsigned rlen;
u64 block;
@@ -2309,8 +2295,8 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
{
u64 goal;
- if (gfs2_rs_active(ip->i_res)) {
- *rbm = ip->i_res->rs_rbm;
+ if (gfs2_rs_active(&ip->i_res)) {
+ *rbm = ip->i_res.rs_rbm;
return;
}
@@ -2364,7 +2350,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
gfs2_alloc_extent(&rbm, dinode, nblocks);
block = gfs2_rbm_to_block(&rbm);
rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
- if (gfs2_rs_active(ip->i_res))
+ if (gfs2_rs_active(&ip->i_res))
gfs2_adjust_reservation(ip, &rbm, *nblocks);
ndata = *nblocks;
if (dinode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index c0ab33f..66b51cf 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -49,9 +49,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
-extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
@@ -78,7 +78,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
extern int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is in the rgrp tree: */
-static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
+static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
{
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445..8f960a5 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
Opt_nobarrier,
Opt_rgrplvb,
Opt_norgrplvb,
+ Opt_loccookie,
+ Opt_noloccookie,
Opt_error,
};
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_rgrplvb, "rgrplvb"},
{Opt_norgrplvb, "norgrplvb"},
+ {Opt_loccookie, "loccookie"},
+ {Opt_noloccookie, "noloccookie"},
{Opt_error, NULL}
};
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
case Opt_norgrplvb:
args->ar_rgrplvb = 0;
break;
+ case Opt_loccookie:
+ args->ar_loccookie = 1;
+ break;
+ case Opt_noloccookie:
+ args->ar_loccookie = 0;
+ break;
case Opt_error:
default:
pr_warn("invalid mount option: %s\n", o);
@@ -556,6 +566,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+ gfs2_trans_add_meta(m_ip->i_gl, m_bh);
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
@@ -564,10 +575,8 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
memset(l_bh->b_data + sizeof(struct gfs2_dinode),
0, sizeof(struct gfs2_statfs_change));
- spin_unlock(&sdp->sd_statfs_spin);
-
- gfs2_trans_add_meta(m_ip->i_gl, m_bh);
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+ spin_unlock(&sdp->sd_statfs_spin);
}
int gfs2_statfs_sync(struct super_block *sb, int type)
@@ -842,10 +851,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
- down_write(&sdp->sd_log_flush_lock);
- clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- up_write(&sdp->sd_log_flush_lock);
-
gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
@@ -1334,11 +1339,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
if (is_ancestor(root, sdp->sd_master_dir))
seq_puts(s, ",meta");
if (args->ar_lockproto[0])
- seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+ seq_show_option(s, "lockproto", args->ar_lockproto);
if (args->ar_locktable[0])
- seq_printf(s, ",locktable=%s", args->ar_locktable);
+ seq_show_option(s, "locktable", args->ar_locktable);
if (args->ar_hostdata[0])
- seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+ seq_show_option(s, "hostdata", args->ar_hostdata);
if (args->ar_spectator)
seq_puts(s, ",spectator");
if (args->ar_localflocks)
@@ -1419,6 +1424,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",demote_interface_used");
if (args->ar_rgrplvb)
seq_puts(s, ",rgrplvb");
+ if (args->ar_loccookie)
+ seq_puts(s, ",loccookie");
return 0;
}
@@ -1512,6 +1519,7 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
+ struct address_space *metamapping;
int error;
if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
@@ -1526,7 +1534,8 @@ static void gfs2_evict_inode(struct inode *inode)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1575,8 +1584,8 @@ static void gfs2_evict_inode(struct inode *inode)
out_truncate:
gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+ metamapping = gfs2_glock2aspace(ip->i_gl);
if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
- struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
filemap_fdatawrite(metamapping);
filemap_fdatawait(metamapping);
}
@@ -1589,16 +1598,17 @@ out_truncate:
goto out_unlock;
/* Needs to be done before glock release & also in a transaction */
truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(metamapping, 0);
gfs2_trans_end(sdp);
out_unlock:
/* Error path for case 1 */
- if (gfs2_rs_active(ip->i_res))
- gfs2_rs_deltree(ip->i_res);
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_deltree(&ip->i_res);
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
}
gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
@@ -1607,7 +1617,7 @@ out_unlock:
out:
/* Case 3 starts here */
truncate_inode_pages_final(&inode->i_data);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rsqa_delete(ip, NULL);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
@@ -1619,7 +1629,8 @@ out:
if (ip->i_iopen_gh.gh_gl) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
}
}
@@ -1632,7 +1643,9 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
ip->i_flags = 0;
ip->i_gl = NULL;
ip->i_rgd = NULL;
- ip->i_res = NULL;
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
+ ip->i_rahead = 0;
}
return &ip->i_inode;
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 20c007d..49ac55d 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -104,7 +104,7 @@ TRACE_EVENT(gfs2_glock_state_change,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gl->gl_name.ln_number;
__entry->gltype = gl->gl_name.ln_type;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -140,7 +140,7 @@ TRACE_EVENT(gfs2_glock_put,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -174,7 +174,7 @@ TRACE_EVENT(gfs2_demote_rq,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -209,7 +209,7 @@ TRACE_EVENT(gfs2_promote,
),
TP_fast_assign(
- __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
__entry->first = first;
@@ -239,7 +239,7 @@ TRACE_EVENT(gfs2_glock_queue,
),
TP_fast_assign(
- __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
__entry->queue = queue;
@@ -267,18 +267,18 @@ TRACE_EVENT(gfs2_glock_lock_time,
__field( int, status )
__field( char, flags )
__field( s64, tdiff )
- __field( s64, srtt )
- __field( s64, srttvar )
- __field( s64, srttb )
- __field( s64, srttvarb )
- __field( s64, sirt )
- __field( s64, sirtvar )
- __field( s64, dcount )
- __field( s64, qcount )
+ __field( u64, srtt )
+ __field( u64, srttvar )
+ __field( u64, srttb )
+ __field( u64, srttvarb )
+ __field( u64, sirt )
+ __field( u64, sirtvar )
+ __field( u64, dcount )
+ __field( u64, qcount )
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gl->gl_name.ln_number;
__entry->gltype = gl->gl_name.ln_type;
__entry->status = gl->gl_lksb.sb_status;
@@ -333,7 +333,7 @@ TRACE_EVENT(gfs2_pin,
),
TP_fast_assign(
- __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->pin = pin;
__entry->len = bd->bd_bh->b_size;
__entry->block = bd->bd_bh->b_blocknr;
@@ -449,7 +449,7 @@ TRACE_EVENT(gfs2_bmap,
),
TP_fast_assign(
- __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->lblock = lblock;
__entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0;
__entry->inum = ip->i_no_addr;
@@ -489,7 +489,7 @@ TRACE_EVENT(gfs2_block_alloc,
),
TP_fast_assign(
- __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->start = block;
__entry->inum = ip->i_no_addr;
__entry->len = len;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 88bff24..0c1bde3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -158,7 +158,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
{
struct gfs2_trans *tr = current->journal_info;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = bh->b_page->mapping;
struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_bufdata *bd;
@@ -176,6 +176,8 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
unlock_buffer(bh);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ else
+ bd = bh->b_private;
lock_buffer(bh);
gfs2_log_lock(sdp);
}
@@ -224,7 +226,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_bufdata *bd;
lock_buffer(bh);
@@ -236,6 +238,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
lock_page(bh->b_page);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ else
+ bd = bh->b_private;
unlock_page(bh->b_page);
lock_buffer(bh);
gfs2_log_lock(sdp);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 86d2035..cf64583 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -27,7 +27,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
struct kmem_cache *gfs2_quotad_cachep __read_mostly;
-struct kmem_cache *gfs2_rsrv_cachep __read_mostly;
+struct kmem_cache *gfs2_qadata_cachep __read_mostly;
mempool_t *gfs2_page_pool __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cbdcbdf..c81295f 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -149,7 +149,7 @@ extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
extern struct kmem_cache *gfs2_rgrpd_cachep;
extern struct kmem_cache *gfs2_quotad_cachep;
-extern struct kmem_cache *gfs2_rsrv_cachep;
+extern struct kmem_cache *gfs2_qadata_cachep;
extern mempool_t *gfs2_page_pool;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa..e8dfb47 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
__be64 *eablk, *end;
int error;
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
if (error)
return error;
@@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
break;
bn = be64_to_cpu(*eablk);
- error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+ error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
if (error)
break;
error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
return -ENOMEM;
for (x = 0; x < nptrs; x++) {
- error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+ error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
bh + x);
if (error) {
while (x--)
@@ -583,11 +583,13 @@ out:
*
* Returns: actual size of data on success, -errno on error
*/
-static int gfs2_xattr_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_ea_location el;
+ int type = handler->flags;
int error;
if (!ip->i_eattr)
@@ -977,7 +979,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
__be64 *end;
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
&indbh);
if (error)
return error;
@@ -1227,61 +1229,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
return error;
}
-static int gfs2_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int gfs2_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
return __gfs2_xattr_set(d_inode(dentry), name, value,
- size, flags, type);
-}
-
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
- struct gfs2_ea_header *ea, char *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int amount = GFS2_EA_DATA_LEN(ea);
- unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
- int ret;
-
- ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
- if (ret)
- return ret;
-
- ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
- gfs2_trans_end(sdp);
-
- return ret;
-}
-
-int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-{
- struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_ea_location el;
- int error;
-
- error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
- if (error)
- return error;
-
- if (GFS2_EA_IS_STUFFED(el.el_ea)) {
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
- if (error == 0) {
- gfs2_trans_add_meta(ip->i_gl, el.el_bh);
- memcpy(GFS2_EA2DATA(el.el_ea), data,
- GFS2_EA_DATA_LEN(el.el_ea));
- }
- } else {
- error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
- }
-
- brelse(el.el_bh);
- if (error)
- return error;
-
- error = gfs2_setattr_simple(inode, attr);
- gfs2_trans_end(sdp);
- return error;
+ size, flags, handler->flags);
}
static int ea_dealloc_indirect(struct gfs2_inode *ip)
@@ -1303,7 +1256,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
if (error)
return error;
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index d392f83..2d887c8 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd..221719e 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -398,11 +397,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
- //int i;
+ int i;
- //for (i = 0; i < node->tree->pages_per_bnode; i++)
- // if (node->page[i])
- // page_cache_release(node->page[i]);
+ for (i = 0; i < node->tree->pages_per_bnode; i++)
+ if (node->page[i])
+ page_cache_release(node->page[i]);
kfree(node);
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f..6fc766d 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -366,6 +366,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index aa3f0d6..a3ec3ae 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -166,7 +166,7 @@ int hfs_mdb_get(struct super_block *sb)
pr_warn("continuing without an alternate MDB\n");
}
- HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
+ HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
if (!HFS_SB(sb)->bitmap)
goto out;
@@ -360,7 +360,7 @@ void hfs_mdb_put(struct super_block *sb)
unload_nls(HFS_SB(sb)->nls_io);
unload_nls(HFS_SB(sb)->nls_disk);
- free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
+ kfree(HFS_SB(sb)->bitmap);
kfree(HFS_SB(sb));
sb->s_fs_info = NULL;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9..1ca95c2 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+ seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
seq_printf(seq, ",uid=%u,gid=%u",
from_kuid_munged(&init_user_ns, sbi->s_uid),
from_kgid_munged(&init_user_ns, sbi->s_gid));
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
int err;
hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
- sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
- hfs_init_once);
+ sizeof(struct hfs_inode_info), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
if (!hfs_inode_cachep)
return -ENOMEM;
err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 759708f..6392466 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -566,13 +565,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
-#if 0
int i;
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
page_cache_release(node->page[i]);
-#endif
kfree(node);
}
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6dd107d..19b33f8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
} else if (S_ISLNK(inode->i_mode)) {
sbi->file_count++;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
hip->clump_blocks = 1;
} else
@@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_mapping->a_ops = &hfsplus_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
} else {
init_special_inode(inode, inode->i_mode,
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72e..bb806e5 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
if (sbi->type != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+ seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
from_kuid_munged(&init_user_ns, sbi->uid),
from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index df0c9af..afb33ed 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
@@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
err = posix_acl_equiv_mode(acl, &inode->i_mode);
if (err < 0)
@@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96..5d54490 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
int err;
hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
- HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+ HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
hfsplus_init_once);
if (!hfsplus_inode_cachep)
return -ENOMEM;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 416b1db..ab01530 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
char *xattr_name;
int res;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
@@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
int res;
char *xattr_name;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
@@ -849,12 +843,10 @@ end_removexattr:
return err;
}
-static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow retrieving properly prefixed attributes
* by prepending them with "osx."
@@ -871,12 +863,10 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
}
-static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow setting properly prefixed attributes
* by prepending them with "osx."
@@ -893,19 +883,8 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
}
-static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_osx_handler = {
.prefix = XATTR_MAC_OSX_PREFIX,
- .list = hfsplus_osx_listxattr,
.get = hfsplus_osx_getxattr,
.set = hfsplus_osx_setxattr,
};
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index aacff00..72a68a3 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -13,32 +13,24 @@
#include "xattr.h"
#include "acl.h"
-static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
static int hfsplus_initxattrs(struct inode *inode,
const struct xattr *xattr_array,
void *fs_info)
@@ -92,7 +84,6 @@ int hfsplus_init_inode_security(struct inode *inode,
const struct xattr_handler hfsplus_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = hfsplus_security_listxattr,
.get = hfsplus_security_getxattr,
.set = hfsplus_security_setxattr,
};
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index bcf65089..95a7704 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN);
}
-static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = hfsplus_trusted_listxattr,
.get = hfsplus_trusted_getxattr,
.set = hfsplus_trusted_setxattr,
};
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 5aa0e6d..6fc269b 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = hfsplus_user_listxattr,
.get = hfsplus_user_getxattr,
.set = hfsplus_user_setxattr,
};
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b..cfaa18c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+ hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
- seq_printf(seq, ",%s", root_path + offset);
+ seq_show_option(seq, root_path + offset, NULL);
if (append)
seq_puts(seq, ",append");
@@ -730,15 +730,13 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
init_special_inode(inode, mode, dev);
err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
- if (!err)
+ if (err)
goto out_free;
err = read_name(inode, name);
__putname(name);
if (err)
goto out_put;
- if (err)
- goto out_put;
d_instantiate(dentry, inode);
return 0;
@@ -892,9 +890,14 @@ static const struct inode_operations hostfs_dir_iops = {
.setattr = hostfs_setattr,
};
-static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *hostfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- char *link = __getname();
+ char *link;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ link = kmalloc(PATH_MAX, GFP_KERNEL);
if (link) {
char *path = dentry_name(dentry);
int err = -ENOMEM;
@@ -905,25 +908,20 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
__putname(path);
}
if (err < 0) {
- __putname(link);
+ kfree(link);
return ERR_PTR(err);
}
} else {
return ERR_PTR(-ENOMEM);
}
- return *cookie = link;
-}
-
-static void hostfs_put_link(struct inode *unused, void *cookie)
-{
- __putname(cookie);
+ set_delayed_call(done, kfree_link, link);
+ return link;
}
static const struct inode_operations hostfs_link_iops = {
.readlink = generic_readlink,
- .follow_link = hostfs_follow_link,
- .put_link = hostfs_put_link,
+ .get_link = hostfs_get_link,
};
static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index 8057fe4..f626114 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -10,6 +10,30 @@
#include <linux/blkdev.h>
#include "hpfs_fn.h"
+secno hpfs_search_hotfix_map(struct super_block *s, secno sec)
+{
+ unsigned i;
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ for (i = 0; unlikely(i < sbi->n_hotfixes); i++) {
+ if (sbi->hotfix_from[i] == sec) {
+ return sbi->hotfix_to[i];
+ }
+ }
+ return sec;
+}
+
+unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n)
+{
+ unsigned i;
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ for (i = 0; unlikely(i < sbi->n_hotfixes); i++) {
+ if (sbi->hotfix_from[i] >= sec && sbi->hotfix_from[i] < sec + n) {
+ n = sbi->hotfix_from[i] - sec;
+ }
+ }
+ return n;
+}
+
void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
{
struct buffer_head *bh;
@@ -18,6 +42,9 @@ void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
return;
+ if (unlikely(hpfs_search_hotfix_map_for_range(s, secno, n) != n))
+ return;
+
bh = sb_find_get_block(s, secno);
if (bh) {
if (buffer_uptodate(bh)) {
@@ -51,7 +78,7 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
cond_resched();
- *bhp = bh = sb_bread(s, secno);
+ *bhp = bh = sb_bread(s, hpfs_search_hotfix_map(s, secno));
if (bh != NULL)
return bh->b_data;
else {
@@ -71,7 +98,7 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head
cond_resched();
- if ((*bhp = bh = sb_getblk(s, secno)) != NULL) {
+ if ((*bhp = bh = sb_getblk(s, hpfs_search_hotfix_map(s, secno))) != NULL) {
if (!buffer_uptodate(bh)) wait_on_buffer(bh);
set_buffer_uptodate(bh);
return bh->b_data;
@@ -99,10 +126,10 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
hpfs_prefetch_sectors(s, secno, 4 + ahead);
- if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0;
- if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1;
- if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2;
- if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3;
+ if (!hpfs_map_sector(s, secno + 0, &qbh->bh[0], 0)) goto bail0;
+ if (!hpfs_map_sector(s, secno + 1, &qbh->bh[1], 0)) goto bail1;
+ if (!hpfs_map_sector(s, secno + 2, &qbh->bh[2], 0)) goto bail2;
+ if (!hpfs_map_sector(s, secno + 3, &qbh->bh[3], 0)) goto bail3;
if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 7ca28d6..d3bcdd9 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -83,6 +83,11 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
if (s) {
if (bh_result->b_size >> 9 < n_secs)
n_secs = bh_result->b_size >> 9;
+ n_secs = hpfs_search_hotfix_map_for_range(inode->i_sb, s, n_secs);
+ if (unlikely(!n_secs)) {
+ s = hpfs_search_hotfix_map(inode->i_sb, s);
+ n_secs = 1;
+ }
map_bh(bh_result, inode->i_sb, s);
bh_result->b_size = n_secs << 9;
goto ret_0;
@@ -101,7 +106,7 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
inode->i_blocks++;
hpfs_i(inode)->mmu_private += 512;
set_buffer_new(bh_result);
- map_bh(bh_result, inode->i_sb, s);
+ map_bh(bh_result, inode->i_sb, hpfs_search_hotfix_map(inode->i_sb, s));
ret_0:
r = 0;
ret_r:
@@ -181,7 +186,7 @@ static int hpfs_write_end(struct file *file, struct address_space *mapping,
static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
{
- return generic_block_bmap(mapping,block,hpfs_get_block);
+ return generic_block_bmap(mapping, block, hpfs_get_block);
}
const struct address_space_operations hpfs_aops = {
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index c4867b5..975654a 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -88,6 +88,10 @@ struct hpfs_sb_info {
unsigned sb_max_fwd_alloc; /* max forwad allocation */
int sb_timeshift;
struct rcu_head rcu;
+
+ unsigned n_hotfixes;
+ secno hotfix_from[256];
+ secno hotfix_to[256];
};
/* Four 512-byte buffers and the 2k block obtained by concatenating them */
@@ -217,6 +221,8 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
/* buffer.c */
+secno hpfs_search_hotfix_map(struct super_block *s, secno sec);
+unsigned hpfs_search_hotfix_map_for_range(struct super_block *s, secno sec, unsigned n);
void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
@@ -285,6 +291,7 @@ __le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head
void hpfs_prefetch_bitmap(struct super_block *, unsigned);
unsigned char *hpfs_load_code_page(struct super_block *, secno);
__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
+void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock);
struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 933c737..1f3c6d7 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i)
kfree(ea);
i->i_mode = S_IFLNK | 0777;
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &hpfs_symlink_aops;
set_nlink(i, 1);
i->i_size = ea_size;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 442770e..a136929 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -130,6 +130,32 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
return b;
}
+void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock)
+{
+ struct quad_buffer_head qbh;
+ __le32 *directory;
+ u32 n_hotfixes, n_used_hotfixes;
+ unsigned i;
+
+ n_hotfixes = le32_to_cpu(spareblock->n_spares);
+ n_used_hotfixes = le32_to_cpu(spareblock->n_spares_used);
+
+ if (n_hotfixes > 256 || n_used_hotfixes > n_hotfixes) {
+ hpfs_error(s, "invalid number of hotfixes: %u, used: %u", n_hotfixes, n_used_hotfixes);
+ return;
+ }
+ if (!(directory = hpfs_map_4sectors(s, le32_to_cpu(spareblock->hotfix_map), &qbh, 0))) {
+ hpfs_error(s, "can't load hotfix map");
+ return;
+ }
+ for (i = 0; i < n_used_hotfixes; i++) {
+ hpfs_sb(s)->hotfix_from[i] = le32_to_cpu(directory[i]);
+ hpfs_sb(s)->hotfix_to[i] = le32_to_cpu(directory[n_hotfixes + i]);
+ }
+ hpfs_sb(s)->n_hotfixes = n_used_hotfixes;
+ hpfs_brelse4(&qbh);
+}
+
/*
* Load fnode to memory
*/
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a0872f2..506765a 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -8,6 +8,17 @@
#include <linux/sched.h>
#include "hpfs_fn.h"
+static void hpfs_update_directory_times(struct inode *dir)
+{
+ time_t t = get_seconds();
+ if (t == dir->i_mtime.tv_sec &&
+ t == dir->i_ctime.tv_sec)
+ return;
+ dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t;
+ dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0;
+ hpfs_write_inode_nolock(dir);
+}
+
static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
const unsigned char *name = dentry->d_name.name;
@@ -99,6 +110,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
result->i_mode = mode | S_IFDIR;
hpfs_write_inode_nolock(result);
}
+ hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
hpfs_unlock(dir->i_sb);
return 0;
@@ -187,6 +199,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, b
result->i_mode = mode | S_IFREG;
hpfs_write_inode_nolock(result);
}
+ hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
hpfs_unlock(dir->i_sb);
return 0;
@@ -214,8 +227,6 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
int err;
if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
- if (!new_valid_dev(rdev))
- return -EINVAL;
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -262,6 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
insert_inode_hash(result);
hpfs_write_inode_nolock(result);
+ hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
brelse(bh);
hpfs_unlock(dir->i_sb);
@@ -320,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
result->i_blocks = 1;
set_nlink(result, 1);
result->i_size = strlen(symlink);
+ inode_nohighmem(result);
result->i_op = &page_symlink_inode_operations;
result->i_data.a_ops = &hpfs_symlink_aops;
@@ -340,6 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
insert_inode_hash(result);
hpfs_write_inode_nolock(result);
+ hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
hpfs_unlock(dir->i_sb);
return 0;
@@ -423,6 +437,8 @@ again:
out1:
hpfs_brelse4(&qbh);
out:
+ if (!err)
+ hpfs_update_directory_times(dir);
hpfs_unlock(dir->i_sb);
return err;
}
@@ -477,13 +493,15 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
out1:
hpfs_brelse4(&qbh);
out:
+ if (!err)
+ hpfs_update_directory_times(dir);
hpfs_unlock(dir->i_sb);
return err;
}
static int hpfs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct inode *i = page->mapping->host;
struct fnode *fnode;
struct buffer_head *bh;
@@ -499,14 +517,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
goto fail;
hpfs_unlock(i->i_sb);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
hpfs_unlock(i->i_sb);
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
@@ -595,7 +611,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end1;
}
- end:
+end:
hpfs_i(i)->i_parent_dir = new_dir->i_ino;
if (S_ISDIR(i->i_mode)) {
inc_nlink(new_dir);
@@ -610,6 +626,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
brelse(bh);
}
end1:
+ if (!err) {
+ hpfs_update_directory_times(old_dir);
+ hpfs_update_directory_times(new_dir);
+ }
hpfs_unlock(i->i_sb);
return err;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 68a9bed0..458cf46 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
sizeof(struct hpfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (hpfs_inode_cachep == NULL)
return -ENOMEM;
@@ -628,6 +628,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
goto bail4;
}
+ if (spareblock->n_spares_used)
+ hpfs_load_hotfix_map(s, spareblock);
+
/* Load bitmap directory */
if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
goto bail4;
@@ -647,18 +650,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
mark_buffer_dirty(bh2);
}
- if (spareblock->hotfixes_used || spareblock->n_spares_used) {
- if (errs >= 2) {
- pr_err("Hotfixes not supported here, try chkdsk\n");
- mark_dirty(s, 0);
- goto bail4;
- }
- hpfs_error(s, "hotfixes not supported here, try chkdsk");
- if (errs == 0)
- pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n");
- else
- pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n");
- }
if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) {
if (errs >= 2) {
pr_err("Spare dnodes used, try chkdsk\n");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 973c24c..8bbf7f3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,14 +4,15 @@
* Nadia Yvette Chambers, 2002
*
* Copyright (C) 2002 Linus Torvalds.
+ * License: GPL
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+ mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
@@ -293,54 +317,178 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+ struct vm_area_struct *vma;
+
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+ unsigned long v_offset;
+ unsigned long v_end;
+
+ /*
+ * Can the expression below overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ v_offset = 0;
+
+ if (!end)
+ v_end = vma->vm_end;
+ else {
+ v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+ + vma->vm_start;
+ if (v_end > vma->vm_end)
+ v_end = vma->vm_end;
+ }
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+ NULL);
+ }
+}
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+ *
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * maps and global counts. Page faults can not race with truncation
+ * in this routine. hugetlb_no_page() prevents page faults in the
+ * truncated range. It checks i_size before allocation, and again after
+ * with the page table lock for the page held. The same lock must be
+ * acquired to unmap a page.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv map for ranges without associated
+ * pages are not modified. Page faults can race with hole punch.
+ * This is indicated if we find a mapped page.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- if (next == start)
- break;
- next = start;
- continue;
- }
+ while (next < end) {
+ /*
+ * Don't grab more pages than the number left in the range.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * When no more pages are found, we are done.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+ break;
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ bool rsv_on_error;
+ u32 hash;
+
+ /*
+ * The page (index) could be beyond end. This is
+ * only possible in the punch hole case as end is
+ * max page offset in the truncate case.
+ */
+ next = page->index;
+ if (next >= end)
+ break;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ *
+ * This race can only happen in the hole punch case.
+ * Getting here in a truncate operation is a bug.
+ */
+ if (unlikely(page_mapped(page))) {
+ BUG_ON(truncate_op);
+
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ next * pages_per_huge_page(h),
+ (next + 1) * pages_per_huge_page(h));
+ i_mmap_unlock_write(mapping);
+ }
lock_page(page);
- if (page->index > next)
- next = page->index;
- ++next;
- truncate_huge_page(page);
- unlock_page(page);
+ /*
+ * We must free the huge page and remove from page
+ * cache (remove_huge_page) BEFORE removing the
+ * region/reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the
+ * region/reserve map could fail. Before free'ing
+ * the page, note PagePrivate which is used in case
+ * of error.
+ */
+ rsv_on_error = !PagePrivate(page);
+ remove_huge_page(page);
freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode,
+ next, next + 1, 1)))
+ hugetlb_fix_reserve_counts(inode,
+ rsv_on_error);
+ }
+
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
+ ++next;
huge_pagevec_release(&pvec);
+ cond_resched();
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -348,30 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
-{
- struct vm_area_struct *vma;
-
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
- unsigned long v_offset;
-
- /*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
- */
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
- }
-}
-
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
@@ -384,12 +508,161 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up
+ * as well as being converted to page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma as this is required by the huge page
+ * allocation routines. If NUMA is configured, use page index
+ * as input to create an allocation policy.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * This is supposed to be the vaddr where the page is being
+ * faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Set numa allocation policy based on index */
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* mutex taken here, fault path and hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * page_put due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -444,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
/*
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
* be taken from reclaim -- unlike regular filesystems. This needs an
- * annotation because huge_pmd_share() does an allocation under
+ * annotation because huge_pmd_share() does an allocation under hugetlb's
* i_mmap_rwsem.
*/
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
@@ -474,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* The policy is initialized here even if we are creating a
* private inode because initialization simply creates an
- * an empty rb tree and calls spin_lock_init(), later when we
+ * an empty rb tree and calls rwlock_init(), later when we
* call mpol_free_shared_policy() it will just return because
* the rb tree will still be empty.
*/
@@ -496,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
lockdep_annotate_inode_mutex_key(inode);
@@ -701,7 +975,8 @@ const struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
@@ -936,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = {
.mount = hugetlbfs_mount,
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("hugetlbfs");
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1056,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void)
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
- 0, 0, init_once);
+ 0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
goto out2;
@@ -1090,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void)
out2:
return error;
}
-
-static void __exit exit_hugetlbfs_fs(void)
-{
- struct hstate *h;
- int i;
-
-
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(hugetlbfs_inode_cachep);
- i = 0;
- for_each_hstate(h)
- kern_unmount(hugetlbfs_vfsmount[i++]);
- unregister_filesystem(&hugetlbfs_fs_type);
-}
-
-module_init(init_hugetlbfs_fs)
-module_exit(exit_hugetlbfs_fs)
-
-MODULE_LICENSE("GPL");
+fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index d30640f..e491e54 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,16 +28,16 @@
* inode->i_state, inode->i_hash, __iget()
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
- * inode_sb_list_lock protects:
- * sb->s_inodes, inode->i_sb_list
+ * inode->i_sb->s_inode_list_lock protects:
+ * inode->i_sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
- * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
+ * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
* Lock ordering:
*
- * inode_sb_list_lock
+ * inode->i_sb->s_inode_list_lock
* inode->i_lock
* Inode LRU list locks
*
@@ -45,7 +45,7 @@
* inode->i_lock
*
* inode_hash_lock
- * inode_sb_list_lock
+ * inode->i_sb->s_inode_list_lock
* inode->i_lock
*
* iunique_lock
@@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
-
/*
* Empty aops. Can be used for the cases where the user does not
* define any of the address_space operations.
@@ -227,7 +225,7 @@ void __destroy_inode(struct inode *inode)
inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
- locks_free_lock_context(inode->i_flctx);
+ locks_free_lock_context(inode);
if (!inode->i_nlink) {
WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -359,7 +357,7 @@ void inode_init_once(struct inode *inode)
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
- INIT_LIST_HEAD(&inode->i_wb_list);
+ INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
@@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&inode->i_sb->s_inode_list_lock);
list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&inode->i_sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&inode->i_sb->s_inode_list_lock);
}
}
@@ -527,8 +525,8 @@ static void evict(struct inode *inode)
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(!list_empty(&inode->i_lru));
- if (!list_empty(&inode->i_wb_list))
- inode_wb_list_del(inode);
+ if (!list_empty(&inode->i_io_list))
+ inode_io_list_del(inode);
inode_sb_list_del(inode);
@@ -577,6 +575,7 @@ static void dispose_list(struct list_head *head)
list_del_init(&inode->i_lru);
evict(inode);
+ cond_resched();
}
}
@@ -594,7 +593,8 @@ void evict_inodes(struct super_block *sb)
struct inode *inode, *next;
LIST_HEAD(dispose);
- spin_lock(&inode_sb_list_lock);
+again:
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -609,8 +609,20 @@ void evict_inodes(struct super_block *sb)
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
+
+ /*
+ * We can have a ton of inodes to evict at unmount time given
+ * enough memory, check to see if we need to go to sleep for a
+ * bit so we don't livelock.
+ */
+ if (need_resched()) {
+ spin_unlock(&sb->s_inode_list_lock);
+ cond_resched();
+ dispose_list(&dispose);
+ goto again;
+ }
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
}
@@ -631,7 +643,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
struct inode *inode, *next;
LIST_HEAD(dispose);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
@@ -654,7 +666,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
@@ -890,7 +902,7 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- spin_lock_prefetch(&inode_sb_list_lock);
+ spin_lock_prefetch(&sb->s_inode_list_lock);
inode = new_inode_pseudo(sb);
if (inode)
@@ -1585,6 +1597,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
/**
* touch_atime - update the access time
* @path: the &struct path to update
+ * @inode: inode to update
*
* Update the accessed time on an inode and mark it for writeback.
* This function automatically handles read only file systems and media,
@@ -1870,7 +1883,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
@@ -2015,3 +2028,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags,
new_flags) != old_flags));
}
EXPORT_SYMBOL(inode_set_flags);
+
+void inode_nohighmem(struct inode *inode)
+{
+ mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
+}
+EXPORT_SYMBOL(inode_nohighmem);
diff --git a/fs/internal.h b/fs/internal.h
index 4d5af58..b71deee 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
/*
* namespace.c
*/
-extern int copy_mount_options(const void __user *, unsigned long *);
+extern void *copy_mount_options(const void __user *);
extern char *copy_mount_string(const void __user *);
extern struct vfsmount *lookup_mnt(struct path *);
@@ -112,14 +112,13 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *);
/*
* inode.c
*/
-extern spinlock_t inode_sb_list_lock;
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
/*
* fs-writeback.c
*/
-extern void inode_wb_list_del(struct inode *inode);
+extern void inode_io_list_del(struct inode *inode);
extern long get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
@@ -152,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m);
* fs/nsfs.c
*/
extern struct dentry_operations ns_dentry_operations;
+
+/*
+ * fs/ioctl.c
+ */
+extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
+ unsigned long arg);
+extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5d01d26..29466c3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
+#include "internal.h"
#include <asm/ioctls.h>
@@ -32,8 +33,7 @@
*
* Returns 0 on success, -errno on error.
*/
-static long vfs_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
+long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -215,6 +215,29 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
return error;
}
+static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
+{
+ struct fd src_file = fdget(srcfd);
+ int ret;
+
+ if (!src_file.file)
+ return -EBADF;
+ ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+ fdput(src_file);
+ return ret;
+}
+
+static long ioctl_file_clone_range(struct file *file, void __user *argp)
+{
+ struct file_clone_range args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ return ioctl_file_clone(file, args.src_fd, args.src_offset,
+ args.src_length, args.dest_offset);
+}
+
#ifdef CONFIG_BLOCK
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -545,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
+static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+{
+ struct file_dedupe_range __user *argp = arg;
+ struct file_dedupe_range *same = NULL;
+ int ret;
+ unsigned long size;
+ u16 count;
+
+ if (get_user(count, &argp->dest_count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ size = offsetof(struct file_dedupe_range __user, info[count]);
+
+ same = memdup_user(argp, size);
+ if (IS_ERR(same)) {
+ ret = PTR_ERR(same);
+ same = NULL;
+ goto out;
+ }
+
+ ret = vfs_dedupe_file_range(file, same);
+ if (ret)
+ goto out;
+
+ ret = copy_to_user(argp, same, size);
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(same);
+ return ret;
+}
+
/*
* When you add any new common ioctls to the switches above and below
* please update compat_sys_ioctl() too.
@@ -600,6 +658,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
case FIGETBSZ:
return put_user(inode->i_sb->s_blocksize, argp);
+ case FICLONE:
+ return ioctl_file_clone(filp, arg, 0, 0, 0);
+
+ case FICLONERANGE:
+ return ioctl_file_clone_range(filp, argp);
+
+ case FIDEDUPERANGE:
+ return ioctl_file_dedupe_range(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d67a16f..bcd2d41 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (isofs_inode_cachep == NULL)
return -ENOMEM;
@@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_fop = &isofs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
} else
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 735d752..5384ceb 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- char *link = kmap(page);
+ char *link = page_address(page);
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
struct buffer_head *bh;
char *rpnt = link;
@@ -774,7 +774,6 @@ repeat:
brelse(bh);
*rpnt = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
@@ -791,7 +790,6 @@ fail:
brelse(bh);
error:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
deleted file mode 100644
index 4e28bee..0000000
--- a/fs/jbd/Kconfig
+++ /dev/null
@@ -1,30 +0,0 @@
-config JBD
- tristate
- help
- This is a generic journalling layer for block devices. It is
- currently used by the ext3 file system, but it could also be
- used to add journal support to other file systems or block
- devices such as RAID or LVM.
-
- If you are using the ext3 file system, you need to say Y here.
- If you are not using ext3 then you will probably want to say N.
-
- To compile this device as a module, choose M here: the module will be
- called jbd. If you are compiling ext3 into the kernel, you
- cannot compile this code as a module.
-
-config JBD_DEBUG
- bool "JBD (ext3) debugging support"
- depends on JBD && DEBUG_FS
- help
- If you are using the ext3 journaled file system (or potentially any
- other file system/device using JBD), this option allows you to
- enable debugging output while the system is running, in order to
- help track down any problems you are having. By default the
- debugging output will be turned off.
-
- If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
- number between 1 and 5, the higher the number, the more debugging
- output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/Makefile b/fs/jbd/Makefile
deleted file mode 100644
index 54aca48..0000000
--- a/fs/jbd/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux journaling routines.
-#
-
-obj-$(CONFIG_JBD) += jbd.o
-
-jbd-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
deleted file mode 100644
index 08c0304..0000000
--- a/fs/jbd/checkpoint.c
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- * linux/fs/jbd/checkpoint.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
- *
- * Copyright 1999 Red Hat Software --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Checkpoint routines for the generic filesystem journaling code.
- * Part of the ext2fs journaling system.
- *
- * Checkpointing is the process of ensuring that a section of the log is
- * committed fully to disk, so that that portion of the log can be
- * reused.
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <trace/events/jbd.h>
-
-/*
- * Unlink a buffer from a transaction checkpoint list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink_first(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- jh->b_cpnext->b_cpprev = jh->b_cpprev;
- jh->b_cpprev->b_cpnext = jh->b_cpnext;
- if (transaction->t_checkpoint_list == jh) {
- transaction->t_checkpoint_list = jh->b_cpnext;
- if (transaction->t_checkpoint_list == jh)
- transaction->t_checkpoint_list = NULL;
- }
-}
-
-/*
- * Unlink a buffer from a transaction checkpoint(io) list.
- *
- * Called with j_list_lock held.
- */
-static inline void __buffer_unlink(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
- if (transaction->t_checkpoint_io_list == jh) {
- transaction->t_checkpoint_io_list = jh->b_cpnext;
- if (transaction->t_checkpoint_io_list == jh)
- transaction->t_checkpoint_io_list = NULL;
- }
-}
-
-/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
-/*
- * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it and 2 if we also released the
- * whole transaction.
- *
- * Requires j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
- */
-static int __try_to_free_cp_buf(struct journal_head *jh)
-{
- int ret = 0;
- struct buffer_head *bh = jh2bh(jh);
-
- if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
- !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
- /*
- * Get our reference so that bh cannot be freed before
- * we unlock it
- */
- get_bh(bh);
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- ret = __journal_remove_checkpoint(jh) + 1;
- jbd_unlock_bh_state(bh);
- BUFFER_TRACE(bh, "release");
- __brelse(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- return ret;
-}
-
-/*
- * __log_wait_for_space: wait until there is space in the journal.
- *
- * Called under j-state_lock *only*. It will be unlocked if we have to wait
- * for a checkpoint to free up some space in the log.
- */
-void __log_wait_for_space(journal_t *journal)
-{
- int nblocks, space_left;
- assert_spin_locked(&journal->j_state_lock);
-
- nblocks = jbd_space_needed(journal);
- while (__log_space_left(journal) < nblocks) {
- if (journal->j_flags & JFS_ABORT)
- return;
- spin_unlock(&journal->j_state_lock);
- mutex_lock(&journal->j_checkpoint_mutex);
-
- /*
- * Test again, another process may have checkpointed while we
- * were waiting for the checkpoint lock. If there are no
- * transactions ready to be checkpointed, try to recover
- * journal space by calling cleanup_journal_tail(), and if
- * that doesn't work, by waiting for the currently committing
- * transaction to complete. If there is absolutely no way
- * to make progress, this is either a BUG or corrupted
- * filesystem, so abort the journal and leave a stack
- * trace for forensic evidence.
- */
- spin_lock(&journal->j_state_lock);
- spin_lock(&journal->j_list_lock);
- nblocks = jbd_space_needed(journal);
- space_left = __log_space_left(journal);
- if (space_left < nblocks) {
- int chkpt = journal->j_checkpoint_transactions != NULL;
- tid_t tid = 0;
-
- if (journal->j_committing_transaction)
- tid = journal->j_committing_transaction->t_tid;
- spin_unlock(&journal->j_list_lock);
- spin_unlock(&journal->j_state_lock);
- if (chkpt) {
- log_do_checkpoint(journal);
- } else if (cleanup_journal_tail(journal) == 0) {
- /* We were able to recover space; yay! */
- ;
- } else if (tid) {
- log_wait_commit(journal, tid);
- } else {
- printk(KERN_ERR "%s: needed %d blocks and "
- "only had %d space available\n",
- __func__, nblocks, space_left);
- printk(KERN_ERR "%s: no way to get more "
- "journal space\n", __func__);
- WARN_ON(1);
- journal_abort(journal, 0);
- }
- spin_lock(&journal->j_state_lock);
- } else {
- spin_unlock(&journal->j_list_lock);
- }
- mutex_unlock(&journal->j_checkpoint_mutex);
- }
-}
-
-/*
- * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
- * The caller must restart a list walk. Wait for someone else to run
- * jbd_unlock_bh_state().
- */
-static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
- __releases(journal->j_list_lock)
-{
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_lock_bh_state(bh);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
-}
-
-/*
- * Clean up transaction's list of buffers submitted for io.
- * We wait for any pending IO to complete and remove any clean
- * buffers. Note that we take the buffers in the opposite ordering
- * from the one in which they were submitted for IO.
- *
- * Return 0 on success, and return <0 if some buffers have failed
- * to be written out.
- *
- * Called with j_list_lock held.
- */
-static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
-{
- struct journal_head *jh;
- struct buffer_head *bh;
- tid_t this_tid;
- int released = 0;
- int ret = 0;
-
- this_tid = transaction->t_tid;
-restart:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- return ret;
- while (!released && transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- spin_lock(&journal->j_list_lock);
- goto restart;
- }
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart;
- }
- if (unlikely(buffer_write_io_error(bh)))
- ret = -EIO;
-
- /*
- * Now in whatever state the buffer currently is, we know that
- * it has been written out and so we can drop it from the list
- */
- released = __journal_remove_checkpoint(jh);
- jbd_unlock_bh_state(bh);
- __brelse(bh);
- }
-
- return ret;
-}
-
-#define NR_BATCH 64
-
-static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
-{
- int i;
- struct blk_plug plug;
-
- blk_start_plug(&plug);
- for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(bhs[i], WRITE_SYNC);
- blk_finish_plug(&plug);
-
- for (i = 0; i < *batch_count; i++) {
- struct buffer_head *bh = bhs[i];
- clear_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- *batch_count = 0;
-}
-
-/*
- * Try to flush one buffer from the checkpoint list to disk.
- *
- * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list. Return <0 if the buffer has failed to
- * be written out.
- *
- * Called with j_list_lock held and drops it if 1 is returned
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
- */
-static int __process_buffer(journal_t *journal, struct journal_head *jh,
- struct buffer_head **bhs, int *batch_count)
-{
- struct buffer_head *bh = jh2bh(jh);
- int ret = 0;
-
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- ret = 1;
- } else if (jh->b_transaction != NULL) {
- transaction_t *t = jh->b_transaction;
- tid_t tid = t->t_tid;
-
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- log_start_commit(journal, tid);
- log_wait_commit(journal, tid);
- ret = 1;
- } else if (!buffer_dirty(bh)) {
- ret = 1;
- if (unlikely(buffer_write_io_error(bh)))
- ret = -EIO;
- get_bh(bh);
- J_ASSERT_JH(jh, !buffer_jbddirty(bh));
- BUFFER_TRACE(bh, "remove from checkpoint");
- __journal_remove_checkpoint(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __brelse(bh);
- } else {
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal lock.
- * We cannot afford to let the transaction logic start
- * messing around with this buffer before we write it to
- * disk, as that would break recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- set_buffer_jwrite(bh);
- bhs[*batch_count] = bh;
- __buffer_relink_io(jh);
- jbd_unlock_bh_state(bh);
- (*batch_count)++;
- if (*batch_count == NR_BATCH) {
- spin_unlock(&journal->j_list_lock);
- __flush_batch(journal, bhs, batch_count);
- ret = 1;
- }
- }
- return ret;
-}
-
-/*
- * Perform an actual checkpoint. We take the first transaction on the
- * list of transactions to be checkpointed and send all its buffers
- * to disk. We submit larger chunks of data at once.
- *
- * The journal should be locked before calling this function.
- * Called with j_checkpoint_mutex held.
- */
-int log_do_checkpoint(journal_t *journal)
-{
- transaction_t *transaction;
- tid_t this_tid;
- int result;
-
- jbd_debug(1, "Start checkpoint\n");
-
- /*
- * First thing: if there are any transactions in the log which
- * don't need checkpointing, just eliminate them from the
- * journal straight away.
- */
- result = cleanup_journal_tail(journal);
- trace_jbd_checkpoint(journal, result);
- jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
- if (result <= 0)
- return result;
-
- /*
- * OK, we need to start writing disk blocks. Take one transaction
- * and write it.
- */
- result = 0;
- spin_lock(&journal->j_list_lock);
- if (!journal->j_checkpoint_transactions)
- goto out;
- transaction = journal->j_checkpoint_transactions;
- this_tid = transaction->t_tid;
-restart:
- /*
- * If someone cleaned up this transaction while we slept, we're
- * done (maybe it's a new transaction, but it fell at the same
- * address).
- */
- if (journal->j_checkpoint_transactions == transaction &&
- transaction->t_tid == this_tid) {
- int batch_count = 0;
- struct buffer_head *bhs[NR_BATCH];
- struct journal_head *jh;
- int retry = 0, err;
-
- while (!retry && transaction->t_checkpoint_list) {
- struct buffer_head *bh;
-
- jh = transaction->t_checkpoint_list;
- bh = jh2bh(jh);
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- retry = 1;
- break;
- }
- retry = __process_buffer(journal, jh, bhs,&batch_count);
- if (retry < 0 && !result)
- result = retry;
- if (!retry && (need_resched() ||
- spin_needbreak(&journal->j_list_lock))) {
- spin_unlock(&journal->j_list_lock);
- retry = 1;
- break;
- }
- }
-
- if (batch_count) {
- if (!retry) {
- spin_unlock(&journal->j_list_lock);
- retry = 1;
- }
- __flush_batch(journal, bhs, &batch_count);
- }
-
- if (retry) {
- spin_lock(&journal->j_list_lock);
- goto restart;
- }
- /*
- * Now we have cleaned up the first transaction's checkpoint
- * list. Let's clean up the second one
- */
- err = __wait_cp_io(journal, transaction);
- if (!result)
- result = err;
- }
-out:
- spin_unlock(&journal->j_list_lock);
- if (result < 0)
- journal_abort(journal, result);
- else
- result = cleanup_journal_tail(journal);
-
- return (result < 0) ? result : 0;
-}
-
-/*
- * Check the list of checkpoint transactions for the journal to see if
- * we have already got rid of any since the last update of the log tail
- * in the journal superblock. If so, we can instantly roll the
- * superblock forward to remove those transactions from the log.
- *
- * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
- *
- * This is the only part of the journaling code which really needs to be
- * aware of transaction aborts. Checkpointing involves writing to the
- * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the super block if
- * checkpointing may have failed. Otherwise, we would lose some metadata
- * buffers which should be written-back to the filesystem.
- */
-
-int cleanup_journal_tail(journal_t *journal)
-{
- transaction_t * transaction;
- tid_t first_tid;
- unsigned int blocknr, freed;
-
- if (is_journal_aborted(journal))
- return 1;
-
- /*
- * OK, work out the oldest transaction remaining in the log, and
- * the log block it starts at.
- *
- * If the log is now empty, we need to work out which is the
- * next transaction ID we will write, and where it will
- * start.
- */
- spin_lock(&journal->j_state_lock);
- spin_lock(&journal->j_list_lock);
- transaction = journal->j_checkpoint_transactions;
- if (transaction) {
- first_tid = transaction->t_tid;
- blocknr = transaction->t_log_start;
- } else if ((transaction = journal->j_committing_transaction) != NULL) {
- first_tid = transaction->t_tid;
- blocknr = transaction->t_log_start;
- } else if ((transaction = journal->j_running_transaction) != NULL) {
- first_tid = transaction->t_tid;
- blocknr = journal->j_head;
- } else {
- first_tid = journal->j_transaction_sequence;
- blocknr = journal->j_head;
- }
- spin_unlock(&journal->j_list_lock);
- J_ASSERT(blocknr != 0);
-
- /* If the oldest pinned transaction is at the tail of the log
- already then there's not much we can do right now. */
- if (journal->j_tail_sequence == first_tid) {
- spin_unlock(&journal->j_state_lock);
- return 1;
- }
- spin_unlock(&journal->j_state_lock);
-
- /*
- * We need to make sure that any blocks that were recently written out
- * --- perhaps by log_do_checkpoint() --- are flushed out before we
- * drop the transactions from the journal. Similarly we need to be sure
- * superblock makes it to disk before next transaction starts reusing
- * freed space (otherwise we could replay some blocks of the new
- * transaction thinking they belong to the old one). So we use
- * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially
- * with an appropriately sized journal, but we need this to guarantee
- * correctness. Fortunately cleanup_journal_tail() doesn't get called
- * all that often.
- */
- journal_update_sb_log_tail(journal, first_tid, blocknr,
- WRITE_FLUSH_FUA);
-
- spin_lock(&journal->j_state_lock);
- /* OK, update the superblock to recover the freed space.
- * Physical blocks come first: have we wrapped beyond the end of
- * the log? */
- freed = blocknr - journal->j_tail;
- if (blocknr < journal->j_tail)
- freed = freed + journal->j_last - journal->j_first;
-
- trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed);
- jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %u), "
- "freeing %u\n",
- journal->j_tail_sequence, first_tid, blocknr, freed);
-
- journal->j_free += freed;
- journal->j_tail_sequence = first_tid;
- journal->j_tail = blocknr;
- spin_unlock(&journal->j_state_lock);
- return 0;
-}
-
-
-/* Checkpoint list management */
-
-/*
- * journal_clean_one_cp_list
- *
- * Find all the written-back checkpoint buffers in the given list and release
- * them.
- *
- * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
- */
-
-static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
-{
- struct journal_head *last_jh;
- struct journal_head *next_jh = jh;
- int ret, freed = 0;
-
- *released = 0;
- if (!jh)
- return 0;
-
- last_jh = jh->b_cpprev;
- do {
- jh = next_jh;
- next_jh = jh->b_cpnext;
- /* Use trylock because of the ranking */
- if (jbd_trylock_bh_state(jh2bh(jh))) {
- ret = __try_to_free_cp_buf(jh);
- if (ret) {
- freed++;
- if (ret == 2) {
- *released = 1;
- return freed;
- }
- }
- }
- /*
- * This function only frees up some memory
- * if possible so we dont have an obligation
- * to finish processing. Bail out if preemption
- * requested:
- */
- if (need_resched())
- return freed;
- } while (jh != last_jh);
-
- return freed;
-}
-
-/*
- * journal_clean_checkpoint_list
- *
- * Find all the written-back checkpoint buffers in the journal and release them.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
- */
-
-int __journal_clean_checkpoint_list(journal_t *journal)
-{
- transaction_t *transaction, *last_transaction, *next_transaction;
- int ret = 0;
- int released;
-
- transaction = journal->j_checkpoint_transactions;
- if (!transaction)
- goto out;
-
- last_transaction = transaction->t_cpprev;
- next_transaction = transaction;
- do {
- transaction = next_transaction;
- next_transaction = transaction->t_cpnext;
- ret += journal_clean_one_cp_list(transaction->
- t_checkpoint_list, &released);
- /*
- * This function only frees up some memory if possible so we
- * dont have an obligation to finish processing. Bail out if
- * preemption requested:
- */
- if (need_resched())
- goto out;
- if (released)
- continue;
- /*
- * It is essential that we are as careful as in the case of
- * t_checkpoint_list with removing the buffer from the list as
- * we can possibly see not yet submitted buffers on io_list
- */
- ret += journal_clean_one_cp_list(transaction->
- t_checkpoint_io_list, &released);
- if (need_resched())
- goto out;
- } while (transaction != last_transaction);
-out:
- return ret;
-}
-
-/*
- * journal_remove_checkpoint: called after a buffer has been committed
- * to disk (either by being write-back flushed to disk, or being
- * committed to the log).
- *
- * We cannot safely clean a transaction out of the log until all of the
- * buffer updates committed in that transaction have safely been stored
- * elsewhere on disk. To achieve this, all of the buffers in a
- * transaction need to be maintained on the transaction's checkpoint
- * lists until they have been rewritten, at which point this function is
- * called to remove the buffer from the existing transaction's
- * checkpoint lists.
- *
- * The function returns 1 if it frees the transaction, 0 otherwise.
- * The function can free jh and bh.
- *
- * This function is called with j_list_lock held.
- * This function is called with jbd_lock_bh_state(jh2bh(jh))
- */
-
-int __journal_remove_checkpoint(struct journal_head *jh)
-{
- transaction_t *transaction;
- journal_t *journal;
- int ret = 0;
-
- JBUFFER_TRACE(jh, "entry");
-
- if ((transaction = jh->b_cp_transaction) == NULL) {
- JBUFFER_TRACE(jh, "not on transaction");
- goto out;
- }
- journal = transaction->t_journal;
-
- JBUFFER_TRACE(jh, "removing from transaction");
- __buffer_unlink(jh);
- jh->b_cp_transaction = NULL;
- journal_put_journal_head(jh);
-
- if (transaction->t_checkpoint_list != NULL ||
- transaction->t_checkpoint_io_list != NULL)
- goto out;
-
- /*
- * There is one special case to worry about: if we have just pulled the
- * buffer off a running or committing transaction's checkpoing list,
- * then even if the checkpoint list is empty, the transaction obviously
- * cannot be dropped!
- *
- * The locking here around t_state is a bit sleazy.
- * See the comment at the end of journal_commit_transaction().
- */
- if (transaction->t_state != T_FINISHED)
- goto out;
-
- /* OK, that was the last buffer for the transaction: we can now
- safely remove this transaction from the log */
-
- __journal_drop_transaction(journal, transaction);
-
- /* Just in case anybody was waiting for more transactions to be
- checkpointed... */
- wake_up(&journal->j_wait_logspace);
- ret = 1;
-out:
- return ret;
-}
-
-/*
- * journal_insert_checkpoint: put a committed buffer onto a checkpoint
- * list so that we know when it is safe to clean the transaction out of
- * the log.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- */
-void __journal_insert_checkpoint(struct journal_head *jh,
- transaction_t *transaction)
-{
- JBUFFER_TRACE(jh, "entry");
- J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
- J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
-
- /* Get reference for checkpointing transaction */
- journal_grab_journal_head(jh2bh(jh));
- jh->b_cp_transaction = transaction;
-
- if (!transaction->t_checkpoint_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_list;
- jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_list = jh;
-}
-
-/*
- * We've finished with this transaction structure: adios...
- *
- * The transaction must have no links except for the checkpoint by this
- * point.
- *
- * Called with the journal locked.
- * Called with j_list_lock held.
- */
-
-void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
-{
- assert_spin_locked(&journal->j_list_lock);
- if (transaction->t_cpnext) {
- transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
- transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
- if (journal->j_checkpoint_transactions == transaction)
- journal->j_checkpoint_transactions =
- transaction->t_cpnext;
- if (journal->j_checkpoint_transactions == transaction)
- journal->j_checkpoint_transactions = NULL;
- }
-
- J_ASSERT(transaction->t_state == T_FINISHED);
- J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
- J_ASSERT(transaction->t_forget == NULL);
- J_ASSERT(transaction->t_iobuf_list == NULL);
- J_ASSERT(transaction->t_shadow_list == NULL);
- J_ASSERT(transaction->t_log_list == NULL);
- J_ASSERT(transaction->t_checkpoint_list == NULL);
- J_ASSERT(transaction->t_checkpoint_io_list == NULL);
- J_ASSERT(transaction->t_updates == 0);
- J_ASSERT(journal->j_committing_transaction != transaction);
- J_ASSERT(journal->j_running_transaction != transaction);
-
- trace_jbd_drop_transaction(journal, transaction);
- jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
- kfree(transaction);
-}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
deleted file mode 100644
index bb217dc..0000000
--- a/fs/jbd/commit.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-/*
- * linux/fs/jbd/commit.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
- *
- * Copyright 1998 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Journal commit routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <trace/events/jbd.h>
-
-/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
- */
-static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
-{
- BUFFER_TRACE(bh, "");
- if (uptodate)
- set_buffer_uptodate(bh);
- else
- clear_buffer_uptodate(bh);
- unlock_buffer(bh);
-}
-
-/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not successfully freed, because they are attached to a committing transaction.
- * After the transaction commits, these pages are left on the LRU, with no
- * ->mapping, and with attached buffers. These pages are trivially reclaimable
- * by the VM, but their apparent absence upsets the VM accounting, and it makes
- * the numbers in /proc/meminfo look odd.
- *
- * So here, we have a buffer which has just come off the forget list. Look to
- * see if we can strip all buffers from the backing page.
- *
- * Called under journal->j_list_lock. The caller provided us with a ref
- * against the buffer, and we drop that here.
- */
-static void release_buffer_page(struct buffer_head *bh)
-{
- struct page *page;
-
- if (buffer_dirty(bh))
- goto nope;
- if (atomic_read(&bh->b_count) != 1)
- goto nope;
- page = bh->b_page;
- if (!page)
- goto nope;
- if (page->mapping)
- goto nope;
-
- /* OK, it's a truncated page */
- if (!trylock_page(page))
- goto nope;
-
- page_cache_get(page);
- __brelse(bh);
- try_to_free_buffers(page);
- unlock_page(page);
- page_cache_release(page);
- return;
-
-nope:
- __brelse(bh);
-}
-
-/*
- * Decrement reference counter for data buffer. If it has been marked
- * 'BH_Freed', release it and the page to which it belongs if possible.
- */
-static void release_data_buffer(struct buffer_head *bh)
-{
- if (buffer_freed(bh)) {
- WARN_ON_ONCE(buffer_dirty(bh));
- clear_buffer_freed(bh);
- clear_buffer_mapped(bh);
- clear_buffer_new(bh);
- clear_buffer_req(bh);
- bh->b_bdev = NULL;
- release_buffer_page(bh);
- } else
- put_bh(bh);
-}
-
-/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/* Done it all: now write the commit record. We should have
- * cleaned up our previous buffers by now, so if we are in abort
- * mode we can now just skip the rest of the journal write
- * entirely.
- *
- * Returns 1 if the journal needs to be aborted or 0 on success
- */
-static int journal_write_commit_record(journal_t *journal,
- transaction_t *commit_transaction)
-{
- struct journal_head *descriptor;
- struct buffer_head *bh;
- journal_header_t *header;
- int ret;
-
- if (is_journal_aborted(journal))
- return 0;
-
- descriptor = journal_get_descriptor_buffer(journal);
- if (!descriptor)
- return 1;
-
- bh = jh2bh(descriptor);
-
- header = (journal_header_t *)(bh->b_data);
- header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
- header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
- JBUFFER_TRACE(descriptor, "write commit block");
- set_buffer_dirty(bh);
-
- if (journal->j_flags & JFS_BARRIER)
- ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
- else
- ret = sync_dirty_buffer(bh);
-
- put_bh(bh); /* One for getblk() */
- journal_put_journal_head(descriptor);
-
- return (ret == -EIO);
-}
-
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
- int write_op)
-{
- int i;
-
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /*
- * Here we write back pagecache data that may be mmaped. Since
- * we cannot afford to clean the page and set PageWriteback
- * here due to lock ordering (page lock ranks above transaction
- * start), the data can change while IO is in flight. Tell the
- * block layer it should bounce the bio pages if stable data
- * during write is required.
- *
- * We use up our safety reference in submit_bh().
- */
- _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
- }
-}
-
-/*
- * Submit all the data buffers to disk
- */
-static int journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction,
- int write_op)
-{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
- int err = 0;
-
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
- spin_lock(&journal->j_list_lock);
-
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (!trylock_buffer(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- trace_jbd_do_submit_data(journal,
- commit_transaction);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs, write_op);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh) || bh2jh(bh) != jh
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- release_data_buffer(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- trace_jbd_do_submit_data(journal,
- commit_transaction);
- journal_do_submit_data(wbuf, bufs, write_op);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
- } else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- release_data_buffer(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
- }
- }
- spin_unlock(&journal->j_list_lock);
- trace_jbd_do_submit_data(journal, commit_transaction);
- journal_do_submit_data(wbuf, bufs, write_op);
-
- return err;
-}
-
-/*
- * journal_commit_transaction
- *
- * The primary function for committing a transaction to the log. This
- * function is called by the journal thread to begin a complete commit.
- */
-void journal_commit_transaction(journal_t *journal)
-{
- transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
- struct buffer_head **wbuf = journal->j_wbuf;
- int bufs;
- int flags;
- int err;
- unsigned int blocknr;
- ktime_t start_time;
- u64 commit_time;
- char *tagp = NULL;
- journal_header_t *header;
- journal_block_tag_t *tag = NULL;
- int space_left = 0;
- int first_tag = 0;
- int tag_flag;
- int i;
- struct blk_plug plug;
- int write_op = WRITE;
-
- /*
- * First job: lock down the current transaction and wait for
- * all outstanding updates to complete.
- */
-
- /* Do we need to erase the effects of a prior journal_flush? */
- if (journal->j_flags & JFS_FLUSHED) {
- jbd_debug(3, "super block updated\n");
- mutex_lock(&journal->j_checkpoint_mutex);
- /*
- * We hold j_checkpoint_mutex so tail cannot change under us.
- * We don't need any special data guarantees for writing sb
- * since journal is empty and it is ok for write to be
- * flushed only with transaction commit.
- */
- journal_update_sb_log_tail(journal, journal->j_tail_sequence,
- journal->j_tail, WRITE_SYNC);
- mutex_unlock(&journal->j_checkpoint_mutex);
- } else {
- jbd_debug(3, "superblock not updated\n");
- }
-
- J_ASSERT(journal->j_running_transaction != NULL);
- J_ASSERT(journal->j_committing_transaction == NULL);
-
- commit_transaction = journal->j_running_transaction;
-
- trace_jbd_start_commit(journal, commit_transaction);
- jbd_debug(1, "JBD: starting commit of transaction %d\n",
- commit_transaction->t_tid);
-
- spin_lock(&journal->j_state_lock);
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
- commit_transaction->t_state = T_LOCKED;
-
- trace_jbd_commit_locking(journal, commit_transaction);
- spin_lock(&commit_transaction->t_handle_lock);
- while (commit_transaction->t_updates) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (commit_transaction->t_updates) {
- spin_unlock(&commit_transaction->t_handle_lock);
- spin_unlock(&journal->j_state_lock);
- schedule();
- spin_lock(&journal->j_state_lock);
- spin_lock(&commit_transaction->t_handle_lock);
- }
- finish_wait(&journal->j_wait_updates, &wait);
- }
- spin_unlock(&commit_transaction->t_handle_lock);
-
- J_ASSERT (commit_transaction->t_outstanding_credits <=
- journal->j_max_transaction_buffers);
-
- /*
- * First thing we are allowed to do is to discard any remaining
- * BJ_Reserved buffers. Note, it is _not_ permissible to assume
- * that there are no such buffers: if a large filesystem
- * operation like a truncate needs to split itself over multiple
- * transactions, then it may try to do a journal_restart() while
- * there are still BJ_Reserved buffers outstanding. These must
- * be released cleanly from the current transaction.
- *
- * In this case, the filesystem must still reserve write access
- * again before modifying the buffer in the new transaction, but
- * we do not require it to remember exactly which old buffers it
- * has reserved. This is consistent with the existing behaviour
- * that multiple journal_get_write_access() calls to the same
- * buffer are perfectly permissible.
- */
- while (commit_transaction->t_reserved_list) {
- jh = commit_transaction->t_reserved_list;
- JBUFFER_TRACE(jh, "reserved, unused: refile");
- /*
- * A journal_get_undo_access()+journal_release_buffer() may
- * leave undo-committed data.
- */
- if (jh->b_committed_data) {
- struct buffer_head *bh = jh2bh(jh);
-
- jbd_lock_bh_state(bh);
- jbd_free(jh->b_committed_data, bh->b_size);
- jh->b_committed_data = NULL;
- jbd_unlock_bh_state(bh);
- }
- journal_refile_buffer(journal, jh);
- }
-
- /*
- * Now try to drop any written-back buffers from the journal's
- * checkpoint lists. We do this *before* commit because it potentially
- * frees some memory
- */
- spin_lock(&journal->j_list_lock);
- __journal_clean_checkpoint_list(journal);
- spin_unlock(&journal->j_list_lock);
-
- jbd_debug (3, "JBD: commit phase 1\n");
-
- /*
- * Clear revoked flag to reflect there is no revoked buffers
- * in the next transaction which is going to be started.
- */
- journal_clear_buffer_revoked_flags(journal);
-
- /*
- * Switch to a new revoke table.
- */
- journal_switch_revoke_table(journal);
-
- trace_jbd_commit_flushing(journal, commit_transaction);
- commit_transaction->t_state = T_FLUSH;
- journal->j_committing_transaction = commit_transaction;
- journal->j_running_transaction = NULL;
- start_time = ktime_get();
- commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
- spin_unlock(&journal->j_state_lock);
-
- jbd_debug (3, "JBD: commit phase 2\n");
-
- if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
- write_op = WRITE_SYNC;
-
- /*
- * Now start flushing things to disk, in the order they appear
- * on the transaction lists. Data blocks go first.
- */
- blk_start_plug(&plug);
- err = journal_submit_data_buffers(journal, commit_transaction,
- write_op);
- blk_finish_plug(&plug);
-
- /*
- * Wait for all previously submitted IO to complete.
- */
- spin_lock(&journal->j_list_lock);
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- if (unlikely(!buffer_uptodate(bh))) {
- if (!trylock_page(bh->b_page)) {
- spin_unlock(&journal->j_list_lock);
- lock_page(bh->b_page);
- spin_lock(&journal->j_list_lock);
- }
- if (bh->b_page->mapping)
- set_bit(AS_EIO, &bh->b_page->mapping->flags);
-
- unlock_page(bh->b_page);
- SetPageError(bh->b_page);
- err = -EIO;
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && bh2jh(bh) == jh &&
- jh->b_transaction == commit_transaction &&
- jh->b_jlist == BJ_Locked)
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- release_data_buffer(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
- spin_unlock(&journal->j_list_lock);
-
- if (err) {
- char b[BDEVNAME_SIZE];
-
- printk(KERN_WARNING
- "JBD: Detected IO errors while flushing file data "
- "on %s\n", bdevname(journal->j_fs_dev, b));
- if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
- journal_abort(journal, err);
- err = 0;
- }
-
- blk_start_plug(&plug);
-
- journal_write_revoke_records(journal, commit_transaction, write_op);
-
- /*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
- * Way to go: we have now written out all of the data for a
- * transaction! Now comes the tricky part: we need to write out
- * metadata. Loop over the transaction's entire buffer list:
- */
- spin_lock(&journal->j_state_lock);
- commit_transaction->t_state = T_COMMIT;
- spin_unlock(&journal->j_state_lock);
-
- trace_jbd_commit_logging(journal, commit_transaction);
- J_ASSERT(commit_transaction->t_nr_buffers <=
- commit_transaction->t_outstanding_credits);
-
- descriptor = NULL;
- bufs = 0;
- while (commit_transaction->t_buffers) {
-
- /* Find the next buffer to be journaled... */
-
- jh = commit_transaction->t_buffers;
-
- /* If we're in abort mode, we just un-journal the buffer and
- release it. */
-
- if (is_journal_aborted(journal)) {
- clear_buffer_jbddirty(jh2bh(jh));
- JBUFFER_TRACE(jh, "journal is aborting: refile");
- journal_refile_buffer(journal, jh);
- /* If that was the last one, we need to clean up
- * any descriptor buffers which may have been
- * already allocated, even if we are now
- * aborting. */
- if (!commit_transaction->t_buffers)
- goto start_journal_io;
- continue;
- }
-
- /* Make sure we have a descriptor block in which to
- record the metadata buffer. */
-
- if (!descriptor) {
- struct buffer_head *bh;
-
- J_ASSERT (bufs == 0);
-
- jbd_debug(4, "JBD: get descriptor\n");
-
- descriptor = journal_get_descriptor_buffer(journal);
- if (!descriptor) {
- journal_abort(journal, -EIO);
- continue;
- }
-
- bh = jh2bh(descriptor);
- jbd_debug(4, "JBD: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
- header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
- header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
- first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
-
- /* Record it so that we can wait for IO
- completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
- }
-
- /* Where is the buffer to be written? */
-
- err = journal_next_log_block(journal, &blocknr);
- /* If the block mapping failed, just abandon the buffer
- and repeat this loop: we'll fall into the
- refile-on-abort condition above. */
- if (err) {
- journal_abort(journal, err);
- continue;
- }
-
- /*
- * start_this_handle() uses t_outstanding_credits to determine
- * the free space in the log, but this counter is changed
- * by journal_next_log_block() also.
- */
- commit_transaction->t_outstanding_credits--;
-
- /* Bump b_count to prevent truncate from stumbling over
- the shadowed buffer! @@@ This can go if we ever get
- rid of the BJ_IO/BJ_Shadow pairing of buffers. */
- get_bh(jh2bh(jh));
-
- /* Make a temporary IO buffer with which to write it out
- (this will requeue both the metadata buffer and the
- temporary IO buffer). new_bh goes on BJ_IO*/
-
- set_buffer_jwrite(jh2bh(jh));
- /*
- * akpm: journal_write_metadata_buffer() sets
- * new_bh->b_transaction to commit_transaction.
- * We need to clean this up before we release new_bh
- * (which is of type BJ_IO)
- */
- JBUFFER_TRACE(jh, "ph3: write metadata");
- flags = journal_write_metadata_buffer(commit_transaction,
- jh, &new_jh, blocknr);
- set_buffer_jwrite(jh2bh(new_jh));
- wbuf[bufs++] = jh2bh(new_jh);
-
- /* Record the new block's tag in the current descriptor
- buffer */
-
- tag_flag = 0;
- if (flags & 1)
- tag_flag |= JFS_FLAG_ESCAPE;
- if (!first_tag)
- tag_flag |= JFS_FLAG_SAME_UUID;
-
- tag = (journal_block_tag_t *) tagp;
- tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
- tag->t_flags = cpu_to_be32(tag_flag);
- tagp += sizeof(journal_block_tag_t);
- space_left -= sizeof(journal_block_tag_t);
-
- if (first_tag) {
- memcpy (tagp, journal->j_uuid, 16);
- tagp += 16;
- space_left -= 16;
- first_tag = 0;
- }
-
- /* If there's no more to do, or if the descriptor is full,
- let the IO rip! */
-
- if (bufs == journal->j_wbufsize ||
- commit_transaction->t_buffers == NULL ||
- space_left < sizeof(journal_block_tag_t) + 16) {
-
- jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
-
- /* Write an end-of-descriptor marker before
- submitting the IOs. "tag" still points to
- the last tag we set up. */
-
- tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
-
-start_journal_io:
- for (i = 0; i < bufs; i++) {
- struct buffer_head *bh = wbuf[i];
- lock_buffer(bh);
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- bh->b_end_io = journal_end_buffer_io_sync;
- /*
- * In data=journal mode, here we can end up
- * writing pagecache data that might be
- * mmapped. Since we can't afford to clean the
- * page and set PageWriteback (see the comment
- * near the other use of _submit_bh()), the
- * data can change while the write is in
- * flight. Tell the block layer to bounce the
- * bio pages if stable pages are required.
- */
- _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
- }
- cond_resched();
-
- /* Force a new descriptor to be generated next
- time round the loop. */
- descriptor = NULL;
- bufs = 0;
- }
- }
-
- blk_finish_plug(&plug);
-
- /* Lo and behold: we have just managed to send a transaction to
- the log. Before we can commit it, wait for the IO so far to
- complete. Control buffers being written are on the
- transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
-
- Wait for the buffers in reverse order. That way we are
- less likely to be woken up until all IOs have completed, and
- so we incur less scheduling load.
- */
-
- jbd_debug(3, "JBD: commit phase 4\n");
-
- /*
- * akpm: these are BJ_IO, and j_list_lock is not needed.
- * See __journal_try_to_free_buffer.
- */
-wait_for_iobuf:
- while (commit_transaction->t_iobuf_list != NULL) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_iobuf_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_iobuf;
- }
- if (cond_resched())
- goto wait_for_iobuf;
-
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
-
- clear_buffer_jwrite(bh);
-
- JBUFFER_TRACE(jh, "ph4: unfile after journal write");
- journal_unfile_buffer(journal, jh);
-
- /*
- * ->t_iobuf_list should contain only dummy buffer_heads
- * which were created by journal_write_metadata_buffer().
- */
- BUFFER_TRACE(bh, "dumping temporary bh");
- journal_put_journal_head(jh);
- __brelse(bh);
- J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
- free_buffer_head(bh);
-
- /* We also have to unlock and free the corresponding
- shadowed buffer */
- jh = commit_transaction->t_shadow_list->b_tprev;
- bh = jh2bh(jh);
- clear_buffer_jwrite(bh);
- J_ASSERT_BH(bh, buffer_jbddirty(bh));
-
- /* The metadata is now released for reuse, but we need
- to remember it against this transaction so that when
- we finally commit, we can do any checkpointing
- required. */
- JBUFFER_TRACE(jh, "file as BJ_Forget");
- journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /*
- * Wake up any transactions which were waiting for this
- * IO to complete. The barrier must be here so that changes
- * by journal_file_buffer() take effect before wake_up_bit()
- * does the waitqueue check.
- */
- smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
- JBUFFER_TRACE(jh, "brelse shadowed buffer");
- __brelse(bh);
- }
-
- J_ASSERT (commit_transaction->t_shadow_list == NULL);
-
- jbd_debug(3, "JBD: commit phase 5\n");
-
- /* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
-
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
-
- BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
- clear_buffer_jwrite(bh);
- journal_unfile_buffer(journal, jh);
- journal_put_journal_head(jh);
- __brelse(bh); /* One for getblk */
- /* AKPM: bforget here */
- }
-
- if (err)
- journal_abort(journal, err);
-
- jbd_debug(3, "JBD: commit phase 6\n");
-
- /* All metadata is written, now write commit record and do cleanup */
- spin_lock(&journal->j_state_lock);
- J_ASSERT(commit_transaction->t_state == T_COMMIT);
- commit_transaction->t_state = T_COMMIT_RECORD;
- spin_unlock(&journal->j_state_lock);
-
- if (journal_write_commit_record(journal, commit_transaction))
- err = -EIO;
-
- if (err)
- journal_abort(journal, err);
-
- /* End of a transaction! Finally, we can do checkpoint
- processing: any buffers committed as a result of this
- transaction can be removed from any checkpoint list it was on
- before. */
-
- jbd_debug(3, "JBD: commit phase 7\n");
-
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
- J_ASSERT(commit_transaction->t_buffers == NULL);
- J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
- J_ASSERT(commit_transaction->t_iobuf_list == NULL);
- J_ASSERT(commit_transaction->t_shadow_list == NULL);
- J_ASSERT(commit_transaction->t_log_list == NULL);
-
-restart_loop:
- /*
- * As there are other places (journal_unmap_buffer()) adding buffers
- * to this list we have to be careful and hold the j_list_lock.
- */
- spin_lock(&journal->j_list_lock);
- while (commit_transaction->t_forget) {
- transaction_t *cp_transaction;
- struct buffer_head *bh;
- int try_to_free = 0;
-
- jh = commit_transaction->t_forget;
- spin_unlock(&journal->j_list_lock);
- bh = jh2bh(jh);
- /*
- * Get a reference so that bh cannot be freed before we are
- * done with it.
- */
- get_bh(bh);
- jbd_lock_bh_state(bh);
- J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
- jh->b_transaction == journal->j_running_transaction);
-
- /*
- * If there is undo-protected committed data against
- * this buffer, then we can remove it now. If it is a
- * buffer needing such protection, the old frozen_data
- * field now points to a committed version of the
- * buffer, so rotate that field to the new committed
- * data.
- *
- * Otherwise, we can just throw away the frozen data now.
- */
- if (jh->b_committed_data) {
- jbd_free(jh->b_committed_data, bh->b_size);
- jh->b_committed_data = NULL;
- if (jh->b_frozen_data) {
- jh->b_committed_data = jh->b_frozen_data;
- jh->b_frozen_data = NULL;
- }
- } else if (jh->b_frozen_data) {
- jbd_free(jh->b_frozen_data, bh->b_size);
- jh->b_frozen_data = NULL;
- }
-
- spin_lock(&journal->j_list_lock);
- cp_transaction = jh->b_cp_transaction;
- if (cp_transaction) {
- JBUFFER_TRACE(jh, "remove from old cp transaction");
- __journal_remove_checkpoint(jh);
- }
-
- /* Only re-checkpoint the buffer_head if it is marked
- * dirty. If the buffer was added to the BJ_Forget list
- * by journal_forget, it may no longer be dirty and
- * there's no point in keeping a checkpoint record for
- * it. */
-
- /*
- * A buffer which has been freed while still being journaled by
- * a previous transaction.
- */
- if (buffer_freed(bh)) {
- /*
- * If the running transaction is the one containing
- * "add to orphan" operation (b_next_transaction !=
- * NULL), we have to wait for that transaction to
- * commit before we can really get rid of the buffer.
- * So just clear b_modified to not confuse transaction
- * credit accounting and refile the buffer to
- * BJ_Forget of the running transaction. If the just
- * committed transaction contains "add to orphan"
- * operation, we can completely invalidate the buffer
- * now. We are rather throughout in that since the
- * buffer may be still accessible when blocksize <
- * pagesize and it is attached to the last partial
- * page.
- */
- jh->b_modified = 0;
- if (!jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
- clear_buffer_mapped(bh);
- clear_buffer_new(bh);
- clear_buffer_req(bh);
- bh->b_bdev = NULL;
- }
- }
-
- if (buffer_jbddirty(bh)) {
- JBUFFER_TRACE(jh, "add to new checkpointing trans");
- __journal_insert_checkpoint(jh, commit_transaction);
- if (is_journal_aborted(journal))
- clear_buffer_jbddirty(bh);
- } else {
- J_ASSERT_BH(bh, !buffer_dirty(bh));
- /*
- * The buffer on BJ_Forget list and not jbddirty means
- * it has been freed by this transaction and hence it
- * could not have been reallocated until this
- * transaction has committed. *BUT* it could be
- * reallocated once we have written all the data to
- * disk and before we process the buffer on BJ_Forget
- * list.
- */
- if (!jh->b_next_transaction)
- try_to_free = 1;
- }
- JBUFFER_TRACE(jh, "refile or unfile freed buffer");
- __journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (try_to_free)
- release_buffer_page(bh);
- else
- __brelse(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
- spin_unlock(&journal->j_list_lock);
- /*
- * This is a bit sleazy. We use j_list_lock to protect transition
- * of a transaction into T_FINISHED state and calling
- * __journal_drop_transaction(). Otherwise we could race with
- * other checkpointing code processing the transaction...
- */
- spin_lock(&journal->j_state_lock);
- spin_lock(&journal->j_list_lock);
- /*
- * Now recheck if some buffers did not get attached to the transaction
- * while the lock was dropped...
- */
- if (commit_transaction->t_forget) {
- spin_unlock(&journal->j_list_lock);
- spin_unlock(&journal->j_state_lock);
- goto restart_loop;
- }
-
- /* Done with this transaction! */
-
- jbd_debug(3, "JBD: commit phase 8\n");
-
- J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
-
- commit_transaction->t_state = T_FINISHED;
- J_ASSERT(commit_transaction == journal->j_committing_transaction);
- journal->j_commit_sequence = commit_transaction->t_tid;
- journal->j_committing_transaction = NULL;
- commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
-
- /*
- * weight the commit time higher than the average time so we don't
- * react too strongly to vast changes in commit time
- */
- if (likely(journal->j_average_commit_time))
- journal->j_average_commit_time = (commit_time*3 +
- journal->j_average_commit_time) / 4;
- else
- journal->j_average_commit_time = commit_time;
-
- spin_unlock(&journal->j_state_lock);
-
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
- __journal_drop_transaction(journal, commit_transaction);
- } else {
- if (journal->j_checkpoint_transactions == NULL) {
- journal->j_checkpoint_transactions = commit_transaction;
- commit_transaction->t_cpnext = commit_transaction;
- commit_transaction->t_cpprev = commit_transaction;
- } else {
- commit_transaction->t_cpnext =
- journal->j_checkpoint_transactions;
- commit_transaction->t_cpprev =
- commit_transaction->t_cpnext->t_cpprev;
- commit_transaction->t_cpnext->t_cpprev =
- commit_transaction;
- commit_transaction->t_cpprev->t_cpnext =
- commit_transaction;
- }
- }
- spin_unlock(&journal->j_list_lock);
-
- trace_jbd_end_commit(journal, commit_transaction);
- jbd_debug(1, "JBD: commit %d complete, head %d\n",
- journal->j_commit_sequence, journal->j_tail_sequence);
-
- wake_up(&journal->j_wait_done_commit);
-}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
deleted file mode 100644
index c46a79a..0000000
--- a/fs/jbd/journal.c
+++ /dev/null
@@ -1,2145 +0,0 @@
-/*
- * linux/fs/jbd/journal.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
- *
- * Copyright 1998 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Generic filesystem journal-writing code; part of the ext2fs
- * journaling system.
- *
- * This file manages journals: areas of disk reserved for logging
- * transactional updates. This includes the kernel journaling thread
- * which is responsible for scheduling updates to the log.
- *
- * We do not actually manage the physical storage of the journal in this
- * file: that is left to a per-journal policy function, which allows us
- * to store the journal within a filesystem-specified area for ext2
- * journaling (ext2 can use a reserved inode for storing the log).
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/freezer.h>
-#include <linux/pagemap.h>
-#include <linux/kthread.h>
-#include <linux/poison.h>
-#include <linux/proc_fs.h>
-#include <linux/debugfs.h>
-#include <linux/ratelimit.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/jbd.h>
-
-#include <asm/uaccess.h>
-#include <asm/page.h>
-
-EXPORT_SYMBOL(journal_start);
-EXPORT_SYMBOL(journal_restart);
-EXPORT_SYMBOL(journal_extend);
-EXPORT_SYMBOL(journal_stop);
-EXPORT_SYMBOL(journal_lock_updates);
-EXPORT_SYMBOL(journal_unlock_updates);
-EXPORT_SYMBOL(journal_get_write_access);
-EXPORT_SYMBOL(journal_get_create_access);
-EXPORT_SYMBOL(journal_get_undo_access);
-EXPORT_SYMBOL(journal_dirty_data);
-EXPORT_SYMBOL(journal_dirty_metadata);
-EXPORT_SYMBOL(journal_release_buffer);
-EXPORT_SYMBOL(journal_forget);
-#if 0
-EXPORT_SYMBOL(journal_sync_buffer);
-#endif
-EXPORT_SYMBOL(journal_flush);
-EXPORT_SYMBOL(journal_revoke);
-
-EXPORT_SYMBOL(journal_init_dev);
-EXPORT_SYMBOL(journal_init_inode);
-EXPORT_SYMBOL(journal_update_format);
-EXPORT_SYMBOL(journal_check_used_features);
-EXPORT_SYMBOL(journal_check_available_features);
-EXPORT_SYMBOL(journal_set_features);
-EXPORT_SYMBOL(journal_create);
-EXPORT_SYMBOL(journal_load);
-EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_abort);
-EXPORT_SYMBOL(journal_errno);
-EXPORT_SYMBOL(journal_ack_err);
-EXPORT_SYMBOL(journal_clear_err);
-EXPORT_SYMBOL(log_wait_commit);
-EXPORT_SYMBOL(log_start_commit);
-EXPORT_SYMBOL(journal_start_commit);
-EXPORT_SYMBOL(journal_force_commit_nested);
-EXPORT_SYMBOL(journal_wipe);
-EXPORT_SYMBOL(journal_blocks_per_page);
-EXPORT_SYMBOL(journal_invalidatepage);
-EXPORT_SYMBOL(journal_try_to_free_buffers);
-EXPORT_SYMBOL(journal_force_commit);
-
-static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
-static void __journal_abort_soft (journal_t *journal, int errno);
-static const char *journal_dev_name(journal_t *journal, char *buffer);
-
-#ifdef CONFIG_JBD_DEBUG
-void __jbd_debug(int level, const char *file, const char *func,
- unsigned int line, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- if (level > journal_enable_debug)
- return;
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
- va_end(args);
-}
-EXPORT_SYMBOL(__jbd_debug);
-#endif
-
-/*
- * Helper function used to manage commit timeouts
- */
-
-static void commit_timeout(unsigned long __data)
-{
- struct task_struct * p = (struct task_struct *) __data;
-
- wake_up_process(p);
-}
-
-/*
- * kjournald: The main thread function used to manage a logging device
- * journal.
- *
- * This kernel thread is responsible for two things:
- *
- * 1) COMMIT: Every so often we need to commit the current state of the
- * filesystem to disk. The journal thread is responsible for writing
- * all of the metadata buffers to disk.
- *
- * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
- * of the data in that part of the log has been rewritten elsewhere on
- * the disk. Flushing these old buffers to reclaim space in the log is
- * known as checkpointing, and this thread is responsible for that job.
- */
-
-static int kjournald(void *arg)
-{
- journal_t *journal = arg;
- transaction_t *transaction;
-
- /*
- * Set up an interval timer which can be used to trigger a commit wakeup
- * after the commit interval expires
- */
- setup_timer(&journal->j_commit_timer, commit_timeout,
- (unsigned long)current);
-
- set_freezable();
-
- /* Record that the journal thread is running */
- journal->j_task = current;
- wake_up(&journal->j_wait_done_commit);
-
- printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n",
- journal->j_commit_interval / HZ);
-
- /*
- * And now, wait forever for commit wakeup events.
- */
- spin_lock(&journal->j_state_lock);
-
-loop:
- if (journal->j_flags & JFS_UNMOUNT)
- goto end_loop;
-
- jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
- journal->j_commit_sequence, journal->j_commit_request);
-
- if (journal->j_commit_sequence != journal->j_commit_request) {
- jbd_debug(1, "OK, requests differ\n");
- spin_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
- journal_commit_transaction(journal);
- spin_lock(&journal->j_state_lock);
- goto loop;
- }
-
- wake_up(&journal->j_wait_done_commit);
- if (freezing(current)) {
- /*
- * The simpler the better. Flushing journal isn't a
- * good idea, because that depends on threads that may
- * be already stopped.
- */
- jbd_debug(1, "Now suspending kjournald\n");
- spin_unlock(&journal->j_state_lock);
- try_to_freeze();
- spin_lock(&journal->j_state_lock);
- } else {
- /*
- * We assume on resume that commits are already there,
- * so we don't sleep
- */
- DEFINE_WAIT(wait);
- int should_sleep = 1;
-
- prepare_to_wait(&journal->j_wait_commit, &wait,
- TASK_INTERRUPTIBLE);
- if (journal->j_commit_sequence != journal->j_commit_request)
- should_sleep = 0;
- transaction = journal->j_running_transaction;
- if (transaction && time_after_eq(jiffies,
- transaction->t_expires))
- should_sleep = 0;
- if (journal->j_flags & JFS_UNMOUNT)
- should_sleep = 0;
- if (should_sleep) {
- spin_unlock(&journal->j_state_lock);
- schedule();
- spin_lock(&journal->j_state_lock);
- }
- finish_wait(&journal->j_wait_commit, &wait);
- }
-
- jbd_debug(1, "kjournald wakes\n");
-
- /*
- * Were we woken up by a commit wakeup event?
- */
- transaction = journal->j_running_transaction;
- if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
- journal->j_commit_request = transaction->t_tid;
- jbd_debug(1, "woke because of timeout\n");
- }
- goto loop;
-
-end_loop:
- spin_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
- journal->j_task = NULL;
- wake_up(&journal->j_wait_done_commit);
- jbd_debug(1, "Journal thread exiting.\n");
- return 0;
-}
-
-static int journal_start_thread(journal_t *journal)
-{
- struct task_struct *t;
-
- t = kthread_run(kjournald, journal, "kjournald");
- if (IS_ERR(t))
- return PTR_ERR(t);
-
- wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
- return 0;
-}
-
-static void journal_kill_thread(journal_t *journal)
-{
- spin_lock(&journal->j_state_lock);
- journal->j_flags |= JFS_UNMOUNT;
-
- while (journal->j_task) {
- wake_up(&journal->j_wait_commit);
- spin_unlock(&journal->j_state_lock);
- wait_event(journal->j_wait_done_commit,
- journal->j_task == NULL);
- spin_lock(&journal->j_state_lock);
- }
- spin_unlock(&journal->j_state_lock);
-}
-
-/*
- * journal_write_metadata_buffer: write a metadata buffer to the journal.
- *
- * Writes a metadata buffer to a given disk block. The actual IO is not
- * performed but a new buffer_head is constructed which labels the data
- * to be written with the correct destination disk block.
- *
- * Any magic-number escaping which needs to be done will cause a
- * copy-out here. If the buffer happens to start with the
- * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
- * magic number is only written to the log for descripter blocks. In
- * this case, we copy the data and replace the first word with 0, and we
- * return a result code which indicates that this buffer needs to be
- * marked as an escaped buffer in the corresponding log descriptor
- * block. The missing word can then be restored when the block is read
- * during recovery.
- *
- * If the source buffer has already been modified by a new transaction
- * since we took the last commit snapshot, we use the frozen copy of
- * that data for IO. If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
- *
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
- *
- * Return value:
- * <0: Error
- * >=0: Finished OK
- *
- * On success:
- * Bit 0 set == escape performed on the data
- * Bit 1 set == buffer copy-out performed (kfree the data after IO)
- */
-
-int journal_write_metadata_buffer(transaction_t *transaction,
- struct journal_head *jh_in,
- struct journal_head **jh_out,
- unsigned int blocknr)
-{
- int need_copy_out = 0;
- int done_copy_out = 0;
- int do_escape = 0;
- char *mapped_data;
- struct buffer_head *new_bh;
- struct journal_head *new_jh;
- struct page *new_page;
- unsigned int new_offset;
- struct buffer_head *bh_in = jh2bh(jh_in);
- journal_t *journal = transaction->t_journal;
-
- /*
- * The buffer really shouldn't be locked: only the current committing
- * transaction is allowed to write it, so nobody else is allowed
- * to do any IO.
- *
- * akpm: except if we're journalling data, and write() output is
- * also part of a shared mapping, and another thread has
- * decided to launch a writepage() against this buffer.
- */
- J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-
- new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
- /* keep subsequent assertions sane */
- atomic_set(&new_bh->b_count, 1);
- new_jh = journal_add_journal_head(new_bh); /* This sleeps */
-
- /*
- * If a new transaction has already done a buffer copy-out, then
- * we use that version of the data for the commit.
- */
- jbd_lock_bh_state(bh_in);
-repeat:
- if (jh_in->b_frozen_data) {
- done_copy_out = 1;
- new_page = virt_to_page(jh_in->b_frozen_data);
- new_offset = offset_in_page(jh_in->b_frozen_data);
- } else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
- }
-
- mapped_data = kmap_atomic(new_page);
- /*
- * Check for escaping
- */
- if (*((__be32 *)(mapped_data + new_offset)) ==
- cpu_to_be32(JFS_MAGIC_NUMBER)) {
- need_copy_out = 1;
- do_escape = 1;
- }
- kunmap_atomic(mapped_data);
-
- /*
- * Do we need to do a data copy?
- */
- if (need_copy_out && !done_copy_out) {
- char *tmp;
-
- jbd_unlock_bh_state(bh_in);
- tmp = jbd_alloc(bh_in->b_size, GFP_NOFS);
- jbd_lock_bh_state(bh_in);
- if (jh_in->b_frozen_data) {
- jbd_free(tmp, bh_in->b_size);
- goto repeat;
- }
-
- jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
- kunmap_atomic(mapped_data);
-
- new_page = virt_to_page(tmp);
- new_offset = offset_in_page(tmp);
- done_copy_out = 1;
- }
-
- /*
- * Did we need to do an escaping? Now we've done all the
- * copying, we can finally do so.
- */
- if (do_escape) {
- mapped_data = kmap_atomic(new_page);
- *((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data);
- }
-
- set_bh_page(new_bh, new_page, new_offset);
- new_jh->b_transaction = NULL;
- new_bh->b_size = jh2bh(jh_in)->b_size;
- new_bh->b_bdev = transaction->t_journal->j_dev;
- new_bh->b_blocknr = blocknr;
- set_buffer_mapped(new_bh);
- set_buffer_dirty(new_bh);
-
- *jh_out = new_jh;
-
- /*
- * The to-be-written buffer needs to get moved to the io queue,
- * and the original buffer whose contents we are shadowing or
- * copying is moved to the transaction's shadow queue.
- */
- JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
- spin_lock(&journal->j_list_lock);
- __journal_file_buffer(jh_in, transaction, BJ_Shadow);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh_in);
-
- JBUFFER_TRACE(new_jh, "file as BJ_IO");
- journal_file_buffer(new_jh, transaction, BJ_IO);
-
- return do_escape | (done_copy_out << 1);
-}
-
-/*
- * Allocation code for the journal file. Manage the space left in the
- * journal, so that we can begin checkpointing when appropriate.
- */
-
-/*
- * __log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __log_space_left(journal_t *journal)
-{
- int left = journal->j_free;
-
- assert_spin_locked(&journal->j_state_lock);
-
- /*
- * Be pessimistic here about the number of those free blocks which
- * might be required for log descriptor control blocks.
- */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
- left -= MIN_LOG_RESERVED_BLOCKS;
-
- if (left <= 0)
- return 0;
- left -= (left >> 3);
- return left;
-}
-
-/*
- * Called under j_state_lock. Returns true if a transaction commit was started.
- */
-int __log_start_commit(journal_t *journal, tid_t target)
-{
- /*
- * The only transaction we can possibly wait upon is the
- * currently running transaction (if it exists). Otherwise,
- * the target tid must be an old one.
- */
- if (journal->j_commit_request != target &&
- journal->j_running_transaction &&
- journal->j_running_transaction->t_tid == target) {
- /*
- * We want a new commit: OK, mark the request and wakeup the
- * commit thread. We do _not_ do the commit ourselves.
- */
-
- journal->j_commit_request = target;
- jbd_debug(1, "JBD: requesting commit %d/%d\n",
- journal->j_commit_request,
- journal->j_commit_sequence);
- wake_up(&journal->j_wait_commit);
- return 1;
- } else if (!tid_geq(journal->j_commit_request, target))
- /* This should never happen, but if it does, preserve
- the evidence before kjournald goes into a loop and
- increments j_commit_sequence beyond all recognition. */
- WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
- journal->j_commit_request, journal->j_commit_sequence,
- target, journal->j_running_transaction ?
- journal->j_running_transaction->t_tid : 0);
- return 0;
-}
-
-int log_start_commit(journal_t *journal, tid_t tid)
-{
- int ret;
-
- spin_lock(&journal->j_state_lock);
- ret = __log_start_commit(journal, tid);
- spin_unlock(&journal->j_state_lock);
- return ret;
-}
-
-/*
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
- */
-int journal_force_commit_nested(journal_t *journal)
-{
- transaction_t *transaction = NULL;
- tid_t tid;
-
- spin_lock(&journal->j_state_lock);
- if (journal->j_running_transaction && !current->journal_info) {
- transaction = journal->j_running_transaction;
- __log_start_commit(journal, transaction->t_tid);
- } else if (journal->j_committing_transaction)
- transaction = journal->j_committing_transaction;
-
- if (!transaction) {
- spin_unlock(&journal->j_state_lock);
- return 0; /* Nothing to retry */
- }
-
- tid = transaction->t_tid;
- spin_unlock(&journal->j_state_lock);
- log_wait_commit(journal, tid);
- return 1;
-}
-
-/*
- * Start a commit of the current running transaction (if any). Returns true
- * if a transaction is going to be committed (or is currently already
- * committing), and fills its tid in at *ptid
- */
-int journal_start_commit(journal_t *journal, tid_t *ptid)
-{
- int ret = 0;
-
- spin_lock(&journal->j_state_lock);
- if (journal->j_running_transaction) {
- tid_t tid = journal->j_running_transaction->t_tid;
-
- __log_start_commit(journal, tid);
- /* There's a running transaction and we've just made sure
- * it's commit has been scheduled. */
- if (ptid)
- *ptid = tid;
- ret = 1;
- } else if (journal->j_committing_transaction) {
- /*
- * If commit has been started, then we have to wait for
- * completion of that transaction.
- */
- if (ptid)
- *ptid = journal->j_committing_transaction->t_tid;
- ret = 1;
- }
- spin_unlock(&journal->j_state_lock);
- return ret;
-}
-
-/*
- * Wait for a specified commit to complete.
- * The caller may not hold the journal lock.
- */
-int log_wait_commit(journal_t *journal, tid_t tid)
-{
- int err = 0;
-
-#ifdef CONFIG_JBD_DEBUG
- spin_lock(&journal->j_state_lock);
- if (!tid_geq(journal->j_commit_request, tid)) {
- printk(KERN_ERR
- "%s: error: j_commit_request=%d, tid=%d\n",
- __func__, journal->j_commit_request, tid);
- }
- spin_unlock(&journal->j_state_lock);
-#endif
- spin_lock(&journal->j_state_lock);
- /*
- * Not running or committing trans? Must be already committed. This
- * saves us from waiting for a *long* time when tid overflows.
- */
- if (!((journal->j_running_transaction &&
- journal->j_running_transaction->t_tid == tid) ||
- (journal->j_committing_transaction &&
- journal->j_committing_transaction->t_tid == tid)))
- goto out_unlock;
-
- if (!tid_geq(journal->j_commit_waited, tid))
- journal->j_commit_waited = tid;
- while (tid_gt(tid, journal->j_commit_sequence)) {
- jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
- tid, journal->j_commit_sequence);
- wake_up(&journal->j_wait_commit);
- spin_unlock(&journal->j_state_lock);
- wait_event(journal->j_wait_done_commit,
- !tid_gt(tid, journal->j_commit_sequence));
- spin_lock(&journal->j_state_lock);
- }
-out_unlock:
- spin_unlock(&journal->j_state_lock);
-
- if (unlikely(is_journal_aborted(journal)))
- err = -EIO;
- return err;
-}
-
-/*
- * Return 1 if a given transaction has not yet sent barrier request
- * connected with a transaction commit. If 0 is returned, transaction
- * may or may not have sent the barrier. Used to avoid sending barrier
- * twice in common cases.
- */
-int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
-{
- int ret = 0;
- transaction_t *commit_trans;
-
- if (!(journal->j_flags & JFS_BARRIER))
- return 0;
- spin_lock(&journal->j_state_lock);
- /* Transaction already committed? */
- if (tid_geq(journal->j_commit_sequence, tid))
- goto out;
- /*
- * Transaction is being committed and we already proceeded to
- * writing commit record?
- */
- commit_trans = journal->j_committing_transaction;
- if (commit_trans && commit_trans->t_tid == tid &&
- commit_trans->t_state >= T_COMMIT_RECORD)
- goto out;
- ret = 1;
-out:
- spin_unlock(&journal->j_state_lock);
- return ret;
-}
-EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
-
-/*
- * Log buffer allocation routines:
- */
-
-int journal_next_log_block(journal_t *journal, unsigned int *retp)
-{
- unsigned int blocknr;
-
- spin_lock(&journal->j_state_lock);
- J_ASSERT(journal->j_free > 1);
-
- blocknr = journal->j_head;
- journal->j_head++;
- journal->j_free--;
- if (journal->j_head == journal->j_last)
- journal->j_head = journal->j_first;
- spin_unlock(&journal->j_state_lock);
- return journal_bmap(journal, blocknr, retp);
-}
-
-/*
- * Conversion of logical to physical block numbers for the journal
- *
- * On external journals the journal blocks are identity-mapped, so
- * this is a no-op. If needed, we can use j_blk_offset - everything is
- * ready.
- */
-int journal_bmap(journal_t *journal, unsigned int blocknr,
- unsigned int *retp)
-{
- int err = 0;
- unsigned int ret;
-
- if (journal->j_inode) {
- ret = bmap(journal->j_inode, blocknr);
- if (ret)
- *retp = ret;
- else {
- char b[BDEVNAME_SIZE];
-
- printk(KERN_ALERT "%s: journal block not found "
- "at offset %u on %s\n",
- __func__,
- blocknr,
- bdevname(journal->j_dev, b));
- err = -EIO;
- __journal_abort_soft(journal, err);
- }
- } else {
- *retp = blocknr; /* +journal->j_blk_offset */
- }
- return err;
-}
-
-/*
- * We play buffer_head aliasing tricks to write data/metadata blocks to
- * the journal without copying their contents, but for journal
- * descriptor blocks we do need to generate bona fide buffers.
- *
- * After the caller of journal_get_descriptor_buffer() has finished modifying
- * the buffer's contents they really should run flush_dcache_page(bh->b_page).
- * But we don't bother doing that, so there will be coherency problems with
- * mmaps of blockdevs which hold live JBD-controlled filesystems.
- */
-struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
-{
- struct buffer_head *bh;
- unsigned int blocknr;
- int err;
-
- err = journal_next_log_block(journal, &blocknr);
-
- if (err)
- return NULL;
-
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh)
- return NULL;
- lock_buffer(bh);
- memset(bh->b_data, 0, journal->j_blocksize);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "return this buffer");
- return journal_add_journal_head(bh);
-}
-
-/*
- * Management for journal control blocks: functions to create and
- * destroy journal_t structures, and to initialise and read existing
- * journal blocks from disk. */
-
-/* First: create and setup a journal_t object in memory. We initialise
- * very few fields yet: that has to wait until we have created the
- * journal structures from from scratch, or loaded them from disk. */
-
-static journal_t * journal_init_common (void)
-{
- journal_t *journal;
- int err;
-
- journal = kzalloc(sizeof(*journal), GFP_KERNEL);
- if (!journal)
- goto fail;
-
- init_waitqueue_head(&journal->j_wait_transaction_locked);
- init_waitqueue_head(&journal->j_wait_logspace);
- init_waitqueue_head(&journal->j_wait_done_commit);
- init_waitqueue_head(&journal->j_wait_checkpoint);
- init_waitqueue_head(&journal->j_wait_commit);
- init_waitqueue_head(&journal->j_wait_updates);
- mutex_init(&journal->j_checkpoint_mutex);
- spin_lock_init(&journal->j_revoke_lock);
- spin_lock_init(&journal->j_list_lock);
- spin_lock_init(&journal->j_state_lock);
-
- journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
-
- /* The journal is marked for error until we succeed with recovery! */
- journal->j_flags = JFS_ABORT;
-
- /* Set up a default-sized revoke table for the new mount. */
- err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
- if (err) {
- kfree(journal);
- goto fail;
- }
- return journal;
-fail:
- return NULL;
-}
-
-/* journal_init_dev and journal_init_inode:
- *
- * Create a journal structure assigned some fixed set of disk blocks to
- * the journal. We don't actually touch those disk blocks yet, but we
- * need to set up all of the mapping information to tell the journaling
- * system where the journal blocks are.
- *
- */
-
-/**
- * journal_t * journal_init_dev() - creates and initialises a journal structure
- * @bdev: Block device on which to create the journal
- * @fs_dev: Device which hold journalled filesystem for this journal.
- * @start: Block nr Start of journal.
- * @len: Length of the journal in blocks.
- * @blocksize: blocksize of journalling device
- *
- * Returns: a newly created journal_t *
- *
- * journal_init_dev creates a journal which maps a fixed contiguous
- * range of blocks on an arbitrary block device.
- *
- */
-journal_t * journal_init_dev(struct block_device *bdev,
- struct block_device *fs_dev,
- int start, int len, int blocksize)
-{
- journal_t *journal = journal_init_common();
- struct buffer_head *bh;
- int n;
-
- if (!journal)
- return NULL;
-
- /* journal descriptor can store up to n blocks -bzzz */
- journal->j_blocksize = blocksize;
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
- journal->j_wbufsize = n;
- journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
- if (!journal->j_wbuf) {
- printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
- __func__);
- goto out_err;
- }
- journal->j_dev = bdev;
- journal->j_fs_dev = fs_dev;
- journal->j_blk_offset = start;
- journal->j_maxlen = len;
-
- bh = __getblk(journal->j_dev, start, journal->j_blocksize);
- if (!bh) {
- printk(KERN_ERR
- "%s: Cannot get buffer for journal superblock\n",
- __func__);
- goto out_err;
- }
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
- return journal;
-out_err:
- kfree(journal->j_wbuf);
- kfree(journal);
- return NULL;
-}
-
-/**
- * journal_t * journal_init_inode () - creates a journal which maps to a inode.
- * @inode: An inode to create the journal in
- *
- * journal_init_inode creates a journal which maps an on-disk inode as
- * the journal. The inode must exist already, must support bmap() and
- * must have all data blocks preallocated.
- */
-journal_t * journal_init_inode (struct inode *inode)
-{
- struct buffer_head *bh;
- journal_t *journal = journal_init_common();
- int err;
- int n;
- unsigned int blocknr;
-
- if (!journal)
- return NULL;
-
- journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
- journal->j_inode = inode;
- jbd_debug(1,
- "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
- journal, inode->i_sb->s_id, inode->i_ino,
- (long long) inode->i_size,
- inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
-
- journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
- journal->j_blocksize = inode->i_sb->s_blocksize;
-
- /* journal descriptor can store up to n blocks -bzzz */
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
- journal->j_wbufsize = n;
- journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
- if (!journal->j_wbuf) {
- printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
- __func__);
- goto out_err;
- }
-
- err = journal_bmap(journal, 0, &blocknr);
- /* If that failed, give up */
- if (err) {
- printk(KERN_ERR "%s: Cannot locate journal superblock\n",
- __func__);
- goto out_err;
- }
-
- bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh) {
- printk(KERN_ERR
- "%s: Cannot get buffer for journal superblock\n",
- __func__);
- goto out_err;
- }
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
- return journal;
-out_err:
- kfree(journal->j_wbuf);
- kfree(journal);
- return NULL;
-}
-
-/*
- * If the journal init or create aborts, we need to mark the journal
- * superblock as being NULL to prevent the journal destroy from writing
- * back a bogus superblock.
- */
-static void journal_fail_superblock (journal_t *journal)
-{
- struct buffer_head *bh = journal->j_sb_buffer;
- brelse(bh);
- journal->j_sb_buffer = NULL;
-}
-
-/*
- * Given a journal_t structure, initialise the various fields for
- * startup of a new journaling session. We use this both when creating
- * a journal, and after recovering an old journal to reset it for
- * subsequent use.
- */
-
-static int journal_reset(journal_t *journal)
-{
- journal_superblock_t *sb = journal->j_superblock;
- unsigned int first, last;
-
- first = be32_to_cpu(sb->s_first);
- last = be32_to_cpu(sb->s_maxlen);
- if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
- printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
- first, last);
- journal_fail_superblock(journal);
- return -EINVAL;
- }
-
- journal->j_first = first;
- journal->j_last = last;
-
- journal->j_head = first;
- journal->j_tail = first;
- journal->j_free = last - first;
-
- journal->j_tail_sequence = journal->j_transaction_sequence;
- journal->j_commit_sequence = journal->j_transaction_sequence - 1;
- journal->j_commit_request = journal->j_commit_sequence;
-
- journal->j_max_transaction_buffers = journal->j_maxlen / 4;
-
- /*
- * As a special case, if the on-disk copy is already marked as needing
- * no recovery (s_start == 0), then we can safely defer the superblock
- * update until the next commit by setting JFS_FLUSHED. This avoids
- * attempting a write to a potential-readonly device.
- */
- if (sb->s_start == 0) {
- jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
- "(start %u, seq %d, errno %d)\n",
- journal->j_tail, journal->j_tail_sequence,
- journal->j_errno);
- journal->j_flags |= JFS_FLUSHED;
- } else {
- /* Lock here to make assertions happy... */
- mutex_lock(&journal->j_checkpoint_mutex);
- /*
- * Update log tail information. We use WRITE_FUA since new
- * transaction will start reusing journal space and so we
- * must make sure information about current log tail is on
- * disk before that.
- */
- journal_update_sb_log_tail(journal,
- journal->j_tail_sequence,
- journal->j_tail,
- WRITE_FUA);
- mutex_unlock(&journal->j_checkpoint_mutex);
- }
- return journal_start_thread(journal);
-}
-
-/**
- * int journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int journal_create(journal_t *journal)
-{
- unsigned int blocknr;
- struct buffer_head *bh;
- journal_superblock_t *sb;
- int i, err;
-
- if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
- printk (KERN_ERR "Journal length (%d blocks) too short.\n",
- journal->j_maxlen);
- journal_fail_superblock(journal);
- return -EINVAL;
- }
-
- if (journal->j_inode == NULL) {
- /*
- * We don't know what block to start at!
- */
- printk(KERN_EMERG
- "%s: creation of journal on external device!\n",
- __func__);
- BUG();
- }
-
- /* Zero out the entire journal on disk. We cannot afford to
- have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
- jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
- for (i = 0; i < journal->j_maxlen; i++) {
- err = journal_bmap(journal, i, &blocknr);
- if (err)
- return err;
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (unlikely(!bh))
- return -ENOMEM;
- lock_buffer(bh);
- memset (bh->b_data, 0, journal->j_blocksize);
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- BUFFER_TRACE(bh, "marking uptodate");
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- __brelse(bh);
- }
-
- sync_blockdev(journal->j_dev);
- jbd_debug(1, "JBD: journal cleared.\n");
-
- /* OK, fill in the initial static fields in the new superblock */
- sb = journal->j_superblock;
-
- sb->s_header.h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
-
- sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
- sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
- sb->s_first = cpu_to_be32(1);
-
- journal->j_transaction_sequence = 1;
-
- journal->j_flags &= ~JFS_ABORT;
- journal->j_format_version = 2;
-
- return journal_reset(journal);
-}
-
-static void journal_write_superblock(journal_t *journal, int write_op)
-{
- struct buffer_head *bh = journal->j_sb_buffer;
- int ret;
-
- trace_journal_write_superblock(journal, write_op);
- if (!(journal->j_flags & JFS_BARRIER))
- write_op &= ~(REQ_FUA | REQ_FLUSH);
- lock_buffer(bh);
- if (buffer_write_io_error(bh)) {
- char b[BDEVNAME_SIZE];
- /*
- * Oh, dear. A previous attempt to write the journal
- * superblock failed. This could happen because the
- * USB device was yanked out. Or it could happen to
- * be a transient write error and maybe the block will
- * be remapped. Nothing we can do but to retry the
- * write and hope for the best.
- */
- printk(KERN_ERR "JBD: previous I/O error detected "
- "for journal superblock update for %s.\n",
- journal_dev_name(journal, b));
- clear_buffer_write_io_error(bh);
- set_buffer_uptodate(bh);
- }
-
- get_bh(bh);
- bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(write_op, bh);
- wait_on_buffer(bh);
- if (buffer_write_io_error(bh)) {
- clear_buffer_write_io_error(bh);
- set_buffer_uptodate(bh);
- ret = -EIO;
- }
- if (ret) {
- char b[BDEVNAME_SIZE];
- printk(KERN_ERR "JBD: Error %d detected "
- "when updating journal superblock for %s.\n",
- ret, journal_dev_name(journal, b));
- }
-}
-
-/**
- * journal_update_sb_log_tail() - Update log tail in journal sb on disk.
- * @journal: The journal to update.
- * @tail_tid: TID of the new transaction at the tail of the log
- * @tail_block: The first block of the transaction at the tail of the log
- * @write_op: With which operation should we write the journal sb
- *
- * Update a journal's superblock information about log tail and write it to
- * disk, waiting for the IO to complete.
- */
-void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
- unsigned int tail_block, int write_op)
-{
- journal_superblock_t *sb = journal->j_superblock;
-
- BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n",
- tail_block, tail_tid);
-
- sb->s_sequence = cpu_to_be32(tail_tid);
- sb->s_start = cpu_to_be32(tail_block);
-
- journal_write_superblock(journal, write_op);
-
- /* Log is no longer empty */
- spin_lock(&journal->j_state_lock);
- WARN_ON(!sb->s_sequence);
- journal->j_flags &= ~JFS_FLUSHED;
- spin_unlock(&journal->j_state_lock);
-}
-
-/**
- * mark_journal_empty() - Mark on disk journal as empty.
- * @journal: The journal to update.
- *
- * Update a journal's dynamic superblock fields to show that journal is empty.
- * Write updated superblock to disk waiting for IO to complete.
- */
-static void mark_journal_empty(journal_t *journal)
-{
- journal_superblock_t *sb = journal->j_superblock;
-
- BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
- spin_lock(&journal->j_state_lock);
- /* Is it already empty? */
- if (sb->s_start == 0) {
- spin_unlock(&journal->j_state_lock);
- return;
- }
- jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
- journal->j_tail_sequence);
-
- sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
- sb->s_start = cpu_to_be32(0);
- spin_unlock(&journal->j_state_lock);
-
- journal_write_superblock(journal, WRITE_FUA);
-
- spin_lock(&journal->j_state_lock);
- /* Log is empty */
- journal->j_flags |= JFS_FLUSHED;
- spin_unlock(&journal->j_state_lock);
-}
-
-/**
- * journal_update_sb_errno() - Update error in the journal.
- * @journal: The journal to update.
- *
- * Update a journal's errno. Write updated superblock to disk waiting for IO
- * to complete.
- */
-static void journal_update_sb_errno(journal_t *journal)
-{
- journal_superblock_t *sb = journal->j_superblock;
-
- spin_lock(&journal->j_state_lock);
- jbd_debug(1, "JBD: updating superblock error (errno %d)\n",
- journal->j_errno);
- sb->s_errno = cpu_to_be32(journal->j_errno);
- spin_unlock(&journal->j_state_lock);
-
- journal_write_superblock(journal, WRITE_SYNC);
-}
-
-/*
- * Read the superblock for a given journal, performing initial
- * validation of the format.
- */
-
-static int journal_get_superblock(journal_t *journal)
-{
- struct buffer_head *bh;
- journal_superblock_t *sb;
- int err = -EIO;
-
- bh = journal->j_sb_buffer;
-
- J_ASSERT(bh != NULL);
- if (!buffer_uptodate(bh)) {
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- printk (KERN_ERR
- "JBD: IO error reading journal superblock\n");
- goto out;
- }
- }
-
- sb = journal->j_superblock;
-
- err = -EINVAL;
-
- if (sb->s_header.h_magic != cpu_to_be32(JFS_MAGIC_NUMBER) ||
- sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
- printk(KERN_WARNING "JBD: no valid journal superblock found\n");
- goto out;
- }
-
- switch(be32_to_cpu(sb->s_header.h_blocktype)) {
- case JFS_SUPERBLOCK_V1:
- journal->j_format_version = 1;
- break;
- case JFS_SUPERBLOCK_V2:
- journal->j_format_version = 2;
- break;
- default:
- printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
- journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
- else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
- printk (KERN_WARNING "JBD: journal file too short\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_first) == 0 ||
- be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
- printk(KERN_WARNING
- "JBD: Invalid start block of journal: %u\n",
- be32_to_cpu(sb->s_first));
- goto out;
- }
-
- return 0;
-
-out:
- journal_fail_superblock(journal);
- return err;
-}
-
-/*
- * Load the on-disk journal superblock and read the key fields into the
- * journal_t.
- */
-
-static int load_superblock(journal_t *journal)
-{
- int err;
- journal_superblock_t *sb;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
- journal->j_tail = be32_to_cpu(sb->s_start);
- journal->j_first = be32_to_cpu(sb->s_first);
- journal->j_last = be32_to_cpu(sb->s_maxlen);
- journal->j_errno = be32_to_cpu(sb->s_errno);
-
- return 0;
-}
-
-
-/**
- * int journal_load() - Read journal from disk.
- * @journal: Journal to act on.
- *
- * Given a journal_t structure which tells us which disk blocks contain
- * a journal, read the journal from disk to initialise the in-memory
- * structures.
- */
-int journal_load(journal_t *journal)
-{
- int err;
- journal_superblock_t *sb;
-
- err = load_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
- /* If this is a V2 superblock, then we have to check the
- * features flags on it. */
-
- if (journal->j_format_version >= 2) {
- if ((sb->s_feature_ro_compat &
- ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
- (sb->s_feature_incompat &
- ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
- printk (KERN_WARNING
- "JBD: Unrecognised features on journal\n");
- return -EINVAL;
- }
- }
-
- /* Let the recovery code check whether it needs to recover any
- * data from the journal. */
- if (journal_recover(journal))
- goto recovery_error;
-
- /* OK, we've finished with the dynamic journal bits:
- * reinitialise the dynamic contents of the superblock in memory
- * and reset them on disk. */
- if (journal_reset(journal))
- goto recovery_error;
-
- journal->j_flags &= ~JFS_ABORT;
- journal->j_flags |= JFS_LOADED;
- return 0;
-
-recovery_error:
- printk (KERN_WARNING "JBD: recovery failed\n");
- return -EIO;
-}
-
-/**
- * void journal_destroy() - Release a journal_t structure.
- * @journal: Journal to act on.
- *
- * Release a journal_t structure once it is no longer in use by the
- * journaled object.
- * Return <0 if we couldn't clean up the journal.
- */
-int journal_destroy(journal_t *journal)
-{
- int err = 0;
-
-
- /* Wait for the commit thread to wake up and die. */
- journal_kill_thread(journal);
-
- /* Force a final log commit */
- if (journal->j_running_transaction)
- journal_commit_transaction(journal);
-
- /* Force any old transactions to disk */
-
- /* We cannot race with anybody but must keep assertions happy */
- mutex_lock(&journal->j_checkpoint_mutex);
- /* Totally anal locking here... */
- spin_lock(&journal->j_list_lock);
- while (journal->j_checkpoint_transactions != NULL) {
- spin_unlock(&journal->j_list_lock);
- log_do_checkpoint(journal);
- spin_lock(&journal->j_list_lock);
- }
-
- J_ASSERT(journal->j_running_transaction == NULL);
- J_ASSERT(journal->j_committing_transaction == NULL);
- J_ASSERT(journal->j_checkpoint_transactions == NULL);
- spin_unlock(&journal->j_list_lock);
-
- if (journal->j_sb_buffer) {
- if (!is_journal_aborted(journal)) {
- journal->j_tail_sequence =
- ++journal->j_transaction_sequence;
- mark_journal_empty(journal);
- } else
- err = -EIO;
- brelse(journal->j_sb_buffer);
- }
- mutex_unlock(&journal->j_checkpoint_mutex);
-
- iput(journal->j_inode);
- if (journal->j_revoke)
- journal_destroy_revoke(journal);
- kfree(journal->j_wbuf);
- kfree(journal);
-
- return err;
-}
-
-
-/**
- *int journal_check_used_features () - Check if features specified are used.
- * @journal: Journal to check.
- * @compat: bitmask of compatible features
- * @ro: bitmask of features that force read-only mount
- * @incompat: bitmask of incompatible features
- *
- * Check whether the journal uses all of a given set of
- * features. Return true (non-zero) if it does.
- **/
-
-int journal_check_used_features (journal_t *journal, unsigned long compat,
- unsigned long ro, unsigned long incompat)
-{
- journal_superblock_t *sb;
-
- if (!compat && !ro && !incompat)
- return 1;
- if (journal->j_format_version == 1)
- return 0;
-
- sb = journal->j_superblock;
-
- if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
- ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
- ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
- return 1;
-
- return 0;
-}
-
-/**
- * int journal_check_available_features() - Check feature set in journalling layer
- * @journal: Journal to check.
- * @compat: bitmask of compatible features
- * @ro: bitmask of features that force read-only mount
- * @incompat: bitmask of incompatible features
- *
- * Check whether the journaling code supports the use of
- * all of a given set of features on this journal. Return true
- * (non-zero) if it can. */
-
-int journal_check_available_features (journal_t *journal, unsigned long compat,
- unsigned long ro, unsigned long incompat)
-{
- if (!compat && !ro && !incompat)
- return 1;
-
- /* We can support any known requested features iff the
- * superblock is in version 2. Otherwise we fail to support any
- * extended sb features. */
-
- if (journal->j_format_version != 2)
- return 0;
-
- if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat &&
- (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
- (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
- return 1;
-
- return 0;
-}
-
-/**
- * int journal_set_features () - Mark a given journal feature in the superblock
- * @journal: Journal to act on.
- * @compat: bitmask of compatible features
- * @ro: bitmask of features that force read-only mount
- * @incompat: bitmask of incompatible features
- *
- * Mark a given journal feature as present on the
- * superblock. Returns true if the requested features could be set.
- *
- */
-
-int journal_set_features (journal_t *journal, unsigned long compat,
- unsigned long ro, unsigned long incompat)
-{
- journal_superblock_t *sb;
-
- if (journal_check_used_features(journal, compat, ro, incompat))
- return 1;
-
- if (!journal_check_available_features(journal, compat, ro, incompat))
- return 0;
-
- jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
- compat, ro, incompat);
-
- sb = journal->j_superblock;
-
- sb->s_feature_compat |= cpu_to_be32(compat);
- sb->s_feature_ro_compat |= cpu_to_be32(ro);
- sb->s_feature_incompat |= cpu_to_be32(incompat);
-
- return 1;
-}
-
-
-/**
- * int journal_update_format () - Update on-disk journal structure.
- * @journal: Journal to act on.
- *
- * Given an initialised but unloaded journal struct, poke about in the
- * on-disk structure to update it to the most recent supported version.
- */
-int journal_update_format (journal_t *journal)
-{
- journal_superblock_t *sb;
- int err;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- switch (be32_to_cpu(sb->s_header.h_blocktype)) {
- case JFS_SUPERBLOCK_V2:
- return 0;
- case JFS_SUPERBLOCK_V1:
- return journal_convert_superblock_v1(journal, sb);
- default:
- break;
- }
- return -EINVAL;
-}
-
-static int journal_convert_superblock_v1(journal_t *journal,
- journal_superblock_t *sb)
-{
- int offset, blocksize;
- struct buffer_head *bh;
-
- printk(KERN_WARNING
- "JBD: Converting superblock from version 1 to 2.\n");
-
- /* Pre-initialise new fields to zero */
- offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
- blocksize = be32_to_cpu(sb->s_blocksize);
- memset(&sb->s_feature_compat, 0, blocksize-offset);
-
- sb->s_nr_users = cpu_to_be32(1);
- sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
- journal->j_format_version = 2;
-
- bh = journal->j_sb_buffer;
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- return 0;
-}
-
-
-/**
- * int journal_flush () - Flush journal
- * @journal: Journal to act on.
- *
- * Flush all data for a given journal to disk and empty the journal.
- * Filesystems can use this when remounting readonly to ensure that
- * recovery does not need to happen on remount.
- */
-
-int journal_flush(journal_t *journal)
-{
- int err = 0;
- transaction_t *transaction = NULL;
-
- spin_lock(&journal->j_state_lock);
-
- /* Force everything buffered to the log... */
- if (journal->j_running_transaction) {
- transaction = journal->j_running_transaction;
- __log_start_commit(journal, transaction->t_tid);
- } else if (journal->j_committing_transaction)
- transaction = journal->j_committing_transaction;
-
- /* Wait for the log commit to complete... */
- if (transaction) {
- tid_t tid = transaction->t_tid;
-
- spin_unlock(&journal->j_state_lock);
- log_wait_commit(journal, tid);
- } else {
- spin_unlock(&journal->j_state_lock);
- }
-
- /* ...and flush everything in the log out to disk. */
- spin_lock(&journal->j_list_lock);
- while (!err && journal->j_checkpoint_transactions != NULL) {
- spin_unlock(&journal->j_list_lock);
- mutex_lock(&journal->j_checkpoint_mutex);
- err = log_do_checkpoint(journal);
- mutex_unlock(&journal->j_checkpoint_mutex);
- spin_lock(&journal->j_list_lock);
- }
- spin_unlock(&journal->j_list_lock);
-
- if (is_journal_aborted(journal))
- return -EIO;
-
- mutex_lock(&journal->j_checkpoint_mutex);
- cleanup_journal_tail(journal);
-
- /* Finally, mark the journal as really needing no recovery.
- * This sets s_start==0 in the underlying superblock, which is
- * the magic code for a fully-recovered superblock. Any future
- * commits of data to the journal will restore the current
- * s_start value. */
- mark_journal_empty(journal);
- mutex_unlock(&journal->j_checkpoint_mutex);
- spin_lock(&journal->j_state_lock);
- J_ASSERT(!journal->j_running_transaction);
- J_ASSERT(!journal->j_committing_transaction);
- J_ASSERT(!journal->j_checkpoint_transactions);
- J_ASSERT(journal->j_head == journal->j_tail);
- J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
- spin_unlock(&journal->j_state_lock);
- return 0;
-}
-
-/**
- * int journal_wipe() - Wipe journal contents
- * @journal: Journal to act on.
- * @write: flag (see below)
- *
- * Wipe out all of the contents of a journal, safely. This will produce
- * a warning if the journal contains any valid recovery information.
- * Must be called between journal_init_*() and journal_load().
- *
- * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
- * we merely suppress recovery.
- */
-
-int journal_wipe(journal_t *journal, int write)
-{
- int err = 0;
-
- J_ASSERT (!(journal->j_flags & JFS_LOADED));
-
- err = load_superblock(journal);
- if (err)
- return err;
-
- if (!journal->j_tail)
- goto no_recovery;
-
- printk (KERN_WARNING "JBD: %s recovery information on journal\n",
- write ? "Clearing" : "Ignoring");
-
- err = journal_skip_recovery(journal);
- if (write) {
- /* Lock to make assertions happy... */
- mutex_lock(&journal->j_checkpoint_mutex);
- mark_journal_empty(journal);
- mutex_unlock(&journal->j_checkpoint_mutex);
- }
-
- no_recovery:
- return err;
-}
-
-/*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
- struct block_device *bdev;
-
- if (journal->j_inode)
- bdev = journal->j_inode->i_sb->s_bdev;
- else
- bdev = journal->j_dev;
-
- return bdevname(bdev, buffer);
-}
-
-/*
- * Journal abort has very specific semantics, which we describe
- * for journal abort.
- *
- * Two internal function, which provide abort to te jbd layer
- * itself are here.
- */
-
-/*
- * Quick version for internal journal use (doesn't lock the journal).
- * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
- * and don't attempt to make any other journal updates.
- */
-static void __journal_abort_hard(journal_t *journal)
-{
- transaction_t *transaction;
- char b[BDEVNAME_SIZE];
-
- if (journal->j_flags & JFS_ABORT)
- return;
-
- printk(KERN_ERR "Aborting journal on device %s.\n",
- journal_dev_name(journal, b));
-
- spin_lock(&journal->j_state_lock);
- journal->j_flags |= JFS_ABORT;
- transaction = journal->j_running_transaction;
- if (transaction)
- __log_start_commit(journal, transaction->t_tid);
- spin_unlock(&journal->j_state_lock);
-}
-
-/* Soft abort: record the abort error status in the journal superblock,
- * but don't do any other IO. */
-static void __journal_abort_soft (journal_t *journal, int errno)
-{
- if (journal->j_flags & JFS_ABORT)
- return;
-
- if (!journal->j_errno)
- journal->j_errno = errno;
-
- __journal_abort_hard(journal);
-
- if (errno)
- journal_update_sb_errno(journal);
-}
-
-/**
- * void journal_abort () - Shutdown the journal immediately.
- * @journal: the journal to shutdown.
- * @errno: an error number to record in the journal indicating
- * the reason for the shutdown.
- *
- * Perform a complete, immediate shutdown of the ENTIRE
- * journal (not of a single transaction). This operation cannot be
- * undone without closing and reopening the journal.
- *
- * The journal_abort function is intended to support higher level error
- * recovery mechanisms such as the ext2/ext3 remount-readonly error
- * mode.
- *
- * Journal abort has very specific semantics. Any existing dirty,
- * unjournaled buffers in the main filesystem will still be written to
- * disk by bdflush, but the journaling mechanism will be suspended
- * immediately and no further transaction commits will be honoured.
- *
- * Any dirty, journaled buffers will be written back to disk without
- * hitting the journal. Atomicity cannot be guaranteed on an aborted
- * filesystem, but we _do_ attempt to leave as much data as possible
- * behind for fsck to use for cleanup.
- *
- * Any attempt to get a new transaction handle on a journal which is in
- * ABORT state will just result in an -EROFS error return. A
- * journal_stop on an existing handle will return -EIO if we have
- * entered abort state during the update.
- *
- * Recursive transactions are not disturbed by journal abort until the
- * final journal_stop, which will receive the -EIO error.
- *
- * Finally, the journal_abort call allows the caller to supply an errno
- * which will be recorded (if possible) in the journal superblock. This
- * allows a client to record failure conditions in the middle of a
- * transaction without having to complete the transaction to record the
- * failure to disk. ext3_error, for example, now uses this
- * functionality.
- *
- * Errors which originate from within the journaling layer will NOT
- * supply an errno; a null errno implies that absolutely no further
- * writes are done to the journal (unless there are any already in
- * progress).
- *
- */
-
-void journal_abort(journal_t *journal, int errno)
-{
- __journal_abort_soft(journal, errno);
-}
-
-/**
- * int journal_errno () - returns the journal's error state.
- * @journal: journal to examine.
- *
- * This is the errno numbet set with journal_abort(), the last
- * time the journal was mounted - if the journal was stopped
- * without calling abort this will be 0.
- *
- * If the journal has been aborted on this mount time -EROFS will
- * be returned.
- */
-int journal_errno(journal_t *journal)
-{
- int err;
-
- spin_lock(&journal->j_state_lock);
- if (journal->j_flags & JFS_ABORT)
- err = -EROFS;
- else
- err = journal->j_errno;
- spin_unlock(&journal->j_state_lock);
- return err;
-}
-
-/**
- * int journal_clear_err () - clears the journal's error state
- * @journal: journal to act on.
- *
- * An error must be cleared or Acked to take a FS out of readonly
- * mode.
- */
-int journal_clear_err(journal_t *journal)
-{
- int err = 0;
-
- spin_lock(&journal->j_state_lock);
- if (journal->j_flags & JFS_ABORT)
- err = -EROFS;
- else
- journal->j_errno = 0;
- spin_unlock(&journal->j_state_lock);
- return err;
-}
-
-/**
- * void journal_ack_err() - Ack journal err.
- * @journal: journal to act on.
- *
- * An error must be cleared or Acked to take a FS out of readonly
- * mode.
- */
-void journal_ack_err(journal_t *journal)
-{
- spin_lock(&journal->j_state_lock);
- if (journal->j_errno)
- journal->j_flags |= JFS_ACK_ERR;
- spin_unlock(&journal->j_state_lock);
-}
-
-int journal_blocks_per_page(struct inode *inode)
-{
- return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-}
-
-/*
- * Journal_head storage management
- */
-static struct kmem_cache *journal_head_cache;
-#ifdef CONFIG_JBD_DEBUG
-static atomic_t nr_journal_heads = ATOMIC_INIT(0);
-#endif
-
-static int journal_init_journal_head_cache(void)
-{
- int retval;
-
- J_ASSERT(journal_head_cache == NULL);
- journal_head_cache = kmem_cache_create("journal_head",
- sizeof(struct journal_head),
- 0, /* offset */
- SLAB_TEMPORARY, /* flags */
- NULL); /* ctor */
- retval = 0;
- if (!journal_head_cache) {
- retval = -ENOMEM;
- printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
- }
- return retval;
-}
-
-static void journal_destroy_journal_head_cache(void)
-{
- if (journal_head_cache) {
- kmem_cache_destroy(journal_head_cache);
- journal_head_cache = NULL;
- }
-}
-
-/*
- * journal_head splicing and dicing
- */
-static struct journal_head *journal_alloc_journal_head(void)
-{
- struct journal_head *ret;
-
-#ifdef CONFIG_JBD_DEBUG
- atomic_inc(&nr_journal_heads);
-#endif
- ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
- if (ret == NULL) {
- jbd_debug(1, "out of memory for journal_head\n");
- printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
- __func__);
-
- while (ret == NULL) {
- yield();
- ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
- }
- }
- return ret;
-}
-
-static void journal_free_journal_head(struct journal_head *jh)
-{
-#ifdef CONFIG_JBD_DEBUG
- atomic_dec(&nr_journal_heads);
- memset(jh, JBD_POISON_FREE, sizeof(*jh));
-#endif
- kmem_cache_free(journal_head_cache, jh);
-}
-
-/*
- * A journal_head is attached to a buffer_head whenever JBD has an
- * interest in the buffer.
- *
- * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
- * is set. This bit is tested in core kernel code where we need to take
- * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
- * there.
- *
- * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
- *
- * When a buffer has its BH_JBD bit set it is immune from being released by
- * core kernel code, mainly via ->b_count.
- *
- * A journal_head is detached from its buffer_head when the journal_head's
- * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
- * transaction (b_cp_transaction) hold their references to b_jcount.
- *
- * Various places in the kernel want to attach a journal_head to a buffer_head
- * _before_ attaching the journal_head to a transaction. To protect the
- * journal_head in this situation, journal_add_journal_head elevates the
- * journal_head's b_jcount refcount by one. The caller must call
- * journal_put_journal_head() to undo this.
- *
- * So the typical usage would be:
- *
- * (Attach a journal_head if needed. Increments b_jcount)
- * struct journal_head *jh = journal_add_journal_head(bh);
- * ...
- * (Get another reference for transaction)
- * journal_grab_journal_head(bh);
- * jh->b_transaction = xxx;
- * (Put original reference)
- * journal_put_journal_head(jh);
- */
-
-/*
- * Give a buffer_head a journal_head.
- *
- * May sleep.
- */
-struct journal_head *journal_add_journal_head(struct buffer_head *bh)
-{
- struct journal_head *jh;
- struct journal_head *new_jh = NULL;
-
-repeat:
- if (!buffer_jbd(bh))
- new_jh = journal_alloc_journal_head();
-
- jbd_lock_bh_journal_head(bh);
- if (buffer_jbd(bh)) {
- jh = bh2jh(bh);
- } else {
- J_ASSERT_BH(bh,
- (atomic_read(&bh->b_count) > 0) ||
- (bh->b_page && bh->b_page->mapping));
-
- if (!new_jh) {
- jbd_unlock_bh_journal_head(bh);
- goto repeat;
- }
-
- jh = new_jh;
- new_jh = NULL; /* We consumed it */
- set_buffer_jbd(bh);
- bh->b_private = jh;
- jh->b_bh = bh;
- get_bh(bh);
- BUFFER_TRACE(bh, "added journal_head");
- }
- jh->b_jcount++;
- jbd_unlock_bh_journal_head(bh);
- if (new_jh)
- journal_free_journal_head(new_jh);
- return bh->b_private;
-}
-
-/*
- * Grab a ref against this buffer_head's journal_head. If it ended up not
- * having a journal_head, return NULL
- */
-struct journal_head *journal_grab_journal_head(struct buffer_head *bh)
-{
- struct journal_head *jh = NULL;
-
- jbd_lock_bh_journal_head(bh);
- if (buffer_jbd(bh)) {
- jh = bh2jh(bh);
- jh->b_jcount++;
- }
- jbd_unlock_bh_journal_head(bh);
- return jh;
-}
-
-static void __journal_remove_journal_head(struct buffer_head *bh)
-{
- struct journal_head *jh = bh2jh(bh);
-
- J_ASSERT_JH(jh, jh->b_jcount >= 0);
- J_ASSERT_JH(jh, jh->b_transaction == NULL);
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
- J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
- J_ASSERT_BH(bh, buffer_jbd(bh));
- J_ASSERT_BH(bh, jh2bh(jh) == bh);
- BUFFER_TRACE(bh, "remove journal_head");
- if (jh->b_frozen_data) {
- printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd_free(jh->b_frozen_data, bh->b_size);
- }
- if (jh->b_committed_data) {
- printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd_free(jh->b_committed_data, bh->b_size);
- }
- bh->b_private = NULL;
- jh->b_bh = NULL; /* debug, really */
- clear_buffer_jbd(bh);
- journal_free_journal_head(jh);
-}
-
-/*
- * Drop a reference on the passed journal_head. If it fell to zero then
- * release the journal_head from the buffer_head.
- */
-void journal_put_journal_head(struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- jbd_lock_bh_journal_head(bh);
- J_ASSERT_JH(jh, jh->b_jcount > 0);
- --jh->b_jcount;
- if (!jh->b_jcount) {
- __journal_remove_journal_head(bh);
- jbd_unlock_bh_journal_head(bh);
- __brelse(bh);
- } else
- jbd_unlock_bh_journal_head(bh);
-}
-
-/*
- * debugfs tunables
- */
-#ifdef CONFIG_JBD_DEBUG
-
-u8 journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(journal_enable_debug);
-
-static struct dentry *jbd_debugfs_dir;
-static struct dentry *jbd_debug;
-
-static void __init jbd_create_debugfs_entry(void)
-{
- jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
- if (jbd_debugfs_dir)
- jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
- jbd_debugfs_dir,
- &journal_enable_debug);
-}
-
-static void __exit jbd_remove_debugfs_entry(void)
-{
- debugfs_remove(jbd_debug);
- debugfs_remove(jbd_debugfs_dir);
-}
-
-#else
-
-static inline void jbd_create_debugfs_entry(void)
-{
-}
-
-static inline void jbd_remove_debugfs_entry(void)
-{
-}
-
-#endif
-
-struct kmem_cache *jbd_handle_cache;
-
-static int __init journal_init_handle_cache(void)
-{
- jbd_handle_cache = kmem_cache_create("journal_handle",
- sizeof(handle_t),
- 0, /* offset */
- SLAB_TEMPORARY, /* flags */
- NULL); /* ctor */
- if (jbd_handle_cache == NULL) {
- printk(KERN_EMERG "JBD: failed to create handle cache\n");
- return -ENOMEM;
- }
- return 0;
-}
-
-static void journal_destroy_handle_cache(void)
-{
- if (jbd_handle_cache)
- kmem_cache_destroy(jbd_handle_cache);
-}
-
-/*
- * Module startup and shutdown
- */
-
-static int __init journal_init_caches(void)
-{
- int ret;
-
- ret = journal_init_revoke_caches();
- if (ret == 0)
- ret = journal_init_journal_head_cache();
- if (ret == 0)
- ret = journal_init_handle_cache();
- return ret;
-}
-
-static void journal_destroy_caches(void)
-{
- journal_destroy_revoke_caches();
- journal_destroy_journal_head_cache();
- journal_destroy_handle_cache();
-}
-
-static int __init journal_init(void)
-{
- int ret;
-
- BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
-
- ret = journal_init_caches();
- if (ret != 0)
- journal_destroy_caches();
- jbd_create_debugfs_entry();
- return ret;
-}
-
-static void __exit journal_exit(void)
-{
-#ifdef CONFIG_JBD_DEBUG
- int n = atomic_read(&nr_journal_heads);
- if (n)
- printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
-#endif
- jbd_remove_debugfs_entry();
- journal_destroy_caches();
-}
-
-MODULE_LICENSE("GPL");
-module_init(journal_init);
-module_exit(journal_exit);
-
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
deleted file mode 100644
index a748fe2..0000000
--- a/fs/jbd/recovery.c
+++ /dev/null
@@ -1,594 +0,0 @@
-/*
- * linux/fs/jbd/recovery.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
- *
- * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Journal recovery routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.
- */
-
-#ifndef __KERNEL__
-#include "jfs_user.h"
-#else
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/blkdev.h>
-#endif
-
-/*
- * Maintain information about the progress of the recovery job, so that
- * the different passes can carry information between them.
- */
-struct recovery_info
-{
- tid_t start_transaction;
- tid_t end_transaction;
-
- int nr_replays;
- int nr_revokes;
- int nr_revoke_hits;
-};
-
-enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
-static int do_one_pass(journal_t *journal,
- struct recovery_info *info, enum passtype pass);
-static int scan_revoke_records(journal_t *, struct buffer_head *,
- tid_t, struct recovery_info *);
-
-#ifdef __KERNEL__
-
-/* Release readahead buffers after use */
-static void journal_brelse_array(struct buffer_head *b[], int n)
-{
- while (--n >= 0)
- brelse (b[n]);
-}
-
-
-/*
- * When reading from the journal, we are going through the block device
- * layer directly and so there is no readahead being done for us. We
- * need to implement any readahead ourselves if we want it to happen at
- * all. Recovery is basically one long sequential read, so make sure we
- * do the IO in reasonably large chunks.
- *
- * This is not so critical that we need to be enormously clever about
- * the readahead size, though. 128K is a purely arbitrary, good-enough
- * fixed value.
- */
-
-#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
-{
- int err;
- unsigned int max, nbufs, next;
- unsigned int blocknr;
- struct buffer_head *bh;
-
- struct buffer_head * bufs[MAXBUF];
-
- /* Do up to 128K of readahead */
- max = start + (128 * 1024 / journal->j_blocksize);
- if (max > journal->j_maxlen)
- max = journal->j_maxlen;
-
- /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
- * a time to the block device IO layer. */
-
- nbufs = 0;
-
- for (next = start; next < max; next++) {
- err = journal_bmap(journal, next, &blocknr);
-
- if (err) {
- printk (KERN_ERR "JBD: bad block at offset %u\n",
- next);
- goto failed;
- }
-
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh) {
- err = -ENOMEM;
- goto failed;
- }
-
- if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
- bufs[nbufs++] = bh;
- if (nbufs == MAXBUF) {
- ll_rw_block(READ, nbufs, bufs);
- journal_brelse_array(bufs, nbufs);
- nbufs = 0;
- }
- } else
- brelse(bh);
- }
-
- if (nbufs)
- ll_rw_block(READ, nbufs, bufs);
- err = 0;
-
-failed:
- if (nbufs)
- journal_brelse_array(bufs, nbufs);
- return err;
-}
-
-#endif /* __KERNEL__ */
-
-
-/*
- * Read a block from the journal
- */
-
-static int jread(struct buffer_head **bhp, journal_t *journal,
- unsigned int offset)
-{
- int err;
- unsigned int blocknr;
- struct buffer_head *bh;
-
- *bhp = NULL;
-
- if (offset >= journal->j_maxlen) {
- printk(KERN_ERR "JBD: corrupted journal superblock\n");
- return -EIO;
- }
-
- err = journal_bmap(journal, offset, &blocknr);
-
- if (err) {
- printk (KERN_ERR "JBD: bad block at offset %u\n",
- offset);
- return err;
- }
-
- bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh)
- return -ENOMEM;
-
- if (!buffer_uptodate(bh)) {
- /* If this is a brand new buffer, start readahead.
- Otherwise, we assume we are already reading it. */
- if (!buffer_req(bh))
- do_readahead(journal, offset);
- wait_on_buffer(bh);
- }
-
- if (!buffer_uptodate(bh)) {
- printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
- offset);
- brelse(bh);
- return -EIO;
- }
-
- *bhp = bh;
- return 0;
-}
-
-
-/*
- * Count the number of in-use tags in a journal descriptor block.
- */
-
-static int count_tags(struct buffer_head *bh, int size)
-{
- char * tagp;
- journal_block_tag_t * tag;
- int nr = 0;
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
-
- while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
- tag = (journal_block_tag_t *) tagp;
-
- nr++;
- tagp += sizeof(journal_block_tag_t);
- if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
- tagp += 16;
-
- if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
- break;
- }
-
- return nr;
-}
-
-
-/* Make sure we wrap around the log correctly! */
-#define wrap(journal, var) \
-do { \
- if (var >= (journal)->j_last) \
- var -= ((journal)->j_last - (journal)->j_first); \
-} while (0)
-
-/**
- * journal_recover - recovers a on-disk journal
- * @journal: the journal to recover
- *
- * The primary function for recovering the log contents when mounting a
- * journaled device.
- *
- * Recovery is done in three passes. In the first pass, we look for the
- * end of the log. In the second, we assemble the list of revoke
- * blocks. In the third and final pass, we replay any un-revoked blocks
- * in the log.
- */
-int journal_recover(journal_t *journal)
-{
- int err, err2;
- journal_superblock_t * sb;
-
- struct recovery_info info;
-
- memset(&info, 0, sizeof(info));
- sb = journal->j_superblock;
-
- /*
- * The journal superblock's s_start field (the current log head)
- * is always zero if, and only if, the journal was cleanly
- * unmounted.
- */
-
- if (!sb->s_start) {
- jbd_debug(1, "No recovery required, last transaction %d\n",
- be32_to_cpu(sb->s_sequence));
- journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
- return 0;
- }
-
- err = do_one_pass(journal, &info, PASS_SCAN);
- if (!err)
- err = do_one_pass(journal, &info, PASS_REVOKE);
- if (!err)
- err = do_one_pass(journal, &info, PASS_REPLAY);
-
- jbd_debug(1, "JBD: recovery, exit status %d, "
- "recovered transactions %u to %u\n",
- err, info.start_transaction, info.end_transaction);
- jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
- info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
-
- /* Restart the log at the next transaction ID, thus invalidating
- * any existing commit records in the log. */
- journal->j_transaction_sequence = ++info.end_transaction;
-
- journal_clear_revoke(journal);
- err2 = sync_blockdev(journal->j_fs_dev);
- if (!err)
- err = err2;
- /* Flush disk caches to get replayed data on the permanent storage */
- if (journal->j_flags & JFS_BARRIER) {
- err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
- if (!err)
- err = err2;
- }
-
- return err;
-}
-
-/**
- * journal_skip_recovery - Start journal and wipe exiting records
- * @journal: journal to startup
- *
- * Locate any valid recovery information from the journal and set up the
- * journal structures in memory to ignore it (presumably because the
- * caller has evidence that it is out of date).
- * This function does'nt appear to be exorted..
- *
- * We perform one pass over the journal to allow us to tell the user how
- * much recovery information is being erased, and to let us initialise
- * the journal transaction sequence numbers to the next unused ID.
- */
-int journal_skip_recovery(journal_t *journal)
-{
- int err;
- struct recovery_info info;
-
- memset (&info, 0, sizeof(info));
-
- err = do_one_pass(journal, &info, PASS_SCAN);
-
- if (err) {
- printk(KERN_ERR "JBD: error %d scanning journal\n", err);
- ++journal->j_transaction_sequence;
- } else {
-#ifdef CONFIG_JBD_DEBUG
- int dropped = info.end_transaction -
- be32_to_cpu(journal->j_superblock->s_sequence);
- jbd_debug(1,
- "JBD: ignoring %d transaction%s from the journal.\n",
- dropped, (dropped == 1) ? "" : "s");
-#endif
- journal->j_transaction_sequence = ++info.end_transaction;
- }
-
- journal->j_tail = 0;
- return err;
-}
-
-static int do_one_pass(journal_t *journal,
- struct recovery_info *info, enum passtype pass)
-{
- unsigned int first_commit_ID, next_commit_ID;
- unsigned int next_log_block;
- int err, success = 0;
- journal_superblock_t * sb;
- journal_header_t * tmp;
- struct buffer_head * bh;
- unsigned int sequence;
- int blocktype;
-
- /*
- * First thing is to establish what we expect to find in the log
- * (in terms of transaction IDs), and where (in terms of log
- * block offsets): query the superblock.
- */
-
- sb = journal->j_superblock;
- next_commit_ID = be32_to_cpu(sb->s_sequence);
- next_log_block = be32_to_cpu(sb->s_start);
-
- first_commit_ID = next_commit_ID;
- if (pass == PASS_SCAN)
- info->start_transaction = first_commit_ID;
-
- jbd_debug(1, "Starting recovery pass %d\n", pass);
-
- /*
- * Now we walk through the log, transaction by transaction,
- * making sure that each transaction has a commit block in the
- * expected place. Each complete transaction gets replayed back
- * into the main filesystem.
- */
-
- while (1) {
- int flags;
- char * tagp;
- journal_block_tag_t * tag;
- struct buffer_head * obh;
- struct buffer_head * nbh;
-
- cond_resched();
-
- /* If we already know where to stop the log traversal,
- * check right now that we haven't gone past the end of
- * the log. */
-
- if (pass != PASS_SCAN)
- if (tid_geq(next_commit_ID, info->end_transaction))
- break;
-
- jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
- next_commit_ID, next_log_block, journal->j_last);
-
- /* Skip over each chunk of the transaction looking
- * either the next descriptor block or the final commit
- * record. */
-
- jbd_debug(3, "JBD: checking block %u\n", next_log_block);
- err = jread(&bh, journal, next_log_block);
- if (err)
- goto failed;
-
- next_log_block++;
- wrap(journal, next_log_block);
-
- /* What kind of buffer is it?
- *
- * If it is a descriptor block, check that it has the
- * expected sequence number. Otherwise, we're all done
- * here. */
-
- tmp = (journal_header_t *)bh->b_data;
-
- if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
- brelse(bh);
- break;
- }
-
- blocktype = be32_to_cpu(tmp->h_blocktype);
- sequence = be32_to_cpu(tmp->h_sequence);
- jbd_debug(3, "Found magic %d, sequence %d\n",
- blocktype, sequence);
-
- if (sequence != next_commit_ID) {
- brelse(bh);
- break;
- }
-
- /* OK, we have a valid descriptor block which matches
- * all of the sequence number checks. What are we going
- * to do with it? That depends on the pass... */
-
- switch(blocktype) {
- case JFS_DESCRIPTOR_BLOCK:
- /* If it is a valid descriptor block, replay it
- * in pass REPLAY; otherwise, just skip over the
- * blocks it describes. */
- if (pass != PASS_REPLAY) {
- next_log_block +=
- count_tags(bh, journal->j_blocksize);
- wrap(journal, next_log_block);
- brelse(bh);
- continue;
- }
-
- /* A descriptor block: we can now write all of
- * the data blocks. Yay, useful work is finally
- * getting done here! */
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
- <= journal->j_blocksize) {
- unsigned int io_block;
-
- tag = (journal_block_tag_t *) tagp;
- flags = be32_to_cpu(tag->t_flags);
-
- io_block = next_log_block++;
- wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
- if (err) {
- /* Recover what we can, but
- * report failure at the end. */
- success = err;
- printk (KERN_ERR
- "JBD: IO error %d recovering "
- "block %u in log\n",
- err, io_block);
- } else {
- unsigned int blocknr;
-
- J_ASSERT(obh != NULL);
- blocknr = be32_to_cpu(tag->t_blocknr);
-
- /* If the block has been
- * revoked, then we're all done
- * here. */
- if (journal_test_revoke
- (journal, blocknr,
- next_commit_ID)) {
- brelse(obh);
- ++info->nr_revoke_hits;
- goto skip_write;
- }
-
- /* Find a buffer for the new
- * data being restored */
- nbh = __getblk(journal->j_fs_dev,
- blocknr,
- journal->j_blocksize);
- if (nbh == NULL) {
- printk(KERN_ERR
- "JBD: Out of memory "
- "during recovery.\n");
- err = -ENOMEM;
- brelse(bh);
- brelse(obh);
- goto failed;
- }
-
- lock_buffer(nbh);
- memcpy(nbh->b_data, obh->b_data,
- journal->j_blocksize);
- if (flags & JFS_FLAG_ESCAPE) {
- *((__be32 *)nbh->b_data) =
- cpu_to_be32(JFS_MAGIC_NUMBER);
- }
-
- BUFFER_TRACE(nbh, "marking dirty");
- set_buffer_uptodate(nbh);
- mark_buffer_dirty(nbh);
- BUFFER_TRACE(nbh, "marking uptodate");
- ++info->nr_replays;
- /* ll_rw_block(WRITE, 1, &nbh); */
- unlock_buffer(nbh);
- brelse(obh);
- brelse(nbh);
- }
-
- skip_write:
- tagp += sizeof(journal_block_tag_t);
- if (!(flags & JFS_FLAG_SAME_UUID))
- tagp += 16;
-
- if (flags & JFS_FLAG_LAST_TAG)
- break;
- }
-
- brelse(bh);
- continue;
-
- case JFS_COMMIT_BLOCK:
- /* Found an expected commit block: not much to
- * do other than move on to the next sequence
- * number. */
- brelse(bh);
- next_commit_ID++;
- continue;
-
- case JFS_REVOKE_BLOCK:
- /* If we aren't in the REVOKE pass, then we can
- * just skip over this block. */
- if (pass != PASS_REVOKE) {
- brelse(bh);
- continue;
- }
-
- err = scan_revoke_records(journal, bh,
- next_commit_ID, info);
- brelse(bh);
- if (err)
- goto failed;
- continue;
-
- default:
- jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
- blocktype);
- brelse(bh);
- goto done;
- }
- }
-
- done:
- /*
- * We broke out of the log scan loop: either we came to the
- * known end of the log or we found an unexpected block in the
- * log. If the latter happened, then we know that the "current"
- * transaction marks the end of the valid log.
- */
-
- if (pass == PASS_SCAN)
- info->end_transaction = next_commit_ID;
- else {
- /* It's really bad news if different passes end up at
- * different places (but possible due to IO errors). */
- if (info->end_transaction != next_commit_ID) {
- printk (KERN_ERR "JBD: recovery pass %d ended at "
- "transaction %u, expected %u\n",
- pass, next_commit_ID, info->end_transaction);
- if (!success)
- success = -EIO;
- }
- }
-
- return success;
-
- failed:
- return err;
-}
-
-
-/* Scan a revoke record, marking all blocks mentioned as revoked. */
-
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
- tid_t sequence, struct recovery_info *info)
-{
- journal_revoke_header_t *header;
- int offset, max;
-
- header = (journal_revoke_header_t *) bh->b_data;
- offset = sizeof(journal_revoke_header_t);
- max = be32_to_cpu(header->r_count);
-
- while (offset < max) {
- unsigned int blocknr;
- int err;
-
- blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
- offset += 4;
- err = journal_set_revoke(journal, blocknr, sequence);
- if (err)
- return err;
- ++info->nr_revokes;
- }
- return 0;
-}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
deleted file mode 100644
index dcead63..0000000
--- a/fs/jbd/revoke.c
+++ /dev/null
@@ -1,733 +0,0 @@
-/*
- * linux/fs/jbd/revoke.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
- *
- * Copyright 2000 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Journal revoke routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.
- *
- * Revoke is the mechanism used to prevent old log records for deleted
- * metadata from being replayed on top of newer data using the same
- * blocks. The revoke mechanism is used in two separate places:
- *
- * + Commit: during commit we write the entire list of the current
- * transaction's revoked blocks to the journal
- *
- * + Recovery: during recovery we record the transaction ID of all
- * revoked blocks. If there are multiple revoke records in the log
- * for a single block, only the last one counts, and if there is a log
- * entry for a block beyond the last revoke, then that log entry still
- * gets replayed.
- *
- * We can get interactions between revokes and new log data within a
- * single transaction:
- *
- * Block is revoked and then journaled:
- * The desired end result is the journaling of the new block, so we
- * cancel the revoke before the transaction commits.
- *
- * Block is journaled and then revoked:
- * The revoke must take precedence over the write of the block, so we
- * need either to cancel the journal entry or to write the revoke
- * later in the log than the log block. In this case, we choose the
- * latter: journaling a block cancels any revoke record for that block
- * in the current transaction, so any revoke for that block in the
- * transaction must have happened after the block was journaled and so
- * the revoke must take precedence.
- *
- * Block is revoked and then written as data:
- * The data write is allowed to succeed, but the revoke is _not_
- * cancelled. We still need to prevent old log records from
- * overwriting the new data. We don't even need to clear the revoke
- * bit here.
- *
- * We cache revoke status of a buffer in the current transaction in b_states
- * bits. As the name says, revokevalid flag indicates that the cached revoke
- * status of a buffer is valid and we can rely on the cached status.
- *
- * Revoke information on buffers is a tri-state value:
- *
- * RevokeValid clear: no cached revoke status, need to look it up
- * RevokeValid set, Revoked clear:
- * buffer has not been revoked, and cancel_revoke
- * need do nothing.
- * RevokeValid set, Revoked set:
- * buffer has been revoked.
- *
- * Locking rules:
- * We keep two hash tables of revoke records. One hashtable belongs to the
- * running transaction (is pointed to by journal->j_revoke), the other one
- * belongs to the committing transaction. Accesses to the second hash table
- * happen only from the kjournald and no other thread touches this table. Also
- * journal_switch_revoke_table() which switches which hashtable belongs to the
- * running and which to the committing transaction is called only from
- * kjournald. Therefore we need no locks when accessing the hashtable belonging
- * to the committing transaction.
- *
- * All users operating on the hash table belonging to the running transaction
- * have a handle to the transaction. Therefore they are safe from kjournald
- * switching hash tables under them. For operations on the lists of entries in
- * the hash table j_revoke_lock is used.
- *
- * Finally, also replay code uses the hash tables but at this moment no one else
- * can touch them (filesystem isn't mounted yet) and hence no locking is
- * needed.
- */
-
-#ifndef __KERNEL__
-#include "jfs_user.h"
-#else
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#endif
-#include <linux/log2.h>
-#include <linux/hash.h>
-
-static struct kmem_cache *revoke_record_cache;
-static struct kmem_cache *revoke_table_cache;
-
-/* Each revoke record represents one single revoked block. During
- journal replay, this involves recording the transaction ID of the
- last transaction to revoke this block. */
-
-struct jbd_revoke_record_s
-{
- struct list_head hash;
- tid_t sequence; /* Used for recovery only */
- unsigned int blocknr;
-};
-
-
-/* The revoke table is just a simple hash table of revoke records. */
-struct jbd_revoke_table_s
-{
- /* It is conceivable that we might want a larger hash table
- * for recovery. Must be a power of two. */
- int hash_size;
- int hash_shift;
- struct list_head *hash_table;
-};
-
-
-#ifdef __KERNEL__
-static void write_one_revoke_record(journal_t *, transaction_t *,
- struct journal_head **, int *,
- struct jbd_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
-#endif
-
-/* Utility functions to maintain the revoke table */
-
-static inline int hash(journal_t *journal, unsigned int block)
-{
- struct jbd_revoke_table_s *table = journal->j_revoke;
-
- return hash_32(block, table->hash_shift);
-}
-
-static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
- tid_t seq)
-{
- struct list_head *hash_list;
- struct jbd_revoke_record_s *record;
-
-repeat:
- record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
- if (!record)
- goto oom;
-
- record->sequence = seq;
- record->blocknr = blocknr;
- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
- spin_lock(&journal->j_revoke_lock);
- list_add(&record->hash, hash_list);
- spin_unlock(&journal->j_revoke_lock);
- return 0;
-
-oom:
- if (!journal_oom_retry)
- return -ENOMEM;
- jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
- yield();
- goto repeat;
-}
-
-/* Find a revoke record in the journal's hash table. */
-
-static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
- unsigned int blocknr)
-{
- struct list_head *hash_list;
- struct jbd_revoke_record_s *record;
-
- hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
-
- spin_lock(&journal->j_revoke_lock);
- record = (struct jbd_revoke_record_s *) hash_list->next;
- while (&(record->hash) != hash_list) {
- if (record->blocknr == blocknr) {
- spin_unlock(&journal->j_revoke_lock);
- return record;
- }
- record = (struct jbd_revoke_record_s *) record->hash.next;
- }
- spin_unlock(&journal->j_revoke_lock);
- return NULL;
-}
-
-void journal_destroy_revoke_caches(void)
-{
- if (revoke_record_cache) {
- kmem_cache_destroy(revoke_record_cache);
- revoke_record_cache = NULL;
- }
- if (revoke_table_cache) {
- kmem_cache_destroy(revoke_table_cache);
- revoke_table_cache = NULL;
- }
-}
-
-int __init journal_init_revoke_caches(void)
-{
- J_ASSERT(!revoke_record_cache);
- J_ASSERT(!revoke_table_cache);
-
- revoke_record_cache = kmem_cache_create("revoke_record",
- sizeof(struct jbd_revoke_record_s),
- 0,
- SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
- NULL);
- if (!revoke_record_cache)
- goto record_cache_failure;
-
- revoke_table_cache = kmem_cache_create("revoke_table",
- sizeof(struct jbd_revoke_table_s),
- 0, SLAB_TEMPORARY, NULL);
- if (!revoke_table_cache)
- goto table_cache_failure;
-
- return 0;
-
-table_cache_failure:
- journal_destroy_revoke_caches();
-record_cache_failure:
- return -ENOMEM;
-}
-
-static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
-{
- int i;
- struct jbd_revoke_table_s *table;
-
- table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
- if (!table)
- goto out;
-
- table->hash_size = hash_size;
- table->hash_shift = ilog2(hash_size);
- table->hash_table =
- kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
- if (!table->hash_table) {
- kmem_cache_free(revoke_table_cache, table);
- table = NULL;
- goto out;
- }
-
- for (i = 0; i < hash_size; i++)
- INIT_LIST_HEAD(&table->hash_table[i]);
-
-out:
- return table;
-}
-
-static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
-{
- int i;
- struct list_head *hash_list;
-
- for (i = 0; i < table->hash_size; i++) {
- hash_list = &table->hash_table[i];
- J_ASSERT(list_empty(hash_list));
- }
-
- kfree(table->hash_table);
- kmem_cache_free(revoke_table_cache, table);
-}
-
-/* Initialise the revoke table for a given journal to a given size. */
-int journal_init_revoke(journal_t *journal, int hash_size)
-{
- J_ASSERT(journal->j_revoke_table[0] == NULL);
- J_ASSERT(is_power_of_2(hash_size));
-
- journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
- if (!journal->j_revoke_table[0])
- goto fail0;
-
- journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
- if (!journal->j_revoke_table[1])
- goto fail1;
-
- journal->j_revoke = journal->j_revoke_table[1];
-
- spin_lock_init(&journal->j_revoke_lock);
-
- return 0;
-
-fail1:
- journal_destroy_revoke_table(journal->j_revoke_table[0]);
-fail0:
- return -ENOMEM;
-}
-
-/* Destroy a journal's revoke table. The table must already be empty! */
-void journal_destroy_revoke(journal_t *journal)
-{
- journal->j_revoke = NULL;
- if (journal->j_revoke_table[0])
- journal_destroy_revoke_table(journal->j_revoke_table[0]);
- if (journal->j_revoke_table[1])
- journal_destroy_revoke_table(journal->j_revoke_table[1]);
-}
-
-
-#ifdef __KERNEL__
-
-/*
- * journal_revoke: revoke a given buffer_head from the journal. This
- * prevents the block from being replayed during recovery if we take a
- * crash after this current transaction commits. Any subsequent
- * metadata writes of the buffer in this transaction cancel the
- * revoke.
- *
- * Note that this call may block --- it is up to the caller to make
- * sure that there are no further calls to journal_write_metadata
- * before the revoke is complete. In ext3, this implies calling the
- * revoke before clearing the block bitmap when we are deleting
- * metadata.
- *
- * Revoke performs a journal_forget on any buffer_head passed in as a
- * parameter, but does _not_ forget the buffer_head if the bh was only
- * found implicitly.
- *
- * bh_in may not be a journalled buffer - it may have come off
- * the hash tables without an attached journal_head.
- *
- * If bh_in is non-zero, journal_revoke() will decrement its b_count
- * by one.
- */
-
-int journal_revoke(handle_t *handle, unsigned int blocknr,
- struct buffer_head *bh_in)
-{
- struct buffer_head *bh = NULL;
- journal_t *journal;
- struct block_device *bdev;
- int err;
-
- might_sleep();
- if (bh_in)
- BUFFER_TRACE(bh_in, "enter");
-
- journal = handle->h_transaction->t_journal;
- if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
- J_ASSERT (!"Cannot set revoke feature!");
- return -EINVAL;
- }
-
- bdev = journal->j_fs_dev;
- bh = bh_in;
-
- if (!bh) {
- bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
- if (bh)
- BUFFER_TRACE(bh, "found on hash");
- }
-#ifdef JBD_EXPENSIVE_CHECKING
- else {
- struct buffer_head *bh2;
-
- /* If there is a different buffer_head lying around in
- * memory anywhere... */
- bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
- if (bh2) {
- /* ... and it has RevokeValid status... */
- if (bh2 != bh && buffer_revokevalid(bh2))
- /* ...then it better be revoked too,
- * since it's illegal to create a revoke
- * record against a buffer_head which is
- * not marked revoked --- that would
- * risk missing a subsequent revoke
- * cancel. */
- J_ASSERT_BH(bh2, buffer_revoked(bh2));
- put_bh(bh2);
- }
- }
-#endif
-
- /* We really ought not ever to revoke twice in a row without
- first having the revoke cancelled: it's illegal to free a
- block twice without allocating it in between! */
- if (bh) {
- if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
- "inconsistent data on disk")) {
- if (!bh_in)
- brelse(bh);
- return -EIO;
- }
- set_buffer_revoked(bh);
- set_buffer_revokevalid(bh);
- if (bh_in) {
- BUFFER_TRACE(bh_in, "call journal_forget");
- journal_forget(handle, bh_in);
- } else {
- BUFFER_TRACE(bh, "call brelse");
- __brelse(bh);
- }
- }
-
- jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
- err = insert_revoke_hash(journal, blocknr,
- handle->h_transaction->t_tid);
- BUFFER_TRACE(bh_in, "exit");
- return err;
-}
-
-/*
- * Cancel an outstanding revoke. For use only internally by the
- * journaling code (called from journal_get_write_access).
- *
- * We trust buffer_revoked() on the buffer if the buffer is already
- * being journaled: if there is no revoke pending on the buffer, then we
- * don't do anything here.
- *
- * This would break if it were possible for a buffer to be revoked and
- * discarded, and then reallocated within the same transaction. In such
- * a case we would have lost the revoked bit, but when we arrived here
- * the second time we would still have a pending revoke to cancel. So,
- * do not trust the Revoked bit on buffers unless RevokeValid is also
- * set.
- */
-int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
-{
- struct jbd_revoke_record_s *record;
- journal_t *journal = handle->h_transaction->t_journal;
- int need_cancel;
- int did_revoke = 0; /* akpm: debug */
- struct buffer_head *bh = jh2bh(jh);
-
- jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
-
- /* Is the existing Revoke bit valid? If so, we trust it, and
- * only perform the full cancel if the revoke bit is set. If
- * not, we can't trust the revoke bit, and we need to do the
- * full search for a revoke record. */
- if (test_set_buffer_revokevalid(bh)) {
- need_cancel = test_clear_buffer_revoked(bh);
- } else {
- need_cancel = 1;
- clear_buffer_revoked(bh);
- }
-
- if (need_cancel) {
- record = find_revoke_record(journal, bh->b_blocknr);
- if (record) {
- jbd_debug(4, "cancelled existing revoke on "
- "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
- spin_lock(&journal->j_revoke_lock);
- list_del(&record->hash);
- spin_unlock(&journal->j_revoke_lock);
- kmem_cache_free(revoke_record_cache, record);
- did_revoke = 1;
- }
- }
-
-#ifdef JBD_EXPENSIVE_CHECKING
- /* There better not be one left behind by now! */
- record = find_revoke_record(journal, bh->b_blocknr);
- J_ASSERT_JH(jh, record == NULL);
-#endif
-
- /* Finally, have we just cleared revoke on an unhashed
- * buffer_head? If so, we'd better make sure we clear the
- * revoked status on any hashed alias too, otherwise the revoke
- * state machine will get very upset later on. */
- if (need_cancel) {
- struct buffer_head *bh2;
- bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
- if (bh2) {
- if (bh2 != bh)
- clear_buffer_revoked(bh2);
- __brelse(bh2);
- }
- }
- return did_revoke;
-}
-
-/*
- * journal_clear_revoked_flags clears revoked flag of buffers in
- * revoke table to reflect there is no revoked buffer in the next
- * transaction which is going to be started.
- */
-void journal_clear_buffer_revoked_flags(journal_t *journal)
-{
- struct jbd_revoke_table_s *revoke = journal->j_revoke;
- int i = 0;
-
- for (i = 0; i < revoke->hash_size; i++) {
- struct list_head *hash_list;
- struct list_head *list_entry;
- hash_list = &revoke->hash_table[i];
-
- list_for_each(list_entry, hash_list) {
- struct jbd_revoke_record_s *record;
- struct buffer_head *bh;
- record = (struct jbd_revoke_record_s *)list_entry;
- bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
- if (bh) {
- clear_buffer_revoked(bh);
- __brelse(bh);
- }
- }
- }
-}
-
-/* journal_switch_revoke table select j_revoke for next transaction
- * we do not want to suspend any processing until all revokes are
- * written -bzzz
- */
-void journal_switch_revoke_table(journal_t *journal)
-{
- int i;
-
- if (journal->j_revoke == journal->j_revoke_table[0])
- journal->j_revoke = journal->j_revoke_table[1];
- else
- journal->j_revoke = journal->j_revoke_table[0];
-
- for (i = 0; i < journal->j_revoke->hash_size; i++)
- INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
-}
-
-/*
- * Write revoke records to the journal for all entries in the current
- * revoke hash, deleting the entries as we go.
- */
-void journal_write_revoke_records(journal_t *journal,
- transaction_t *transaction, int write_op)
-{
- struct journal_head *descriptor;
- struct jbd_revoke_record_s *record;
- struct jbd_revoke_table_s *revoke;
- struct list_head *hash_list;
- int i, offset, count;
-
- descriptor = NULL;
- offset = 0;
- count = 0;
-
- /* select revoke table for committing transaction */
- revoke = journal->j_revoke == journal->j_revoke_table[0] ?
- journal->j_revoke_table[1] : journal->j_revoke_table[0];
-
- for (i = 0; i < revoke->hash_size; i++) {
- hash_list = &revoke->hash_table[i];
-
- while (!list_empty(hash_list)) {
- record = (struct jbd_revoke_record_s *)
- hash_list->next;
- write_one_revoke_record(journal, transaction,
- &descriptor, &offset,
- record, write_op);
- count++;
- list_del(&record->hash);
- kmem_cache_free(revoke_record_cache, record);
- }
- }
- if (descriptor)
- flush_descriptor(journal, descriptor, offset, write_op);
- jbd_debug(1, "Wrote %d revoke records\n", count);
-}
-
-/*
- * Write out one revoke record. We need to create a new descriptor
- * block if the old one is full or if we have not already created one.
- */
-
-static void write_one_revoke_record(journal_t *journal,
- transaction_t *transaction,
- struct journal_head **descriptorp,
- int *offsetp,
- struct jbd_revoke_record_s *record,
- int write_op)
-{
- struct journal_head *descriptor;
- int offset;
- journal_header_t *header;
-
- /* If we are already aborting, this all becomes a noop. We
- still need to go round the loop in
- journal_write_revoke_records in order to free all of the
- revoke records: only the IO to the journal is omitted. */
- if (is_journal_aborted(journal))
- return;
-
- descriptor = *descriptorp;
- offset = *offsetp;
-
- /* Make sure we have a descriptor with space left for the record */
- if (descriptor) {
- if (offset == journal->j_blocksize) {
- flush_descriptor(journal, descriptor, offset, write_op);
- descriptor = NULL;
- }
- }
-
- if (!descriptor) {
- descriptor = journal_get_descriptor_buffer(journal);
- if (!descriptor)
- return;
- header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
- header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JFS_REVOKE_BLOCK);
- header->h_sequence = cpu_to_be32(transaction->t_tid);
-
- /* Record it so that we can wait for IO completion later */
- JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
- journal_file_buffer(descriptor, transaction, BJ_LogCtl);
-
- offset = sizeof(journal_revoke_header_t);
- *descriptorp = descriptor;
- }
-
- * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
- cpu_to_be32(record->blocknr);
- offset += 4;
- *offsetp = offset;
-}
-
-/*
- * Flush a revoke descriptor out to the journal. If we are aborting,
- * this is a noop; otherwise we are generating a buffer which needs to
- * be waited for during commit, so it has to go onto the appropriate
- * journal buffer list.
- */
-
-static void flush_descriptor(journal_t *journal,
- struct journal_head *descriptor,
- int offset, int write_op)
-{
- journal_revoke_header_t *header;
- struct buffer_head *bh = jh2bh(descriptor);
-
- if (is_journal_aborted(journal)) {
- put_bh(bh);
- return;
- }
-
- header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
- header->r_count = cpu_to_be32(offset);
- set_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "write");
- set_buffer_dirty(bh);
- write_dirty_buffer(bh, write_op);
-}
-#endif
-
-/*
- * Revoke support for recovery.
- *
- * Recovery needs to be able to:
- *
- * record all revoke records, including the tid of the latest instance
- * of each revoke in the journal
- *
- * check whether a given block in a given transaction should be replayed
- * (ie. has not been revoked by a revoke record in that or a subsequent
- * transaction)
- *
- * empty the revoke table after recovery.
- */
-
-/*
- * First, setting revoke records. We create a new revoke record for
- * every block ever revoked in the log as we scan it for recovery, and
- * we update the existing records if we find multiple revokes for a
- * single block.
- */
-
-int journal_set_revoke(journal_t *journal,
- unsigned int blocknr,
- tid_t sequence)
-{
- struct jbd_revoke_record_s *record;
-
- record = find_revoke_record(journal, blocknr);
- if (record) {
- /* If we have multiple occurrences, only record the
- * latest sequence number in the hashed record */
- if (tid_gt(sequence, record->sequence))
- record->sequence = sequence;
- return 0;
- }
- return insert_revoke_hash(journal, blocknr, sequence);
-}
-
-/*
- * Test revoke records. For a given block referenced in the log, has
- * that block been revoked? A revoke record with a given transaction
- * sequence number revokes all blocks in that transaction and earlier
- * ones, but later transactions still need replayed.
- */
-
-int journal_test_revoke(journal_t *journal,
- unsigned int blocknr,
- tid_t sequence)
-{
- struct jbd_revoke_record_s *record;
-
- record = find_revoke_record(journal, blocknr);
- if (!record)
- return 0;
- if (tid_gt(sequence, record->sequence))
- return 0;
- return 1;
-}
-
-/*
- * Finally, once recovery is over, we need to clear the revoke table so
- * that it can be reused by the running filesystem.
- */
-
-void journal_clear_revoke(journal_t *journal)
-{
- int i;
- struct list_head *hash_list;
- struct jbd_revoke_record_s *record;
- struct jbd_revoke_table_s *revoke;
-
- revoke = journal->j_revoke;
-
- for (i = 0; i < revoke->hash_size; i++) {
- hash_list = &revoke->hash_table[i];
- while (!list_empty(hash_list)) {
- record = (struct jbd_revoke_record_s*) hash_list->next;
- list_del(&record->hash);
- kmem_cache_free(revoke_record_cache, record);
- }
- }
-}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
deleted file mode 100644
index 1695ba8..0000000
--- a/fs/jbd/transaction.c
+++ /dev/null
@@ -1,2237 +0,0 @@
-/*
- * linux/fs/jbd/transaction.c
- *
- * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
- *
- * Copyright 1998 Red Hat corp --- All Rights Reserved
- *
- * This file is part of the Linux kernel and is made available under
- * the terms of the GNU General Public License, version 2, or at your
- * option, any later version, incorporated herein by reference.
- *
- * Generic filesystem transaction handling code; part of the ext2fs
- * journaling system.
- *
- * This file manages transactions (compound commits managed by the
- * journaling code) and handles (individual atomic operations by the
- * filesystem).
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/jbd.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/hrtimer.h>
-
-static void __journal_temp_unlink_buffer(struct journal_head *jh);
-
-/*
- * get_transaction: obtain a new transaction_t object.
- *
- * Simply allocate and initialise a new transaction. Create it in
- * RUNNING state and add it to the current journal (which should not
- * have an existing running transaction: we only make a new transaction
- * once we have started to commit the old one).
- *
- * Preconditions:
- * The journal MUST be locked. We don't perform atomic mallocs on the
- * new transaction and we can't block without protecting against other
- * processes trying to touch the journal while it is in transition.
- *
- * Called under j_state_lock
- */
-
-static transaction_t *
-get_transaction(journal_t *journal, transaction_t *transaction)
-{
- transaction->t_journal = journal;
- transaction->t_state = T_RUNNING;
- transaction->t_start_time = ktime_get();
- transaction->t_tid = journal->j_transaction_sequence++;
- transaction->t_expires = jiffies + journal->j_commit_interval;
- spin_lock_init(&transaction->t_handle_lock);
-
- /* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires =
- round_jiffies_up(transaction->t_expires);
- add_timer(&journal->j_commit_timer);
-
- J_ASSERT(journal->j_running_transaction == NULL);
- journal->j_running_transaction = transaction;
-
- return transaction;
-}
-
-/*
- * Handle management.
- *
- * A handle_t is an object which represents a single atomic update to a
- * filesystem, and which tracks all of the modifications which form part
- * of that one update.
- */
-
-/*
- * start_this_handle: Given a handle, deal with any locking or stalling
- * needed to make sure that there is enough journal space for the handle
- * to begin. Attach the handle to a transaction and set up the
- * transaction's buffer credits.
- */
-
-static int start_this_handle(journal_t *journal, handle_t *handle)
-{
- transaction_t *transaction;
- int needed;
- int nblocks = handle->h_buffer_credits;
- transaction_t *new_transaction = NULL;
- int ret = 0;
-
- if (nblocks > journal->j_max_transaction_buffers) {
- printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
- current->comm, nblocks,
- journal->j_max_transaction_buffers);
- ret = -ENOSPC;
- goto out;
- }
-
-alloc_transaction:
- if (!journal->j_running_transaction) {
- new_transaction = kzalloc(sizeof(*new_transaction),
- GFP_NOFS|__GFP_NOFAIL);
- if (!new_transaction) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- jbd_debug(3, "New handle %p going live.\n", handle);
-
-repeat:
-
- /*
- * We need to hold j_state_lock until t_updates has been incremented,
- * for proper journal barrier handling
- */
- spin_lock(&journal->j_state_lock);
-repeat_locked:
- if (is_journal_aborted(journal) ||
- (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
- spin_unlock(&journal->j_state_lock);
- ret = -EROFS;
- goto out;
- }
-
- /* Wait on the journal's transaction barrier if necessary */
- if (journal->j_barrier_count) {
- spin_unlock(&journal->j_state_lock);
- wait_event(journal->j_wait_transaction_locked,
- journal->j_barrier_count == 0);
- goto repeat;
- }
-
- if (!journal->j_running_transaction) {
- if (!new_transaction) {
- spin_unlock(&journal->j_state_lock);
- goto alloc_transaction;
- }
- get_transaction(journal, new_transaction);
- new_transaction = NULL;
- }
-
- transaction = journal->j_running_transaction;
-
- /*
- * If the current transaction is locked down for commit, wait for the
- * lock to be released.
- */
- if (transaction->t_state == T_LOCKED) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_transaction_locked,
- &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * If there is not enough space left in the log to write all potential
- * buffers requested by this operation, we need to stall pending a log
- * checkpoint to free some more log space.
- */
- spin_lock(&transaction->t_handle_lock);
- needed = transaction->t_outstanding_credits + nblocks;
-
- if (needed > journal->j_max_transaction_buffers) {
- /*
- * If the current transaction is already too large, then start
- * to commit it: we can then go back and attach this handle to
- * a new transaction.
- */
- DEFINE_WAIT(wait);
-
- jbd_debug(2, "Handle %p starting new commit...\n", handle);
- spin_unlock(&transaction->t_handle_lock);
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
- TASK_UNINTERRUPTIBLE);
- __log_start_commit(journal, transaction->t_tid);
- spin_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * The commit code assumes that it can get enough log space
- * without forcing a checkpoint. This is *critical* for
- * correctness: a checkpoint of a buffer which is also
- * associated with a committing transaction creates a deadlock,
- * so commit simply cannot force through checkpoints.
- *
- * We must therefore ensure the necessary space in the journal
- * *before* starting to dirty potentially checkpointed buffers
- * in the new transaction.
- *
- * The worst part is, any transaction currently committing can
- * reduce the free space arbitrarily. Be careful to account for
- * those buffers when checkpointing.
- */
-
- /*
- * @@@ AKPM: This seems rather over-defensive. We're giving commit
- * a _lot_ of headroom: 1/4 of the journal plus the size of
- * the committing transaction. Really, we only need to give it
- * committing_transaction->t_outstanding_credits plus "enough" for
- * the log control blocks.
- * Also, this test is inconsistent with the matching one in
- * journal_extend().
- */
- if (__log_space_left(journal) < jbd_space_needed(journal)) {
- jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
- spin_unlock(&transaction->t_handle_lock);
- __log_wait_for_space(journal);
- goto repeat_locked;
- }
-
- /* OK, account for the buffers that this operation expects to
- * use and add the handle to the running transaction. */
-
- handle->h_transaction = transaction;
- transaction->t_outstanding_credits += nblocks;
- transaction->t_updates++;
- transaction->t_handle_count++;
- jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
- handle, nblocks, transaction->t_outstanding_credits,
- __log_space_left(journal));
- spin_unlock(&transaction->t_handle_lock);
- spin_unlock(&journal->j_state_lock);
-
- lock_map_acquire(&handle->h_lockdep_map);
-out:
- if (unlikely(new_transaction)) /* It's usually NULL */
- kfree(new_transaction);
- return ret;
-}
-
-static struct lock_class_key jbd_handle_key;
-
-/* Allocate a new handle. This should probably be in a slab... */
-static handle_t *new_handle(int nblocks)
-{
- handle_t *handle = jbd_alloc_handle(GFP_NOFS);
- if (!handle)
- return NULL;
- handle->h_buffer_credits = nblocks;
- handle->h_ref = 1;
-
- lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
-
- return handle;
-}
-
-/**
- * handle_t *journal_start() - Obtain a new handle.
- * @journal: Journal to start transaction on.
- * @nblocks: number of block buffer we might modify
- *
- * We make sure that the transaction can guarantee at least nblocks of
- * modified buffers in the log. We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
- *
- * Return a pointer to a newly allocated handle, or an ERR_PTR() value
- * on failure.
- */
-handle_t *journal_start(journal_t *journal, int nblocks)
-{
- handle_t *handle = journal_current_handle();
- int err;
-
- if (!journal)
- return ERR_PTR(-EROFS);
-
- if (handle) {
- J_ASSERT(handle->h_transaction->t_journal == journal);
- handle->h_ref++;
- return handle;
- }
-
- handle = new_handle(nblocks);
- if (!handle)
- return ERR_PTR(-ENOMEM);
-
- current->journal_info = handle;
-
- err = start_this_handle(journal, handle);
- if (err < 0) {
- jbd_free_handle(handle);
- current->journal_info = NULL;
- handle = ERR_PTR(err);
- }
- return handle;
-}
-
-/**
- * int journal_extend() - extend buffer credits.
- * @handle: handle to 'extend'
- * @nblocks: nr blocks to try to extend by.
- *
- * Some transactions, such as large extends and truncates, can be done
- * atomically all at once or in several stages. The operation requests
- * a credit for a number of buffer modications in advance, but can
- * extend its credit if it needs more.
- *
- * journal_extend tries to give the running handle more buffer credits.
- * It does not guarantee that allocation - this is a best-effort only.
- * The calling process MUST be able to deal cleanly with a failure to
- * extend here.
- *
- * Return 0 on success, non-zero on failure.
- *
- * return code < 0 implies an error
- * return code > 0 implies normal transaction-full status.
- */
-int journal_extend(handle_t *handle, int nblocks)
-{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int result;
- int wanted;
-
- result = -EIO;
- if (is_handle_aborted(handle))
- goto out;
-
- result = 1;
-
- spin_lock(&journal->j_state_lock);
-
- /* Don't extend a locked-down transaction! */
- if (handle->h_transaction->t_state != T_RUNNING) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "transaction not running\n", handle, nblocks);
- goto error_out;
- }
-
- spin_lock(&transaction->t_handle_lock);
- wanted = transaction->t_outstanding_credits + nblocks;
-
- if (wanted > journal->j_max_transaction_buffers) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "transaction too large\n", handle, nblocks);
- goto unlock;
- }
-
- if (wanted > __log_space_left(journal)) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "insufficient log space\n", handle, nblocks);
- goto unlock;
- }
-
- handle->h_buffer_credits += nblocks;
- transaction->t_outstanding_credits += nblocks;
- result = 0;
-
- jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
- spin_unlock(&transaction->t_handle_lock);
-error_out:
- spin_unlock(&journal->j_state_lock);
-out:
- return result;
-}
-
-
-/**
- * int journal_restart() - restart a handle.
- * @handle: handle to restart
- * @nblocks: nr credits requested
- *
- * Restart a handle for a multi-transaction filesystem
- * operation.
- *
- * If the journal_extend() call above fails to grant new buffer credits
- * to a running handle, a call to journal_restart will commit the
- * handle's transaction so far and reattach the handle to a new
- * transaction capabable of guaranteeing the requested number of
- * credits.
- */
-
-int journal_restart(handle_t *handle, int nblocks)
-{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int ret;
-
- /* If we've had an abort of any type, don't even think about
- * actually doing the restart! */
- if (is_handle_aborted(handle))
- return 0;
-
- /*
- * First unlink the handle from its current transaction, and start the
- * commit on that.
- */
- J_ASSERT(transaction->t_updates > 0);
- J_ASSERT(journal_current_handle() == handle);
-
- spin_lock(&journal->j_state_lock);
- spin_lock(&transaction->t_handle_lock);
- transaction->t_outstanding_credits -= handle->h_buffer_credits;
- transaction->t_updates--;
-
- if (!transaction->t_updates)
- wake_up(&journal->j_wait_updates);
- spin_unlock(&transaction->t_handle_lock);
-
- jbd_debug(2, "restarting handle %p\n", handle);
- __log_start_commit(journal, transaction->t_tid);
- spin_unlock(&journal->j_state_lock);
-
- lock_map_release(&handle->h_lockdep_map);
- handle->h_buffer_credits = nblocks;
- ret = start_this_handle(journal, handle);
- return ret;
-}
-
-
-/**
- * void journal_lock_updates () - establish a transaction barrier.
- * @journal: Journal to establish a barrier on.
- *
- * This locks out any further updates from being started, and blocks until all
- * existing updates have completed, returning only once the journal is in a
- * quiescent state with no updates running.
- *
- * We do not use simple mutex for synchronization as there are syscalls which
- * want to return with filesystem locked and that trips up lockdep. Also
- * hibernate needs to lock filesystem but locked mutex then blocks hibernation.
- * Since locking filesystem is rare operation, we use simple counter and
- * waitqueue for locking.
- */
-void journal_lock_updates(journal_t *journal)
-{
- DEFINE_WAIT(wait);
-
-wait:
- /* Wait for previous locked operation to finish */
- wait_event(journal->j_wait_transaction_locked,
- journal->j_barrier_count == 0);
-
- spin_lock(&journal->j_state_lock);
- /*
- * Check reliably under the lock whether we are the ones winning the race
- * and locking the journal
- */
- if (journal->j_barrier_count > 0) {
- spin_unlock(&journal->j_state_lock);
- goto wait;
- }
- ++journal->j_barrier_count;
-
- /* Wait until there are no running updates */
- while (1) {
- transaction_t *transaction = journal->j_running_transaction;
-
- if (!transaction)
- break;
-
- spin_lock(&transaction->t_handle_lock);
- if (!transaction->t_updates) {
- spin_unlock(&transaction->t_handle_lock);
- break;
- }
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock(&transaction->t_handle_lock);
- spin_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_updates, &wait);
- spin_lock(&journal->j_state_lock);
- }
- spin_unlock(&journal->j_state_lock);
-}
-
-/**
- * void journal_unlock_updates (journal_t* journal) - release barrier
- * @journal: Journal to release the barrier on.
- *
- * Release a transaction barrier obtained with journal_lock_updates().
- */
-void journal_unlock_updates (journal_t *journal)
-{
- J_ASSERT(journal->j_barrier_count != 0);
-
- spin_lock(&journal->j_state_lock);
- --journal->j_barrier_count;
- spin_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
-}
-
-static void warn_dirty_buffer(struct buffer_head *bh)
-{
- char b[BDEVNAME_SIZE];
-
- printk(KERN_WARNING
- "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
- "There's a risk of filesystem corruption in case of system "
- "crash.\n",
- bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
-}
-
-/*
- * If the buffer is already part of the current transaction, then there
- * is nothing we need to do. If it is already part of a prior
- * transaction which we are still committing to disk, then we need to
- * make sure that we do not overwrite the old copy: we do copy-out to
- * preserve the copy going to disk. We also account the buffer against
- * the handle's metadata buffer credits (unless the buffer is already
- * part of the transaction, that is).
- *
- */
-static int
-do_get_write_access(handle_t *handle, struct journal_head *jh,
- int force_copy)
-{
- struct buffer_head *bh;
- transaction_t *transaction;
- journal_t *journal;
- int error;
- char *frozen_buffer = NULL;
- int need_copy = 0;
-
- if (is_handle_aborted(handle))
- return -EROFS;
-
- transaction = handle->h_transaction;
- journal = transaction->t_journal;
-
- jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
-
- JBUFFER_TRACE(jh, "entry");
-repeat:
- bh = jh2bh(jh);
-
- /* @@@ Need to check for errors here at some point. */
-
- lock_buffer(bh);
- jbd_lock_bh_state(bh);
-
- /* We now hold the buffer lock so it is safe to query the buffer
- * state. Is the buffer dirty?
- *
- * If so, there are two possibilities. The buffer may be
- * non-journaled, and undergoing a quite legitimate writeback.
- * Otherwise, it is journaled, and we don't expect dirty buffers
- * in that state (the buffers should be marked JBD_Dirty
- * instead.) So either the IO is being done under our own
- * control and this is a bug, or it's a third party IO such as
- * dump(8) (which may leave the buffer scheduled for read ---
- * ie. locked but not dirty) or tune2fs (which may actually have
- * the buffer dirtied, ugh.) */
-
- if (buffer_dirty(bh)) {
- /*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
- */
- JBUFFER_TRACE(jh, "Journalling dirty buffer");
- clear_buffer_dirty(bh);
- set_buffer_jbddirty(bh);
- }
-
- unlock_buffer(bh);
-
- error = -EROFS;
- if (is_handle_aborted(handle)) {
- jbd_unlock_bh_state(bh);
- goto out;
- }
- error = 0;
-
- /*
- * The buffer is already part of this transaction if b_transaction or
- * b_next_transaction points to it
- */
- if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
- goto done;
-
- /*
- * this is the first time this transaction is touching this buffer,
- * reset the modified flag
- */
- jh->b_modified = 0;
-
- /*
- * If there is already a copy-out version of this buffer, then we don't
- * need to make another one
- */
- if (jh->b_frozen_data) {
- JBUFFER_TRACE(jh, "has frozen data");
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- jh->b_next_transaction = transaction;
- goto done;
- }
-
- /* Is there data here we need to preserve? */
-
- if (jh->b_transaction && jh->b_transaction != transaction) {
- JBUFFER_TRACE(jh, "owned by older transaction");
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* There is one case we have to be very careful about.
- * If the committing transaction is currently writing
- * this buffer out to disk and has NOT made a copy-out,
- * then we cannot modify the buffer contents at all
- * right now. The essence of copy-out is that it is the
- * extra copy, not the primary copy, which gets
- * journaled. If the primary copy is already going to
- * disk then we cannot do copy-out here. */
-
- if (jh->b_jlist == BJ_Shadow) {
- DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
- wait_queue_head_t *wqh;
-
- wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
- JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
- /* commit wakes up all shadow buffers after IO */
- for ( ; ; ) {
- prepare_to_wait(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (jh->b_jlist != BJ_Shadow)
- break;
- schedule();
- }
- finish_wait(wqh, &wait.wait);
- goto repeat;
- }
-
- /* Only do the copy if the currently-owning transaction
- * still needs it. If it is on the Forget list, the
- * committing transaction is past that stage. The
- * buffer had better remain locked during the kmalloc,
- * but that should be true --- we hold the journal lock
- * still and the buffer is already on the BUF_JOURNAL
- * list so won't be flushed.
- *
- * Subtle point, though: if this is a get_undo_access,
- * then we will be relying on the frozen_data to contain
- * the new value of the committed_data record after the
- * transaction, so we HAVE to force the frozen_data copy
- * in that case. */
-
- if (jh->b_jlist != BJ_Forget || force_copy) {
- JBUFFER_TRACE(jh, "generate frozen data");
- if (!frozen_buffer) {
- JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
- frozen_buffer =
- jbd_alloc(jh2bh(jh)->b_size,
- GFP_NOFS);
- if (!frozen_buffer) {
- printk(KERN_ERR
- "%s: OOM for frozen_buffer\n",
- __func__);
- JBUFFER_TRACE(jh, "oom!");
- error = -ENOMEM;
- jbd_lock_bh_state(bh);
- goto done;
- }
- goto repeat;
- }
- jh->b_frozen_data = frozen_buffer;
- frozen_buffer = NULL;
- need_copy = 1;
- }
- jh->b_next_transaction = transaction;
- }
-
-
- /*
- * Finally, if the buffer is not journaled right now, we need to make
- * sure it doesn't get written to disk before the caller actually
- * commits the new data
- */
- if (!jh->b_transaction) {
- JBUFFER_TRACE(jh, "no transaction");
- J_ASSERT_JH(jh, !jh->b_next_transaction);
- JBUFFER_TRACE(jh, "file as BJ_Reserved");
- spin_lock(&journal->j_list_lock);
- __journal_file_buffer(jh, transaction, BJ_Reserved);
- spin_unlock(&journal->j_list_lock);
- }
-
-done:
- if (need_copy) {
- struct page *page;
- int offset;
- char *source;
-
- J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
- "Possible IO failure.\n");
- page = jh2bh(jh)->b_page;
- offset = offset_in_page(jh2bh(jh)->b_data);
- source = kmap_atomic(page);
- memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source);
- }
- jbd_unlock_bh_state(bh);
-
- /*
- * If we are about to journal a buffer, then any revoke pending on it is
- * no longer valid
- */
- journal_cancel_revoke(handle, jh);
-
-out:
- if (unlikely(frozen_buffer)) /* It's usually NULL */
- jbd_free(frozen_buffer, bh->b_size);
-
- JBUFFER_TRACE(jh, "exit");
- return error;
-}
-
-/**
- * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
- * @handle: transaction to add buffer modifications to
- * @bh: bh to be used for metadata writes
- *
- * Returns an error code or 0 on success.
- *
- * In full data journalling mode the buffer may be of type BJ_AsyncData,
- * because we're write()ing a buffer which is also part of a shared mapping.
- */
-
-int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
-{
- struct journal_head *jh = journal_add_journal_head(bh);
- int rc;
-
- /* We do not want to get caught playing with fields which the
- * log thread also manipulates. Make sure that the buffer
- * completes any outstanding IO before proceeding. */
- rc = do_get_write_access(handle, jh, 0);
- journal_put_journal_head(jh);
- return rc;
-}
-
-
-/*
- * When the user wants to journal a newly created buffer_head
- * (ie. getblk() returned a new buffer and we are going to populate it
- * manually rather than reading off disk), then we need to keep the
- * buffer_head locked until it has been completely filled with new
- * data. In this case, we should be able to make the assertion that
- * the bh is not already part of an existing transaction.
- *
- * The buffer should already be locked by the caller by this point.
- * There is no lock ranking violation: it was a newly created,
- * unlocked buffer beforehand. */
-
-/**
- * int journal_get_create_access () - notify intent to use newly created bh
- * @handle: transaction to new buffer to
- * @bh: new buffer.
- *
- * Call this if you create a new bh.
- */
-int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
-{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- struct journal_head *jh = journal_add_journal_head(bh);
- int err;
-
- jbd_debug(5, "journal_head %p\n", jh);
- err = -EROFS;
- if (is_handle_aborted(handle))
- goto out;
- err = 0;
-
- JBUFFER_TRACE(jh, "entry");
- /*
- * The buffer may already belong to this transaction due to pre-zeroing
- * in the filesystem's new_block code. It may also be on the previous,
- * committing transaction's lists, but it HAS to be in Forget state in
- * that case: the transaction must have deleted the buffer for it to be
- * reused here.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
- jh->b_transaction == NULL ||
- (jh->b_transaction == journal->j_committing_transaction &&
- jh->b_jlist == BJ_Forget)));
-
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
-
- if (jh->b_transaction == NULL) {
- /*
- * Previous journal_forget() could have left the buffer
- * with jbddirty bit set because it was being committed. When
- * the commit finished, we've filed the buffer for
- * checkpointing and marked it dirty. Now we are reallocating
- * the buffer so the transaction freeing it must have
- * committed and so it's safe to clear the dirty bit.
- */
- clear_buffer_dirty(jh2bh(jh));
-
- /* first access by this transaction */
- jh->b_modified = 0;
-
- JBUFFER_TRACE(jh, "file as BJ_Reserved");
- __journal_file_buffer(jh, transaction, BJ_Reserved);
- } else if (jh->b_transaction == journal->j_committing_transaction) {
- /* first access by this transaction */
- jh->b_modified = 0;
-
- JBUFFER_TRACE(jh, "set next transaction");
- jh->b_next_transaction = transaction;
- }
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
-
- /*
- * akpm: I added this. ext3_alloc_branch can pick up new indirect
- * blocks which contain freed but then revoked metadata. We need
- * to cancel the revoke in case we end up freeing it yet again
- * and the reallocating as data - this would cause a second revoke,
- * which hits an assertion error.
- */
- JBUFFER_TRACE(jh, "cancelling revoke");
- journal_cancel_revoke(handle, jh);
-out:
- journal_put_journal_head(jh);
- return err;
-}
-
-/**
- * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
- * @handle: transaction
- * @bh: buffer to undo
- *
- * Sometimes there is a need to distinguish between metadata which has
- * been committed to disk and that which has not. The ext3fs code uses
- * this for freeing and allocating space, we have to make sure that we
- * do not reuse freed space until the deallocation has been committed,
- * since if we overwrote that space we would make the delete
- * un-rewindable in case of a crash.
- *
- * To deal with that, journal_get_undo_access requests write access to a
- * buffer for parts of non-rewindable operations such as delete
- * operations on the bitmaps. The journaling code must keep a copy of
- * the buffer's contents prior to the undo_access call until such time
- * as we know that the buffer has definitely been committed to disk.
- *
- * We never need to know which transaction the committed data is part
- * of, buffers touched here are guaranteed to be dirtied later and so
- * will be committed to a new transaction in due course, at which point
- * we can discard the old committed data pointer.
- *
- * Returns error number or 0 on success.
- */
-int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
-{
- int err;
- struct journal_head *jh = journal_add_journal_head(bh);
- char *committed_data = NULL;
-
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * Do this first --- it can drop the journal lock, so we want to
- * make sure that obtaining the committed_data is done
- * atomically wrt. completion of any outstanding commits.
- */
- err = do_get_write_access(handle, jh, 1);
- if (err)
- goto out;
-
-repeat:
- if (!jh->b_committed_data) {
- committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
- if (!committed_data) {
- printk(KERN_ERR "%s: No memory for committed data\n",
- __func__);
- err = -ENOMEM;
- goto out;
- }
- }
-
- jbd_lock_bh_state(bh);
- if (!jh->b_committed_data) {
- /* Copy out the current buffer contents into the
- * preserved, committed copy. */
- JBUFFER_TRACE(jh, "generate b_committed data");
- if (!committed_data) {
- jbd_unlock_bh_state(bh);
- goto repeat;
- }
-
- jh->b_committed_data = committed_data;
- committed_data = NULL;
- memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
- }
- jbd_unlock_bh_state(bh);
-out:
- journal_put_journal_head(jh);
- if (unlikely(committed_data))
- jbd_free(committed_data, bh->b_size);
- return err;
-}
-
-/**
- * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * Description:
- * Mark a buffer as containing dirty data which needs to be flushed before
- * we can commit the current transaction.
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
- int ret = 0;
-
- if (is_handle_aborted(handle))
- return ret;
-
- jh = journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /*
- * We cannot remove the buffer with io error from the
- * committing transaction, because otherwise it would
- * miss the error and the commit would not abort.
- */
- if (unlikely(!buffer_uptodate(bh))) {
- ret = -EIO;
- goto no_journal;
- }
- /* We might have slept so buffer could be refiled now */
- if (jh->b_transaction != NULL &&
- jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- JBUFFER_TRACE(jh, "file as data");
- __journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- journal_put_journal_head(jh);
- return ret;
-}
-
-/**
- * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
- * @handle: transaction to add buffer to.
- * @bh: buffer to mark
- *
- * Mark dirty metadata which needs to be journaled as part of the current
- * transaction.
- *
- * The buffer is placed on the transaction's metadata list and is marked
- * as belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * Special care needs to be taken if the buffer already belongs to the
- * current committing transaction (in which case we should have frozen
- * data present for that commit). In that case, we don't relink the
- * buffer: that only gets done when the old transaction finally
- * completes its commit.
- */
-int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
-{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- struct journal_head *jh = bh2jh(bh);
-
- jbd_debug(5, "journal_head %p\n", jh);
- JBUFFER_TRACE(jh, "entry");
- if (is_handle_aborted(handle))
- goto out;
-
- jbd_lock_bh_state(bh);
-
- if (jh->b_modified == 0) {
- /*
- * This buffer's got modified and becoming part
- * of the transaction. This needs to be done
- * once a transaction -bzzz
- */
- jh->b_modified = 1;
- J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
- handle->h_buffer_credits--;
- }
-
- /*
- * fastpath, to avoid expensive locking. If this buffer is already
- * on the running transaction's metadata list there is nothing to do.
- * Nobody can take it off again because there is a handle open.
- * I _think_ we're OK here with SMP barriers - a mistaken decision will
- * result in this test being false, so we go in and take the locks.
- */
- if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
- JBUFFER_TRACE(jh, "fastpath");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_running_transaction);
- goto out_unlock_bh;
- }
-
- set_buffer_jbddirty(bh);
-
- /*
- * Metadata already on the current transaction list doesn't
- * need to be filed. Metadata on another transaction's list must
- * be committing, and will be refiled once the commit completes:
- * leave it alone for now.
- */
- if (jh->b_transaction != transaction) {
- JBUFFER_TRACE(jh, "already on other transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
- J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
- /* And this case is illegal: we can't reuse another
- * transaction's data buffer, ever. */
- goto out_unlock_bh;
- }
-
- /* That test should have eliminated the following case: */
- J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
-
- JBUFFER_TRACE(jh, "file as BJ_Metadata");
- spin_lock(&journal->j_list_lock);
- __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
- spin_unlock(&journal->j_list_lock);
-out_unlock_bh:
- jbd_unlock_bh_state(bh);
-out:
- JBUFFER_TRACE(jh, "exit");
- return 0;
-}
-
-/*
- * journal_release_buffer: undo a get_write_access without any buffer
- * updates, if the update decided in the end that it didn't need access.
- *
- */
-void
-journal_release_buffer(handle_t *handle, struct buffer_head *bh)
-{
- BUFFER_TRACE(bh, "entry");
-}
-
-/**
- * void journal_forget() - bforget() for potentially-journaled buffers.
- * @handle: transaction handle
- * @bh: bh to 'forget'
- *
- * We can only do the bforget if there are no commits pending against the
- * buffer. If the buffer is dirty in the current running transaction we
- * can safely unlink it.
- *
- * bh may not be a journalled buffer at all - it may be a non-JBD
- * buffer which came off the hashtable. Check for this.
- *
- * Decrements bh->b_count by one.
- *
- * Allow this call even if the handle has aborted --- it may be part of
- * the caller's cleanup after an abort.
- */
-int journal_forget (handle_t *handle, struct buffer_head *bh)
-{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- struct journal_head *jh;
- int drop_reserve = 0;
- int err = 0;
- int was_modified = 0;
-
- BUFFER_TRACE(bh, "entry");
-
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- if (!buffer_jbd(bh))
- goto not_jbd;
- jh = bh2jh(bh);
-
- /* Critical error: attempting to delete a bitmap buffer, maybe?
- * Don't do any jbd operations, and return an error. */
- if (!J_EXPECT_JH(jh, !jh->b_committed_data,
- "inconsistent data on disk")) {
- err = -EIO;
- goto not_jbd;
- }
-
- /* keep track of whether or not this transaction modified us */
- was_modified = jh->b_modified;
-
- /*
- * The buffer's going from the transaction, we must drop
- * all references -bzzz
- */
- jh->b_modified = 0;
-
- if (jh->b_transaction == handle->h_transaction) {
- J_ASSERT_JH(jh, !jh->b_frozen_data);
-
- /* If we are forgetting a buffer which is already part
- * of this transaction, then we can just drop it from
- * the transaction immediately. */
- clear_buffer_dirty(bh);
- clear_buffer_jbddirty(bh);
-
- JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
-
- /*
- * we only want to drop a reference if this transaction
- * modified the buffer
- */
- if (was_modified)
- drop_reserve = 1;
-
- /*
- * We are no longer going to journal this buffer.
- * However, the commit of this transaction is still
- * important to the buffer: the delete that we are now
- * processing might obsolete an old log entry, so by
- * committing, we can satisfy the buffer's checkpoint.
- *
- * So, if we have a checkpoint on the buffer, we should
- * now refile the buffer on our BJ_Forget list so that
- * we know to remove the checkpoint after we commit.
- */
-
- if (jh->b_cp_transaction) {
- __journal_temp_unlink_buffer(jh);
- __journal_file_buffer(jh, transaction, BJ_Forget);
- } else {
- __journal_unfile_buffer(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
- }
- }
- } else if (jh->b_transaction) {
- J_ASSERT_JH(jh, (jh->b_transaction ==
- journal->j_committing_transaction));
- /* However, if the buffer is still owned by a prior
- * (committing) transaction, we can't drop it yet... */
- JBUFFER_TRACE(jh, "belongs to older transaction");
- /* ... but we CAN drop it from the new transaction if we
- * have also modified it since the original commit. */
-
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction == transaction);
- jh->b_next_transaction = NULL;
-
- /*
- * only drop a reference if this transaction modified
- * the buffer
- */
- if (was_modified)
- drop_reserve = 1;
- }
- }
-
-not_jbd:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __brelse(bh);
-drop:
- if (drop_reserve) {
- /* no need to reserve log space for this block -bzzz */
- handle->h_buffer_credits++;
- }
- return err;
-}
-
-/**
- * int journal_stop() - complete a transaction
- * @handle: tranaction to complete.
- *
- * All done for a particular handle.
- *
- * There is not much action needed here. We just return any remaining
- * buffer credits to the transaction and remove the handle. The only
- * complication is that we need to start a commit operation if the
- * filesystem is marked for synchronous update.
- *
- * journal_stop itself will not usually return an error, but it may
- * do so in unusual circumstances. In particular, expect it to
- * return -EIO if a journal_abort has been executed since the
- * transaction began.
- */
-int journal_stop(handle_t *handle)
-{
- transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int err;
- pid_t pid;
-
- J_ASSERT(journal_current_handle() == handle);
-
- if (is_handle_aborted(handle))
- err = -EIO;
- else {
- J_ASSERT(transaction->t_updates > 0);
- err = 0;
- }
-
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- }
-
- jbd_debug(4, "Handle %p going down\n", handle);
-
- /*
- * Implement synchronous transaction batching. If the handle
- * was synchronous, don't force a commit immediately. Let's
- * yield and let another thread piggyback onto this transaction.
- * Keep doing that while new threads continue to arrive.
- * It doesn't cost much - we're about to run a commit and sleep
- * on IO anyway. Speeds up many-threaded, many-dir operations
- * by 30x or more...
- *
- * We try and optimize the sleep time against what the underlying disk
- * can do, instead of having a static sleep time. This is useful for
- * the case where our storage is so fast that it is more optimal to go
- * ahead and force a flush and wait for the transaction to be committed
- * than it is to wait for an arbitrary amount of time for new writers to
- * join the transaction. We achieve this by measuring how long it takes
- * to commit a transaction, and compare it with how long this
- * transaction has been running, and if run time < commit time then we
- * sleep for the delta and commit. This greatly helps super fast disks
- * that would see slowdowns as more threads started doing fsyncs.
- *
- * But don't do this if this process was the most recent one to
- * perform a synchronous write. We do this to detect the case where a
- * single process is doing a stream of sync writes. No point in waiting
- * for joiners in that case.
- */
- pid = current->pid;
- if (handle->h_sync && journal->j_last_sync_writer != pid) {
- u64 commit_time, trans_time;
-
- journal->j_last_sync_writer = pid;
-
- spin_lock(&journal->j_state_lock);
- commit_time = journal->j_average_commit_time;
- spin_unlock(&journal->j_state_lock);
-
- trans_time = ktime_to_ns(ktime_sub(ktime_get(),
- transaction->t_start_time));
-
- commit_time = min_t(u64, commit_time,
- 1000*jiffies_to_usecs(1));
-
- if (trans_time < commit_time) {
- ktime_t expires = ktime_add_ns(ktime_get(),
- commit_time);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
- }
- }
-
- current->journal_info = NULL;
- spin_lock(&journal->j_state_lock);
- spin_lock(&transaction->t_handle_lock);
- transaction->t_outstanding_credits -= handle->h_buffer_credits;
- transaction->t_updates--;
- if (!transaction->t_updates) {
- wake_up(&journal->j_wait_updates);
- if (journal->j_barrier_count)
- wake_up(&journal->j_wait_transaction_locked);
- }
-
- /*
- * If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
- * transaction is too old now.
- */
- if (handle->h_sync ||
- transaction->t_outstanding_credits >
- journal->j_max_transaction_buffers ||
- time_after_eq(jiffies, transaction->t_expires)) {
- /* Do this even for aborted journals: an abort still
- * completes the commit thread, it just doesn't write
- * anything to disk. */
- tid_t tid = transaction->t_tid;
-
- spin_unlock(&transaction->t_handle_lock);
- jbd_debug(2, "transaction too old, requesting commit for "
- "handle %p\n", handle);
- /* This is non-blocking */
- __log_start_commit(journal, transaction->t_tid);
- spin_unlock(&journal->j_state_lock);
-
- /*
- * Special case: JFS_SYNC synchronous updates require us
- * to wait for the commit to complete.
- */
- if (handle->h_sync && !(current->flags & PF_MEMALLOC))
- err = log_wait_commit(journal, tid);
- } else {
- spin_unlock(&transaction->t_handle_lock);
- spin_unlock(&journal->j_state_lock);
- }
-
- lock_map_release(&handle->h_lockdep_map);
-
- jbd_free_handle(handle);
- return err;
-}
-
-/**
- * int journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk. May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int journal_force_commit(journal_t *journal)
-{
- handle_t *handle;
- int ret;
-
- handle = journal_start(journal, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- } else {
- handle->h_sync = 1;
- ret = journal_stop(handle);
- }
- return ret;
-}
-
-/*
- *
- * List management code snippets: various functions for manipulating the
- * transaction buffer lists.
- *
- */
-
-/*
- * Append a buffer to a transaction list, given the transaction's list head
- * pointer.
- *
- * j_list_lock is held.
- *
- * jbd_lock_bh_state(jh2bh(jh)) is held.
- */
-
-static inline void
-__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
-{
- if (!*list) {
- jh->b_tnext = jh->b_tprev = jh;
- *list = jh;
- } else {
- /* Insert at the tail of the list to preserve order */
- struct journal_head *first = *list, *last = first->b_tprev;
- jh->b_tprev = last;
- jh->b_tnext = first;
- last->b_tnext = first->b_tprev = jh;
- }
-}
-
-/*
- * Remove a buffer from a transaction list, given the transaction's list
- * head pointer.
- *
- * Called with j_list_lock held, and the journal may not be locked.
- *
- * jbd_lock_bh_state(jh2bh(jh)) is held.
- */
-
-static inline void
-__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
-{
- if (*list == jh) {
- *list = jh->b_tnext;
- if (*list == jh)
- *list = NULL;
- }
- jh->b_tprev->b_tnext = jh->b_tnext;
- jh->b_tnext->b_tprev = jh->b_tprev;
-}
-
-/*
- * Remove a buffer from the appropriate transaction list.
- *
- * Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
- *
- * Called under j_list_lock. The journal may not be locked.
- */
-static void __journal_temp_unlink_buffer(struct journal_head *jh)
-{
- struct journal_head **list = NULL;
- transaction_t *transaction;
- struct buffer_head *bh = jh2bh(jh);
-
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
- transaction = jh->b_transaction;
- if (transaction)
- assert_spin_locked(&transaction->t_journal->j_list_lock);
-
- J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
- if (jh->b_jlist != BJ_None)
- J_ASSERT_JH(jh, transaction != NULL);
-
- switch (jh->b_jlist) {
- case BJ_None:
- return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
- case BJ_Metadata:
- transaction->t_nr_buffers--;
- J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
- list = &transaction->t_buffers;
- break;
- case BJ_Forget:
- list = &transaction->t_forget;
- break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
- case BJ_Shadow:
- list = &transaction->t_shadow_list;
- break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
- case BJ_Reserved:
- list = &transaction->t_reserved_list;
- break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
- }
-
- __blist_del_buffer(list, jh);
- jh->b_jlist = BJ_None;
- if (test_clear_buffer_jbddirty(bh))
- mark_buffer_dirty(bh); /* Expose it to the VM */
-}
-
-/*
- * Remove buffer from all transactions.
- *
- * Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
- */
-void __journal_unfile_buffer(struct journal_head *jh)
-{
- __journal_temp_unlink_buffer(jh);
- jh->b_transaction = NULL;
- journal_put_journal_head(jh);
-}
-
-void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- __journal_unfile_buffer(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- __brelse(bh);
-}
-
-/*
- * Called from journal_try_to_free_buffers().
- *
- * Called under jbd_lock_bh_state(bh)
- */
-static void
-__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
-{
- struct journal_head *jh;
-
- jh = bh2jh(bh);
-
- if (buffer_locked(bh) || buffer_dirty(bh))
- goto out;
-
- if (jh->b_next_transaction != NULL)
- goto out;
-
- spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __journal_unfile_buffer(jh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
- /* written-back checkpointed metadata buffer */
- if (jh->b_jlist == BJ_None) {
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __journal_remove_checkpoint(jh);
- }
- }
- spin_unlock(&journal->j_list_lock);
-out:
- return;
-}
-
-/**
- * int journal_try_to_free_buffers() - try to free page buffers.
- * @journal: journal for operation
- * @page: to try and free
- * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
- *
- *
- * For all the buffers on this page,
- * if they are fully written out ordered data, move them onto BUF_CLEAN
- * so try_to_free_buffers() can reap them.
- *
- * This function returns non-zero if we wish try_to_free_buffers()
- * to be called. We do this if the page is releasable by try_to_free_buffers().
- * We also do it if the page has locked or dirty buffers and the caller wants
- * us to perform sync or async writeout.
- *
- * This complicates JBD locking somewhat. We aren't protected by the
- * BKL here. We wish to remove the buffer from its committing or
- * running transaction's ->t_datalist via __journal_unfile_buffer.
- *
- * This may *change* the value of transaction_t->t_datalist, so anyone
- * who looks at t_datalist needs to lock against this function.
- *
- * Even worse, someone may be doing a journal_dirty_data on this
- * buffer. So we need to lock against that. journal_dirty_data()
- * will come out of the lock with the buffer dirty, which makes it
- * ineligible for release here.
- *
- * Who else is affected by this? hmm... Really the only contender
- * is do_get_write_access() - it could be looking at the buffer while
- * journal_try_to_free_buffer() is changing its state. But that
- * cannot happen because we never reallocate freed data as metadata
- * while the data is part of a transaction. Yes?
- *
- * Return 0 on failure, 1 on success
- */
-int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t gfp_mask)
-{
- struct buffer_head *head;
- struct buffer_head *bh;
- int ret = 0;
-
- J_ASSERT(PageLocked(page));
-
- head = page_buffers(page);
- bh = head;
- do {
- struct journal_head *jh;
-
- /*
- * We take our own ref against the journal_head here to avoid
- * having to add tons of locking around each instance of
- * journal_put_journal_head().
- */
- jh = journal_grab_journal_head(bh);
- if (!jh)
- continue;
-
- jbd_lock_bh_state(bh);
- __journal_try_to_free_buffer(journal, bh);
- journal_put_journal_head(jh);
- jbd_unlock_bh_state(bh);
- if (buffer_jbd(bh))
- goto busy;
- } while ((bh = bh->b_this_page) != head);
-
- ret = try_to_free_buffers(page);
-
-busy:
- return ret;
-}
-
-/*
- * This buffer is no longer needed. If it is on an older transaction's
- * checkpoint list we need to record it on this transaction's forget list
- * to pin this buffer (and hence its checkpointing transaction) down until
- * this transaction commits. If the buffer isn't on a checkpoint list, we
- * release it.
- * Returns non-zero if JBD no longer has an interest in the buffer.
- *
- * Called under j_list_lock.
- *
- * Called under jbd_lock_bh_state(bh).
- */
-static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
-{
- int may_free = 1;
- struct buffer_head *bh = jh2bh(jh);
-
- if (jh->b_cp_transaction) {
- JBUFFER_TRACE(jh, "on running+cp transaction");
- __journal_temp_unlink_buffer(jh);
- /*
- * We don't want to write the buffer anymore, clear the
- * bit so that we don't confuse checks in
- * __journal_file_buffer
- */
- clear_buffer_dirty(bh);
- __journal_file_buffer(jh, transaction, BJ_Forget);
- may_free = 0;
- } else {
- JBUFFER_TRACE(jh, "on running transaction");
- __journal_unfile_buffer(jh);
- }
- return may_free;
-}
-
-/*
- * journal_invalidatepage
- *
- * This code is tricky. It has a number of cases to deal with.
- *
- * There are two invariants which this code relies on:
- *
- * i_size must be updated on disk before we start calling invalidatepage on the
- * data.
- *
- * This is done in ext3 by defining an ext3_setattr method which
- * updates i_size before truncate gets going. By maintaining this
- * invariant, we can be sure that it is safe to throw away any buffers
- * attached to the current transaction: once the transaction commits,
- * we know that the data will not be needed.
- *
- * Note however that we can *not* throw away data belonging to the
- * previous, committing transaction!
- *
- * Any disk blocks which *are* part of the previous, committing
- * transaction (and which therefore cannot be discarded immediately) are
- * not going to be reused in the new running transaction
- *
- * The bitmap committed_data images guarantee this: any block which is
- * allocated in one transaction and removed in the next will be marked
- * as in-use in the committed_data bitmap, so cannot be reused until
- * the next transaction to delete the block commits. This means that
- * leaving committing buffers dirty is quite safe: the disk blocks
- * cannot be reallocated to a different file and so buffer aliasing is
- * not possible.
- *
- *
- * The above applies mainly to ordered data mode. In writeback mode we
- * don't make guarantees about the order in which data hits disk --- in
- * particular we don't guarantee that new dirty data is flushed before
- * transaction commit --- so it is always safe just to discard data
- * immediately in that mode. --sct
- */
-
-/*
- * The journal_unmap_buffer helper function returns zero if the buffer
- * concerned remains pinned as an anonymous buffer belonging to an older
- * transaction.
- *
- * We're outside-transaction here. Either or both of j_running_transaction
- * and j_committing_transaction may be NULL.
- */
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
- int partial_page)
-{
- transaction_t *transaction;
- struct journal_head *jh;
- int may_free = 1;
-
- BUFFER_TRACE(bh, "entry");
-
-retry:
- /*
- * It is safe to proceed here without the j_list_lock because the
- * buffers cannot be stolen by try_to_free_buffers as long as we are
- * holding the page lock. --sct
- */
-
- if (!buffer_jbd(bh))
- goto zap_buffer_unlocked;
-
- spin_lock(&journal->j_state_lock);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- jh = journal_grab_journal_head(bh);
- if (!jh)
- goto zap_buffer_no_jh;
-
- /*
- * We cannot remove the buffer from checkpoint lists until the
- * transaction adding inode to orphan list (let's call it T)
- * is committed. Otherwise if the transaction changing the
- * buffer would be cleaned from the journal before T is
- * committed, a crash will cause that the correct contents of
- * the buffer will be lost. On the other hand we have to
- * clear the buffer dirty bit at latest at the moment when the
- * transaction marking the buffer as freed in the filesystem
- * structures is committed because from that moment on the
- * block can be reallocated and used by a different page.
- * Since the block hasn't been freed yet but the inode has
- * already been added to orphan list, it is safe for us to add
- * the buffer to BJ_Forget list of the newest transaction.
- *
- * Also we have to clear buffer_mapped flag of a truncated buffer
- * because the buffer_head may be attached to the page straddling
- * i_size (can happen only when blocksize < pagesize) and thus the
- * buffer_head can be reused when the file is extended again. So we end
- * up keeping around invalidated buffers attached to transactions'
- * BJ_Forget list just to stop checkpointing code from cleaning up
- * the transaction this buffer was modified in.
- */
- transaction = jh->b_transaction;
- if (transaction == NULL) {
- /* First case: not on any transaction. If it
- * has no checkpoint link, then we can zap it:
- * it's a writeback-mode buffer so we don't care
- * if it hits disk safely. */
- if (!jh->b_cp_transaction) {
- JBUFFER_TRACE(jh, "not on any transaction: zap");
- goto zap_buffer;
- }
-
- if (!buffer_dirty(bh)) {
- /* bdflush has written it. We can drop it now */
- goto zap_buffer;
- }
-
- /* OK, it must be in the journal but still not
- * written fully to disk: it's metadata or
- * journaled data... */
-
- if (journal->j_running_transaction) {
- /* ... and once the current transaction has
- * committed, the buffer won't be needed any
- * longer. */
- JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
- may_free = __dispose_buffer(jh,
- journal->j_running_transaction);
- goto zap_buffer;
- } else {
- /* There is no currently-running transaction. So the
- * orphan record which we wrote for this file must have
- * passed into commit. We must attach this buffer to
- * the committing transaction, if it exists. */
- if (journal->j_committing_transaction) {
- JBUFFER_TRACE(jh, "give to committing trans");
- may_free = __dispose_buffer(jh,
- journal->j_committing_transaction);
- goto zap_buffer;
- } else {
- /* The orphan record's transaction has
- * committed. We can cleanse this buffer */
- clear_buffer_jbddirty(bh);
- goto zap_buffer;
- }
- }
- } else if (transaction == journal->j_committing_transaction) {
- JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
- /*
- * The buffer is committing, we simply cannot touch
- * it. If the page is straddling i_size we have to wait
- * for commit and try again.
- */
- if (partial_page) {
- tid_t tid = journal->j_committing_transaction->t_tid;
-
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- unlock_buffer(bh);
- log_wait_commit(journal, tid);
- lock_buffer(bh);
- goto retry;
- }
- /*
- * OK, buffer won't be reachable after truncate. We just set
- * j_next_transaction to the running transaction (if there is
- * one) and mark buffer as freed so that commit code knows it
- * should clear dirty bits when it is done with the buffer.
- */
- set_buffer_freed(bh);
- if (journal->j_running_transaction && buffer_jbddirty(bh))
- jh->b_next_transaction = journal->j_running_transaction;
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return 0;
- } else {
- /* Good, the buffer belongs to the running transaction.
- * We are writing our own transaction's data, not any
- * previous one's, so it is safe to throw it away
- * (remember that we expect the filesystem to have set
- * i_size already for this truncate so recovery will not
- * expose the disk blocks we are discarding here.) */
- J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
- JBUFFER_TRACE(jh, "on running transaction");
- may_free = __dispose_buffer(jh, transaction);
- }
-
-zap_buffer:
- /*
- * This is tricky. Although the buffer is truncated, it may be reused
- * if blocksize < pagesize and it is attached to the page straddling
- * EOF. Since the buffer might have been added to BJ_Forget list of the
- * running transaction, journal_get_write_access() won't clear
- * b_modified and credit accounting gets confused. So clear b_modified
- * here. */
- jh->b_modified = 0;
- journal_put_journal_head(jh);
-zap_buffer_no_jh:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
-zap_buffer_unlocked:
- clear_buffer_dirty(bh);
- J_ASSERT_BH(bh, !buffer_jbddirty(bh));
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- bh->b_bdev = NULL;
- return may_free;
-}
-
-/**
- * void journal_invalidatepage() - invalidate a journal page
- * @journal: journal to use for flush
- * @page: page to flush
- * @offset: offset of the range to invalidate
- * @length: length of the range to invalidate
- *
- * Reap page buffers containing data in specified range in page.
- */
-void journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- struct buffer_head *head, *bh, *next;
- unsigned int stop = offset + length;
- unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
- int may_free = 1;
-
- if (!PageLocked(page))
- BUG();
- if (!page_has_buffers(page))
- return;
-
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
-
- /* We will potentially be playing with lists other than just the
- * data lists (especially for journaled data mode), so be
- * cautious in our locking. */
-
- head = bh = page_buffers(page);
- do {
- unsigned int next_off = curr_off + bh->b_size;
- next = bh->b_this_page;
-
- if (next_off > stop)
- return;
-
- if (offset <= curr_off) {
- /* This block is wholly outside the truncation point */
- lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh,
- partial_page);
- unlock_buffer(bh);
- }
- curr_off = next_off;
- bh = next;
-
- } while (bh != head);
-
- if (!partial_page) {
- if (may_free && try_to_free_buffers(page))
- J_ASSERT(!page_has_buffers(page));
- }
-}
-
-/*
- * File a buffer on the given transaction list.
- */
-void __journal_file_buffer(struct journal_head *jh,
- transaction_t *transaction, int jlist)
-{
- struct journal_head **list = NULL;
- int was_dirty = 0;
- struct buffer_head *bh = jh2bh(jh);
-
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
- assert_spin_locked(&transaction->t_journal->j_list_lock);
-
- J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
- J_ASSERT_JH(jh, jh->b_transaction == transaction ||
- jh->b_transaction == NULL);
-
- if (jh->b_transaction && jh->b_jlist == jlist)
- return;
-
- if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
- jlist == BJ_Shadow || jlist == BJ_Forget) {
- /*
- * For metadata buffers, we track dirty bit in buffer_jbddirty
- * instead of buffer_dirty. We should not see a dirty bit set
- * here because we clear it in do_get_write_access but e.g.
- * tune2fs can modify the sb and set the dirty bit at any time
- * so we try to gracefully handle that.
- */
- if (buffer_dirty(bh))
- warn_dirty_buffer(bh);
- if (test_clear_buffer_dirty(bh) ||
- test_clear_buffer_jbddirty(bh))
- was_dirty = 1;
- }
-
- if (jh->b_transaction)
- __journal_temp_unlink_buffer(jh);
- else
- journal_grab_journal_head(bh);
- jh->b_transaction = transaction;
-
- switch (jlist) {
- case BJ_None:
- J_ASSERT_JH(jh, !jh->b_committed_data);
- J_ASSERT_JH(jh, !jh->b_frozen_data);
- return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
- case BJ_Metadata:
- transaction->t_nr_buffers++;
- list = &transaction->t_buffers;
- break;
- case BJ_Forget:
- list = &transaction->t_forget;
- break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
- case BJ_Shadow:
- list = &transaction->t_shadow_list;
- break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
- case BJ_Reserved:
- list = &transaction->t_reserved_list;
- break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
- }
-
- __blist_add_buffer(list, jh);
- jh->b_jlist = jlist;
-
- if (was_dirty)
- set_buffer_jbddirty(bh);
-}
-
-void journal_file_buffer(struct journal_head *jh,
- transaction_t *transaction, int jlist)
-{
- jbd_lock_bh_state(jh2bh(jh));
- spin_lock(&transaction->t_journal->j_list_lock);
- __journal_file_buffer(jh, transaction, jlist);
- spin_unlock(&transaction->t_journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
-}
-
-/*
- * Remove a buffer from its current buffer list in preparation for
- * dropping it from its current transaction entirely. If the buffer has
- * already started to be used by a subsequent transaction, refile the
- * buffer on that transaction's metadata list.
- *
- * Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
- *
- * jh and bh may be already free when this function returns
- */
-void __journal_refile_buffer(struct journal_head *jh)
-{
- int was_dirty, jlist;
- struct buffer_head *bh = jh2bh(jh);
-
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
- if (jh->b_transaction)
- assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
-
- /* If the buffer is now unused, just drop it. */
- if (jh->b_next_transaction == NULL) {
- __journal_unfile_buffer(jh);
- return;
- }
-
- /*
- * It has been modified by a later transaction: add it to the new
- * transaction's metadata list.
- */
-
- was_dirty = test_clear_buffer_jbddirty(bh);
- __journal_temp_unlink_buffer(jh);
- /*
- * We set b_transaction here because b_next_transaction will inherit
- * our jh reference and thus __journal_file_buffer() must not take a
- * new one.
- */
- jh->b_transaction = jh->b_next_transaction;
- jh->b_next_transaction = NULL;
- if (buffer_freed(bh))
- jlist = BJ_Forget;
- else if (jh->b_modified)
- jlist = BJ_Metadata;
- else
- jlist = BJ_Reserved;
- __journal_file_buffer(jh, jh->b_transaction, jlist);
- J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
-
- if (was_dirty)
- set_buffer_jbddirty(bh);
-}
-
-/*
- * __journal_refile_buffer() with necessary locking added. We take our bh
- * reference so that we can safely unlock bh.
- *
- * The jh and bh may be freed by this call.
- */
-void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- __journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_list_lock);
- __brelse(bh);
-}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 4227dc4..684996c 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -417,17 +417,16 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* journal_clean_one_cp_list
*
* Find all the written-back checkpoint buffers in the given list and
- * release them.
+ * release them. If 'destroy' is set, clean all buffers unconditionally.
*
* Called with j_list_lock held.
* Returns 1 if we freed the transaction, 0 otherwise.
*/
-static int journal_clean_one_cp_list(struct journal_head *jh)
+static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret;
- int freed = 0;
if (!jh)
return 0;
@@ -436,12 +435,14 @@ static int journal_clean_one_cp_list(struct journal_head *jh)
do {
jh = next_jh;
next_jh = jh->b_cpnext;
- ret = __try_to_free_cp_buf(jh);
+ if (!destroy)
+ ret = __try_to_free_cp_buf(jh);
+ else
+ ret = __jbd2_journal_remove_checkpoint(jh) + 1;
if (!ret)
- return freed;
+ return 0;
if (ret == 2)
return 1;
- freed = 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
@@ -449,20 +450,21 @@ static int journal_clean_one_cp_list(struct journal_head *jh)
* requested:
*/
if (need_resched())
- return freed;
+ return 0;
} while (jh != last_jh);
- return freed;
+ return 0;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
+ * If 'destroy' is set, release all buffers unconditionally.
*
* Called with j_list_lock held.
*/
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret;
@@ -476,7 +478,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
- ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
+ ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
+ destroy);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
@@ -492,7 +495,7 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
* we can possibly see not yet submitted buffers on io_list
*/
ret = journal_clean_one_cp_list(transaction->
- t_checkpoint_io_list);
+ t_checkpoint_io_list, destroy);
if (need_resched())
return;
/*
@@ -506,6 +509,28 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
}
/*
+ * Remove buffers from all checkpoint lists as journal is aborted and we just
+ * need to free memory
+ */
+void jbd2_journal_destroy_checkpoint(journal_t *journal)
+{
+ /*
+ * We loop because __jbd2_journal_clean_checkpoint_list() may abort
+ * early due to a need of rescheduling.
+ */
+ while (1) {
+ spin_lock(&journal->j_list_lock);
+ if (!journal->j_checkpoint_transactions) {
+ spin_unlock(&journal->j_list_lock);
+ break;
+ }
+ __jbd2_journal_clean_checkpoint_list(journal, true);
+ spin_unlock(&journal->j_list_lock);
+ cond_resched();
+ }
+}
+
+/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b73e021..36345fe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
@@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ !jbd2_has_feature_async_commit(journal))
ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
else
ret = submit_bh(WRITE_SYNC, bh);
@@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(j))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
@@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
else
tag->t_checksum = cpu_to_be16(csum32);
@@ -510,7 +508,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* frees some memory
*/
spin_lock(&journal->j_list_lock);
- __jbd2_journal_clean_checkpoint_list(journal);
+ __jbd2_journal_clean_checkpoint_list(journal, false);
spin_unlock(&journal->j_list_lock);
jbd_debug(3, "JBD2: commit phase 1\n");
@@ -730,8 +728,7 @@ start_journal_io:
/*
* Compute checksum.
*/
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
crc32_sum =
jbd2_checksum_data(crc32_sum, bh);
}
@@ -797,8 +794,7 @@ start_journal_io:
blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
/* Done it all: now write the commit record asynchronously. */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -889,8 +885,7 @@ start_journal_io:
commit_transaction->t_state = T_COMMIT_JFLUSH;
write_unlock(&journal->j_state_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -898,8 +893,7 @@ start_journal_io:
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+ if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 4ff3fad..81e6226 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
- if (!jbd2_journal_has_csum_v2or3(j))
+ if (!jbd2_journal_has_csum_v2or3_feature(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -1456,7 +1456,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
sb->s_errno = cpu_to_be32(journal->j_errno);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_SYNC);
+ jbd2_write_superblock(journal, WRITE_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -1523,16 +1523,16 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
- JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
/* Can't have checksum v2 and v3 at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
"at the same time!\n");
goto out;
}
- if (jbd2_journal_has_csum_v2or3(journal) &&
- JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
"at the same time!\n");
@@ -1545,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3(journal)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1558,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal)
/* Check superblock checksum */
if (!jbd2_superblock_csum_verify(journal, sb)) {
printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
goto out;
}
@@ -1649,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal)
printk(KERN_ERR "JBD2: journal transaction %u on %s "
"is corrupt.\n", journal->j_failed_commit,
journal->j_devname);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* OK, we've finished with the dynamic journal bits:
@@ -1693,8 +1694,17 @@ int jbd2_journal_destroy(journal_t *journal)
while (journal->j_checkpoint_transactions != NULL) {
spin_unlock(&journal->j_list_lock);
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_log_do_checkpoint(journal);
+ err = jbd2_log_do_checkpoint(journal);
mutex_unlock(&journal->j_checkpoint_mutex);
+ /*
+ * If checkpointing failed, just free the buffers to avoid
+ * looping forever
+ */
+ if (err) {
+ jbd2_journal_destroy_checkpoint(journal);
+ spin_lock(&journal->j_list_lock);
+ break;
+ }
spin_lock(&journal->j_list_lock);
}
@@ -2062,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__jbd2_journal_abort_hard(journal);
- if (errno)
+ if (errno) {
jbd2_journal_update_sb_errno(journal);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_REC_ERR;
+ write_unlock(&journal->j_state_lock);
+ }
}
/**
@@ -2188,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal)
{
size_t sz;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(journal))
return sizeof(journal_block_tag3_t);
sz = sizeof(journal_block_tag_t);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_has_feature_csum2(journal))
sz += sizeof(__u16);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
return sz;
else
return sz - sizeof(__u32);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a9079d0..7f277e4 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = jbd2_journal_bmap(journal, offset, &blocknr);
@@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal,
journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
@@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
else
return tag->t_checksum == cpu_to_be16(csum32);
@@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal,
printk(KERN_ERR "JBD2: Invalid checksum "
"recovering block %lu in log\n",
next_log_block);
- err = -EIO;
+ err = -EFSBADCRC;
brelse(bh);
goto failed;
}
@@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal,
* just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM) &&
+ jbd2_has_feature_checksum(journal) &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
@@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal,
journal, tag, obh->b_data,
be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
- success = -EIO;
+ success = -EFSBADCRC;
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"block %llu in log\n",
@@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number. */
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ jbd2_has_feature_checksum(journal)) {
int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
@@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal,
if (chksum_err) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal,
bh->b_data)) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
rcount = be32_to_cpu(header->r_count);
if (!jbd2_revoke_block_csum_verify(journal, header))
- return -EINVAL;
+ return -EFSBADCRC;
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
@@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
return -EINVAL;
max = rcount;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
record_len = 8;
while (offset + record_len <= max) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 0abf2e7f7..705ae57 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -589,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal,
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
sz = 8;
else
sz = 4;
@@ -619,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal,
*descriptorp = descriptor;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
else
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index f3d0617..081dff0 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -204,6 +204,20 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* attach this handle to a new transaction.
*/
atomic_sub(total, &t->t_outstanding_credits);
+
+ /*
+ * Is the number of reserved credits in the current transaction too
+ * big to fit this handle? Wait until reserved credits are freed.
+ */
+ if (atomic_read(&journal->j_reserved_credits) + total >
+ journal->j_max_transaction_buffers) {
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + total <=
+ journal->j_max_transaction_buffers);
+ return 1;
+ }
+
wait_transaction_locked(journal);
return 1;
}
@@ -262,20 +276,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
int rsv_blocks = 0;
unsigned long ts = jiffies;
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
/*
- * 1/2 of transaction can be reserved so we can practically handle
- * only 1/2 of maximum transaction size per operation
+ * Limit the number of reserved credits to 1/2 of maximum transaction
+ * size and limit the number of total credits to not exceed maximum
+ * transaction size per operation.
*/
- if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
- printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, blocks,
- journal->j_max_transaction_buffers / 2);
+ if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
+ (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+ printk(KERN_ERR "JBD2: %s wants too many credits "
+ "credits:%d rsv_credits:%d max:%d\n",
+ current->comm, blocks, rsv_blocks,
+ journal->j_max_transaction_buffers);
+ WARN_ON(1);
return -ENOSPC;
}
- if (handle->h_rsv_handle)
- rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
-
alloc_transaction:
if (!journal->j_running_transaction) {
/*
@@ -746,13 +764,11 @@ void jbd2_journal_unlock_updates (journal_t *journal)
static void warn_dirty_buffer(struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
-
printk(KERN_WARNING
- "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+ "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
"There's a risk of filesystem corruption in case of system "
"crash.\n",
- bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
+ bh->b_bdev, (unsigned long long)bh->b_blocknr);
}
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
@@ -991,7 +1007,8 @@ out:
}
/* Fast check whether buffer is already attached to the required transaction */
-static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
+ bool undo)
{
struct journal_head *jh;
bool ret = false;
@@ -1018,6 +1035,9 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
jh = READ_ONCE(bh->b_private);
if (!jh)
goto out;
+ /* For undo access buffer must have data copied */
+ if (undo && !jh->b_committed_data)
+ goto out;
if (jh->b_transaction != handle->h_transaction &&
jh->b_next_transaction != handle->h_transaction)
goto out;
@@ -1055,7 +1075,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int rc;
- if (jbd2_write_access_granted(handle, bh))
+ if (jbd2_write_access_granted(handle, bh, false))
return 0;
jh = jbd2_journal_add_journal_head(bh);
@@ -1192,7 +1212,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
char *committed_data = NULL;
JBUFFER_TRACE(jh, "entry");
- if (jbd2_write_access_granted(handle, bh))
+ if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
@@ -1280,8 +1300,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
triggers->t_abort(triggers, jh2bh(jh));
}
-
-
/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
@@ -1314,12 +1332,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
if (is_handle_aborted(handle))
return -EROFS;
- journal = transaction->t_journal;
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh) {
+ if (!buffer_jbd(bh)) {
ret = -EUCLEAN;
goto out;
}
+ /*
+ * We don't grab jh reference here since the buffer must be part
+ * of the running transaction.
+ */
+ jh = bh2jh(bh);
+ /*
+ * This and the following assertions are unreliable since we may see jh
+ * in inconsistent state unless we grab bh_state lock. But this is
+ * crucial to catch bugs so let's do a reliable check until the
+ * lockless handling is fully proven.
+ */
+ if (jh->b_transaction != transaction &&
+ jh->b_next_transaction != transaction) {
+ jbd_lock_bh_state(bh);
+ J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+ jh->b_next_transaction == transaction);
+ jbd_unlock_bh_state(bh);
+ }
+ if (jh->b_modified == 1) {
+ /* If it's in our transaction it must be in BJ_Metadata list. */
+ if (jh->b_transaction == transaction &&
+ jh->b_jlist != BJ_Metadata) {
+ jbd_lock_bh_state(bh);
+ J_ASSERT_JH(jh, jh->b_transaction != transaction ||
+ jh->b_jlist == BJ_Metadata);
+ jbd_unlock_bh_state(bh);
+ }
+ goto out;
+ }
+
+ journal = transaction->t_journal;
jbd_debug(5, "journal_head %p\n", jh);
JBUFFER_TRACE(jh, "entry");
@@ -1410,7 +1457,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
- jbd2_journal_put_journal_head(jh);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
@@ -1893,8 +1939,8 @@ out:
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
+ * code to release the buffers.
*
*
* For all the buffers on this page,
@@ -2108,6 +2154,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
if (!buffer_dirty(bh)) {
/* bdflush has written it. We can drop it now */
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
@@ -2137,6 +2184,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
clear_buffer_jbddirty(bh);
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
}
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index bb9cebc..e5c1783 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c)
siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
- allow_signal(SIGCONT);
allow_signal(SIGHUP);
c->gc_task = current;
@@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c)
/* Put_super will send a SIGKILL and then wait on the sem.
*/
while (signal_pending(current) || freezing(current)) {
- siginfo_t info;
unsigned long signr;
if (try_to_freeze())
goto again;
- signr = dequeue_signal_lock(current, &current->blocked, &info);
+ signr = kernel_dequeue_signal(NULL);
switch(signr) {
case SIGSTOP:
jffs2_dbg(1, "%s(): SIGSTOP received\n",
__func__);
- set_current_state(TASK_STOPPED);
- schedule();
+ kernel_signal_stop();
break;
case SIGKILL:
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 8118002..d211b8e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
uint32_t alloclen;
int ret;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
ri = jffs2_alloc_raw_inode();
if (!ri)
return -ENOMEM;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index b8fd651..ce11897 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void)
void jffs2_destroy_slab_caches(void)
{
- if(full_dnode_slab)
- kmem_cache_destroy(full_dnode_slab);
- if(raw_dirent_slab)
- kmem_cache_destroy(raw_dirent_slab);
- if(raw_inode_slab)
- kmem_cache_destroy(raw_inode_slab);
- if(tmp_dnode_info_slab)
- kmem_cache_destroy(tmp_dnode_info_slab);
- if(raw_node_ref_slab)
- kmem_cache_destroy(raw_node_ref_slab);
- if(node_frag_slab)
- kmem_cache_destroy(node_frag_slab);
- if(inode_cache_slab)
- kmem_cache_destroy(inode_cache_slab);
+ kmem_cache_destroy(full_dnode_slab);
+ kmem_cache_destroy(raw_dirent_slab);
+ kmem_cache_destroy(raw_inode_slab);
+ kmem_cache_destroy(tmp_dnode_info_slab);
+ kmem_cache_destroy(raw_node_ref_slab);
+ kmem_cache_destroy(node_frag_slab);
+ kmem_cache_destroy(inode_cache_slab);
#ifdef CONFIG_JFFS2_FS_XATTR
- if (xattr_datum_cache)
- kmem_cache_destroy(xattr_datum_cache);
- if (xattr_ref_cache)
- kmem_cache_destroy(xattr_ref_cache);
+ kmem_cache_destroy(xattr_datum_cache);
+ kmem_cache_destroy(xattr_ref_cache);
#endif
}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 28e0aab..bfebbf1 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
err = jffs2_flash_read(c, (ref_offset(ref)) + read,
rd->nsize - already, &read, &fd->name[already]);
- if (unlikely(read != rd->nsize - already) && likely(!err))
+ if (unlikely(read != rd->nsize - already) && likely(!err)) {
+ jffs2_free_full_dirent(fd);
+ JFFS2_ERROR("short read: wanted %d bytes, got %zd\n",
+ rd->nsize - already, read);
return -EIO;
+ }
if (unlikely(err)) {
JFFS2_ERROR("read remainder of name: error %d\n", err);
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index d4b43fb..7a28fac 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -48,42 +48,24 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
}
/* ---- XATTR Handler for "security.*" ----------------- */
-static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size);
}
-static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
-static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_SECURITY_PREFIX);
- strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = jffs2_security_listxattr,
.set = jffs2_security_setxattr,
.get = jffs2_security_getxattr
};
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e3..bb080c2 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void)
jffs2_inode_cachep = kmem_cache_create("jffs2_i",
sizeof(struct jffs2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 8ce2f24..2cabd64 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -14,7 +14,7 @@
const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 09ed551..5a3da3f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
{
struct delayed_work *dwork;
- dwork = container_of(work, struct delayed_work, work);
+ dwork = to_delayed_work(work);
return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
}
@@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
if ((c->flash_size % c->sector_size) != 0) {
c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
- };
+ }
c->wbuf_ofs = 0xFFFFFFFF;
c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
@@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
if (!c->wbuf_verify) {
- kfree(c->oobbuf);
kfree(c->wbuf);
return -ENOMEM;
}
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index f092fee..da3e185 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct jffs2_xattr_ref *ref, **pref;
struct jffs2_xattr_datum *xd;
const struct xattr_handler *xhandle;
- ssize_t len, rc;
+ const char *prefix;
+ ssize_t prefix_len, len, rc;
int retry = 0;
rc = check_xattr_ref_inode(c, ic);
@@ -998,17 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
}
xhandle = xprefix_to_handler(xd->xprefix);
- if (!xhandle)
+ if (!xhandle || (xhandle->list && !xhandle->list(dentry)))
continue;
+ prefix = xhandle->prefix ?: xhandle->name;
+ prefix_len = strlen(prefix);
+ rc = prefix_len + xd->name_len + 1;
+
if (buffer) {
- rc = xhandle->list(dentry, buffer+len, size-len,
- xd->xname, xd->name_len, xd->flags);
- } else {
- rc = xhandle->list(dentry, NULL, 0, xd->xname,
- xd->name_len, xd->flags);
+ if (rc > size - len) {
+ rc = -ERANGE;
+ goto out;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, xd->xname, xd->name_len);
+ buffer += xd->name_len;
+ *buffer++ = 0;
}
- if (rc < 0)
- goto out;
len += rc;
}
rc = len;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ceaf9c6..b2555ef 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,35 +16,25 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size);
}
-static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
-static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static bool jffs2_trusted_listxattr(struct dentry *dentry)
{
- size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
-
- if (list && retlen<=list_size) {
- strcpy(list, XATTR_TRUSTED_PREFIX);
- strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
- }
-
- return retlen;
+ return capable(CAP_SYS_ADMIN);
}
const struct xattr_handler jffs2_trusted_xattr_handler = {
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index a71391e..539bd63 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,40 +16,24 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size);
}
-static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
-static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_USER_PREFIX);
- strcpy(list + XATTR_USER_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = jffs2_user_listxattr,
.set = jffs2_user_setxattr,
.get = jffs2_user_getxattr
};
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 0c8ca83..4945685 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
@@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
rc = posix_acl_equiv_mode(acl, &inode->i_mode);
if (rc < 0)
@@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
}
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index b9dc23c..0e026a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -107,8 +107,11 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
if (rc)
return rc;
- if (is_quota_modification(inode, iattr))
- dquot_initialize(inode);
+ if (is_quota_modification(inode, iattr)) {
+ rc = dquot_initialize(inode);
+ if (rc)
+ return rc;
+ }
if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
(iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
rc = dquot_transfer(inode, iattr);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 41aa3ca..9d9bae6 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
} else if (S_ISLNK(inode->i_mode)) {
if (inode->i_size >= IDATASIZE) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &jfs_aops;
} else {
inode->i_op = &jfs_fast_symlink_inode_operations;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 6b0f816..cf7936f 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -109,7 +109,9 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
/*
* Allocate inode to quota.
*/
- dquot_initialize(inode);
+ rc = dquot_initialize(inode);
+ if (rc)
+ goto fail_drop;
rc = dquot_alloc_inode(inode);
if (rc)
goto fail_drop;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index bc462dc..a270cb7 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1835,17 +1835,16 @@ static int lbmLogInit(struct jfs_log * log)
for (i = 0; i < LOGPAGES;) {
char *buffer;
uint offset;
- struct page *page;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (buffer == NULL)
+ if (!page)
goto error;
- page = virt_to_page(buffer);
+ buffer = page_address(page);
for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
if (lbuf == NULL) {
if (offset == 0)
- free_page((unsigned long) buffer);
+ __free_page(page);
goto error;
}
if (offset) /* we already have one reference */
@@ -1999,19 +1998,16 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
bio->bi_bdev = log->bdev;
- bio->bi_io_vec[0].bv_page = bp->l_page;
- bio->bi_io_vec[0].bv_len = LOGPSIZE;
- bio->bi_io_vec[0].bv_offset = bp->l_offset;
- bio->bi_vcnt = 1;
- bio->bi_iter.bi_size = LOGPSIZE;
+ bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
+ BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
- lbmIODone(bio, 0);
+ lbmIODone(bio);
} else {
submit_bio(READ_SYNC, bio);
}
@@ -2145,12 +2141,9 @@ static void lbmStartIO(struct lbuf * bp)
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
bio->bi_bdev = log->bdev;
- bio->bi_io_vec[0].bv_page = bp->l_page;
- bio->bi_io_vec[0].bv_len = LOGPSIZE;
- bio->bi_io_vec[0].bv_offset = bp->l_offset;
- bio->bi_vcnt = 1;
- bio->bi_iter.bi_size = LOGPSIZE;
+ bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
+ BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
@@ -2158,7 +2151,7 @@ static void lbmStartIO(struct lbuf * bp)
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
- lbmIODone(bio, 0);
+ lbmIODone(bio);
} else {
submit_bio(WRITE_SYNC, bio);
INCREMENT(lmStat.submitted);
@@ -2196,7 +2189,7 @@ static int lbmIOWait(struct lbuf * bp, int flag)
*
* executed at INTIODONE level
*/
-static void lbmIODone(struct bio *bio, int error)
+static void lbmIODone(struct bio *bio)
{
struct lbuf *bp = bio->bi_private;
struct lbuf *nextbp, *tail;
@@ -2212,7 +2205,7 @@ static void lbmIODone(struct bio *bio, int error)
bp->l_flag |= lbmDONE;
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (bio->bi_error) {
bp->l_flag |= lbmERROR;
jfs_err("lbmIODone: I/O error in JFS log");
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 16a0922..a3eb316 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -276,11 +276,11 @@ static void last_read_complete(struct page *page)
unlock_page(page);
}
-static void metapage_read_end_io(struct bio *bio, int err)
+static void metapage_read_end_io(struct bio *bio)
{
struct page *page = bio->bi_private;
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (bio->bi_error) {
printk(KERN_ERR "metapage_read_end_io: I/O error\n");
SetPageError(page);
}
@@ -331,13 +331,13 @@ static void last_write_complete(struct page *page)
end_page_writeback(page);
}
-static void metapage_write_end_io(struct bio *bio, int err)
+static void metapage_write_end_io(struct bio *bio)
{
struct page *page = bio->bi_private;
BUG_ON(!PagePrivate(page));
- if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (bio->bi_error) {
printk(KERN_ERR "metapage_write_end_io: I/O error\n");
SetPageError(page);
}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a5ac97b..701f893 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -86,7 +86,9 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry);
- dquot_initialize(dip);
+ rc = dquot_initialize(dip);
+ if (rc)
+ goto out1;
/*
* search parent directory for entry/freespace
@@ -218,7 +220,9 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry);
- dquot_initialize(dip);
+ rc = dquot_initialize(dip);
+ if (rc)
+ goto out1;
/*
* search parent directory for entry/freespace
@@ -355,8 +359,12 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry);
/* Init inode for quota operations. */
- dquot_initialize(dip);
- dquot_initialize(ip);
+ rc = dquot_initialize(dip);
+ if (rc)
+ goto out;
+ rc = dquot_initialize(ip);
+ if (rc)
+ goto out;
/* directory must be empty to be removed */
if (!dtEmpty(ip)) {
@@ -483,8 +491,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry);
/* Init inode for quota operations. */
- dquot_initialize(dip);
- dquot_initialize(ip);
+ rc = dquot_initialize(dip);
+ if (rc)
+ goto out;
+ rc = dquot_initialize(ip);
+ if (rc)
+ goto out;
if ((rc = get_UCSname(&dname, dentry)))
goto out;
@@ -799,7 +811,9 @@ static int jfs_link(struct dentry *old_dentry,
jfs_info("jfs_link: %pd %pd", old_dentry, dentry);
- dquot_initialize(dir);
+ rc = dquot_initialize(dir);
+ if (rc)
+ goto out;
tid = txBegin(ip->i_sb, 0);
@@ -810,7 +824,7 @@ static int jfs_link(struct dentry *old_dentry,
* scan parent directory for entry/freespace
*/
if ((rc = get_UCSname(&dname, dentry)))
- goto out;
+ goto out_tx;
if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
goto free_dname;
@@ -842,12 +856,13 @@ static int jfs_link(struct dentry *old_dentry,
free_dname:
free_UCSname(&dname);
- out:
+ out_tx:
txEnd(tid);
mutex_unlock(&JFS_IP(ip)->commit_mutex);
mutex_unlock(&JFS_IP(dir)->commit_mutex);
+ out:
jfs_info("jfs_link: rc:%d", rc);
return rc;
}
@@ -891,7 +906,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
- dquot_initialize(dip);
+ rc = dquot_initialize(dip);
+ if (rc)
+ goto out1;
ssize = strlen(name) + 1;
@@ -966,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
ip->i_op = &jfs_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &jfs_aops;
/*
@@ -1082,8 +1100,12 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry);
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
+ rc = dquot_initialize(old_dir);
+ if (rc)
+ goto out1;
+ rc = dquot_initialize(new_dir);
+ if (rc)
+ goto out1;
old_ip = d_inode(old_dentry);
new_ip = d_inode(new_dentry);
@@ -1130,7 +1152,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
} else if (new_ip) {
IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
/* Init inode for quota operations. */
- dquot_initialize(new_ip);
+ rc = dquot_initialize(new_ip);
+ if (rc)
+ goto out_unlock;
}
/*
@@ -1318,6 +1342,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
clear_cflag(COMMIT_Stale, old_dir);
}
+ out_unlock:
if (new_ip && !S_ISDIR(new_ip->i_mode))
IWRITE_UNLOCK(new_ip);
out3:
@@ -1348,12 +1373,11 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
tid_t tid;
struct tblock *tblk;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
jfs_info("jfs_mknod: %pd", dentry);
- dquot_initialize(dir);
+ rc = dquot_initialize(dir);
+ if (rc)
+ goto out;
if ((rc = get_UCSname(&dname, dentry)))
goto out;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4cd9798f..900925b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -496,9 +496,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
- if (!new_valid_dev(sb->s_bdev->bd_dev))
- return -EOVERFLOW;
-
sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
@@ -901,7 +898,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (jfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 5929e23..f8db4fd 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -23,7 +23,7 @@
const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
@@ -33,8 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = {
const struct inode_operations jfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d48d28..8219738 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+ size_t len = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+ do {
+ len += strlen(kn->name) + 1;
+ kn = kn->parent;
+ } while (kn && kn->parent);
+
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+ return len;
+}
+
+/**
* kernfs_path - build full path of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
@@ -518,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- /*
- * If the ino of the sysfs entry created for a kmem cache gets
- * allocated from an ida layer, which is accounted to the memcg that
- * owns the cache, the memcg will get pinned forever. So do not account
- * ino ida allocations.
- */
- ret = ida_simple_get(&root->ino_ida, 1, 0,
- GFP_KERNEL | __GFP_NOACCOUNT);
+ ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
if (ret < 0)
goto err_out2;
kn->ino = ret;
@@ -671,6 +687,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
return NULL;
}
+static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
+ const unsigned char *path,
+ const void *ns)
+{
+ static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */
+ size_t len = strlcpy(path_buf, path, PATH_MAX);
+ char *p = path_buf;
+ char *name;
+
+ lockdep_assert_held(&kernfs_mutex);
+
+ if (len >= PATH_MAX)
+ return NULL;
+
+ while ((name = strsep(&p, "/")) && parent) {
+ if (*name == '\0')
+ continue;
+ parent = kernfs_find_ns(parent, name, ns);
+ }
+
+ return parent;
+}
+
/**
* kernfs_find_and_get_ns - find and get kernfs_node with the given name
* @parent: kernfs_node to search under
@@ -696,6 +735,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
/**
+ * kernfs_walk_and_get_ns - find and get kernfs_node with the given path
+ * @parent: kernfs_node to search under
+ * @path: path to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with path @path under @parent and get a reference
+ * if found. This function may sleep and returns pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
+ const char *path, const void *ns)
+{
+ struct kernfs_node *kn;
+
+ mutex_lock(&kernfs_mutex);
+ kn = kernfs_walk_ns(parent, path, ns);
+ kernfs_get(kn);
+ mutex_unlock(&kernfs_mutex);
+
+ return kn;
+}
+
+/**
* kernfs_create_root - create a new kernfs hierarchy
* @scops: optional syscall operations for the hierarchy
* @flags: KERNFS_ROOT_* flags
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 756dd56..16405ae 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
if (!attrs)
return -ENOMEM;
- return simple_xattr_remove(&attrs->xattrs, name);
+ return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
}
ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
@@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
if (!attrs)
return -ENOMEM;
- return simple_xattr_list(&attrs->xattrs, buf, size);
+ return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index db27252..117b8b3 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,18 +112,25 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
return error;
}
-static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
+static const char *kernfs_iop_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- int error = -ENOMEM;
- unsigned long page = get_zeroed_page(GFP_KERNEL);
- if (!page)
+ char *body;
+ int error;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = kernfs_getlink(dentry, (char *)page);
+ error = kernfs_getlink(dentry, body);
if (unlikely(error < 0)) {
- free_page((unsigned long)page);
+ kfree(body);
return ERR_PTR(error);
}
- return *cookie = (char *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
const struct inode_operations kernfs_symlink_iops = {
@@ -132,8 +139,7 @@ const struct inode_operations kernfs_symlink_iops = {
.getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
.readlink = generic_readlink,
- .follow_link = kernfs_iop_follow_link,
- .put_link = free_page_put_link,
+ .get_link = kernfs_iop_get_link,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.permission = kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c
index 102edfd..0149129 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1019,17 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void kfree_put_link(struct inode *unused, void *cookie)
+/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
+void kfree_link(void *p)
{
- kfree(cookie);
+ kfree(p);
}
-EXPORT_SYMBOL(kfree_put_link);
-
-void free_page_put_link(struct inode *unused, void *cookie)
-{
- free_page((unsigned long) cookie);
-}
-EXPORT_SYMBOL(free_page_put_link);
+EXPORT_SYMBOL(kfree_link);
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -1092,14 +1087,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
}
EXPORT_SYMBOL(simple_nosetlease);
-const char *simple_follow_link(struct dentry *dentry, void **cookie)
+const char *simple_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
{
- return d_inode(dentry)->i_link;
+ return inode->i_link;
}
-EXPORT_SYMBOL(simple_follow_link);
+EXPORT_SYMBOL(simple_get_link);
const struct inode_operations simple_symlink_inode_operations = {
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.readlink = generic_readlink
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
@@ -1185,7 +1181,7 @@ void make_empty_dir_inode(struct inode *inode)
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_rdev = 0;
- inode->i_size = 2;
+ inode->i_size = 0;
inode->i_blkbits = PAGE_SHIFT;
inode->i_blocks = 0;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index acd3947..1129520 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -474,18 +474,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho
static int do_vfs_lock(struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_file_wait(fl->fl_file, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_file_wait(fl->fl_file, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_file_wait(fl->fl_file, fl);
}
/*
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 969d589..d716c99 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
atomic_inc(&nsm->sm_count);
else {
host = NULL;
- nsm = nsm_get_handle(ni->sap, ni->salen,
+ nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
ni->hostname, ni->hostname_len);
if (unlikely(nsm == NULL)) {
dprintk("lockd: %s failed; no nsm handle\n",
@@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
host->h_nsmhandle = nsm;
host->h_addrbuf = nsm->sm_addrbuf;
host->net = ni->net;
+ strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
out:
return host;
@@ -534,17 +535,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
/**
* nlm_host_rebooted - Release all resources held by rebooted host
+ * @net: network namespace
* @info: pointer to decoded results of NLM_SM_NOTIFY call
*
* We were notified that the specified host has rebooted. Release
* all resources held by that peer.
*/
-void nlm_host_rebooted(const struct nlm_reboot *info)
+void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info)
{
struct nsm_handle *nsm;
struct nlm_host *host;
- nsm = nsm_reboot_lookup(info);
+ nsm = nsm_reboot_lookup(net, info);
if (unlikely(nsm == NULL))
return;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 47a32b6..19166d4 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -42,7 +42,7 @@ struct nsm_args {
u32 proc;
char *mon_name;
- char *nodename;
+ const char *nodename;
};
struct nsm_res {
@@ -51,7 +51,6 @@ struct nsm_res {
};
static const struct rpc_program nsm_program;
-static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
/*
@@ -87,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
return rpc_create(&args);
}
-static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
- struct rpc_clnt *clnt)
-{
- spin_lock(&ln->nsm_clnt_lock);
- if (ln->nsm_users == 0) {
- if (clnt == NULL)
- goto out;
- ln->nsm_clnt = clnt;
- }
- clnt = ln->nsm_clnt;
- ln->nsm_users++;
-out:
- spin_unlock(&ln->nsm_clnt_lock);
- return clnt;
-}
-
-static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
-{
- struct rpc_clnt *clnt, *new;
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- clnt = nsm_client_set(ln, NULL);
- if (clnt != NULL)
- goto out;
-
- clnt = new = nsm_create(net, nodename);
- if (IS_ERR(clnt))
- goto out;
-
- clnt = nsm_client_set(ln, new);
- if (clnt != new)
- rpc_shutdown_client(new);
-out:
- return clnt;
-}
-
-static void nsm_client_put(struct net *net)
-{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
- struct rpc_clnt *clnt = NULL;
-
- spin_lock(&ln->nsm_clnt_lock);
- ln->nsm_users--;
- if (ln->nsm_users == 0) {
- clnt = ln->nsm_clnt;
- ln->nsm_clnt = NULL;
- }
- spin_unlock(&ln->nsm_clnt_lock);
- if (clnt != NULL)
- rpc_shutdown_client(clnt);
-}
-
static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
- struct rpc_clnt *clnt)
+ const struct nlm_host *host)
{
int status;
+ struct rpc_clnt *clnt;
struct nsm_args args = {
.priv = &nsm->sm_priv,
.prog = NLM_PROGRAM,
.vers = 3,
.proc = NLMPROC_NSM_NOTIFY,
.mon_name = nsm->sm_mon_name,
- .nodename = clnt->cl_nodename,
+ .nodename = host->nodename,
};
struct rpc_message msg = {
.rpc_argp = &args,
@@ -158,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
memset(res, 0, sizeof(*res));
+ clnt = nsm_create(host->net, host->nodename);
+ if (IS_ERR(clnt)) {
+ dprintk("lockd: failed to create NSM upcall transport, "
+ "status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+ return PTR_ERR(clnt);
+ }
+
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
if (status == -ECONNREFUSED) {
@@ -171,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
status);
else
status = 0;
+
+ rpc_shutdown_client(clnt);
return status;
}
@@ -190,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host)
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
- struct rpc_clnt *clnt;
- const char *nodename = NULL;
dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
if (nsm->sm_monitored)
return 0;
- if (host->h_rpcclnt)
- nodename = host->h_rpcclnt->cl_nodename;
-
/*
* Choose whether to record the caller_name or IP address of
* this peer in the local rpc.statd's database.
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- clnt = nsm_client_get(host->net, nodename);
- if (IS_ERR(clnt)) {
- status = PTR_ERR(clnt);
- dprintk("lockd: failed to create NSM upcall transport, "
- "status=%d, net=%p\n", status, host->net);
- return status;
- }
-
- status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host);
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
@@ -247,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host)
if (atomic_read(&nsm->sm_count) == 1
&& nsm->sm_monitored && !nsm->sm_sticky) {
- struct lockd_net *ln = net_generic(host->net, lockd_net_id);
-
dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host);
if (res.status != 0)
status = -EIO;
if (status < 0)
@@ -259,38 +201,38 @@ void nsm_unmonitor(const struct nlm_host *host)
nsm->sm_name);
else
nsm->sm_monitored = 0;
-
- nsm_client_put(host->net);
}
}
-static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
- const size_t len)
+static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles,
+ const char *hostname, const size_t len)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (strlen(nsm->sm_name) == len &&
memcmp(nsm->sm_name, hostname, len) == 0)
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles,
+ const struct sockaddr *sap)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (rpc_cmp_addr(nsm_addr(nsm), sap))
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles,
+ const struct nsm_private *priv)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (memcmp(nsm->sm_priv.data, priv->data,
sizeof(priv->data)) == 0)
return nsm;
@@ -353,6 +295,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
/**
* nsm_get_handle - Find or create a cached nsm_handle
+ * @net: network namespace
* @sap: pointer to socket address of handle to find
* @salen: length of socket address
* @hostname: pointer to C string containing hostname to find
@@ -365,11 +308,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
* @hostname cannot be found in the handle cache. Returns NULL if
* an error occurs.
*/
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+ const struct sockaddr *sap,
const size_t salen, const char *hostname,
const size_t hostname_len)
{
struct nsm_handle *cached, *new = NULL;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
@@ -384,9 +329,10 @@ retry:
spin_lock(&nsm_lock);
if (nsm_use_hostnames && hostname != NULL)
- cached = nsm_lookup_hostname(hostname, hostname_len);
+ cached = nsm_lookup_hostname(&ln->nsm_handles,
+ hostname, hostname_len);
else
- cached = nsm_lookup_addr(sap);
+ cached = nsm_lookup_addr(&ln->nsm_handles, sap);
if (cached != NULL) {
atomic_inc(&cached->sm_count);
@@ -400,7 +346,7 @@ retry:
}
if (new != NULL) {
- list_add(&new->sm_link, &nsm_handles);
+ list_add(&new->sm_link, &ln->nsm_handles);
spin_unlock(&nsm_lock);
dprintk("lockd: created nsm_handle for %s (%s)\n",
new->sm_name, new->sm_addrbuf);
@@ -417,19 +363,22 @@ retry:
/**
* nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @net: network namespace
* @info: pointer to NLMPROC_SM_NOTIFY arguments
*
* Returns a matching nsm_handle if found in the nsm cache. The returned
* nsm_handle's reference count is bumped. Otherwise returns NULL if some
* error occurred.
*/
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+ const struct nlm_reboot *info)
{
struct nsm_handle *cached;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
spin_lock(&nsm_lock);
- cached = nsm_lookup_priv(&info->priv);
+ cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv);
if (unlikely(cached == NULL)) {
spin_unlock(&nsm_lock);
dprintk("lockd: never saw rebooted peer '%.*s' before\n",
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 097bfa3..5426189 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,9 +12,7 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- spinlock_t nsm_clnt_lock;
- unsigned int nsm_users;
- struct rpc_clnt *nsm_clnt;
+ struct list_head nsm_handles;
};
extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 55505cb..154a107 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,13 +25,17 @@
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/inetdevice.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <net/ip.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <linux/lockd/lockd.h>
#include <linux/nfs.h>
@@ -44,7 +48,7 @@
static struct svc_program nlmsvc_program;
-struct nlmsvc_binding * nlmsvc_ops;
+const struct nlmsvc_binding *nlmsvc_ops;
EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
@@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void)
static void grace_ender(struct work_struct *grace)
{
- struct delayed_work *dwork = container_of(grace, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(grace);
struct lockd_net *ln = container_of(dwork, struct lockd_net,
grace_period_end);
@@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
}
}
+static int lockd_inetaddr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inetaddr_event: removed %pI4\n",
+ &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inetaddr_notifier = {
+ .notifier_call = lockd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int lockd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inet6addr_notifier = {
+ .notifier_call = lockd_inet6addr_event,
+};
+#endif
+
+static void lockd_svc_exit_thread(void)
+{
+ unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
+ svc_exit_thread(nlmsvc_rqst);
+}
+
static int lockd_start_svc(struct svc_serv *serv)
{
int error;
@@ -315,13 +380,18 @@ static int lockd_start_svc(struct svc_serv *serv)
return 0;
out_task:
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
nlmsvc_task = NULL;
out_rqst:
nlmsvc_rqst = NULL;
return error;
}
+static struct svc_serv_ops lockd_sv_ops = {
+ .svo_shutdown = svc_rpcb_cleanup,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+};
+
static struct svc_serv *lockd_create_svc(void)
{
struct svc_serv *serv;
@@ -350,11 +420,15 @@ static struct svc_serv *lockd_create_svc(void)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
return ERR_PTR(-ENOMEM);
}
+ register_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
dprintk("lockd_up: service created\n");
return serv;
}
@@ -423,7 +497,7 @@ lockd_down(struct net *net)
}
kthread_stop(nlmsvc_task);
dprintk("lockd_down: service stopped\n");
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
dprintk("lockd_down: service destroyed\n");
nlmsvc_task = NULL;
nlmsvc_rqst = NULL;
@@ -586,7 +660,8 @@ static int lockd_init_net(struct net *net)
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
INIT_LIST_HEAD(&ln->lockd_manager.list);
- spin_lock_init(&ln->nsm_clnt_lock);
+ ln->lockd_manager.block_opens = false;
+ INIT_LIST_HEAD(&ln->nsm_handles);
return 0;
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b147d1a..09c576f 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 21171f0..fb26b9f 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/locks.c b/fs/locks.c
index d3d558b..af1ed74 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -119,7 +119,6 @@
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -205,37 +204,69 @@ static struct kmem_cache *filelock_cache __read_mostly;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
- struct file_lock_context *new;
+ struct file_lock_context *ctx;
- if (likely(inode->i_flctx) || type == F_UNLCK)
+ /* paired with cmpxchg() below */
+ ctx = smp_load_acquire(&inode->i_flctx);
+ if (likely(ctx) || type == F_UNLCK)
goto out;
- new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
- if (!new)
+ ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+ if (!ctx)
goto out;
- spin_lock_init(&new->flc_lock);
- INIT_LIST_HEAD(&new->flc_flock);
- INIT_LIST_HEAD(&new->flc_posix);
- INIT_LIST_HEAD(&new->flc_lease);
+ spin_lock_init(&ctx->flc_lock);
+ INIT_LIST_HEAD(&ctx->flc_flock);
+ INIT_LIST_HEAD(&ctx->flc_posix);
+ INIT_LIST_HEAD(&ctx->flc_lease);
/*
* Assign the pointer if it's not already assigned. If it is, then
* free the context we just allocated.
*/
- if (cmpxchg(&inode->i_flctx, NULL, new))
- kmem_cache_free(flctx_cache, new);
+ if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+ kmem_cache_free(flctx_cache, ctx);
+ ctx = smp_load_acquire(&inode->i_flctx);
+ }
out:
- return inode->i_flctx;
+ trace_locks_get_lock_context(inode, type, ctx);
+ return ctx;
+}
+
+static void
+locks_dump_ctx_list(struct list_head *list, char *list_type)
+{
+ struct file_lock *fl;
+
+ list_for_each_entry(fl, list, fl_list) {
+ pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+ }
+}
+
+static void
+locks_check_ctx_lists(struct inode *inode)
+{
+ struct file_lock_context *ctx = inode->i_flctx;
+
+ if (unlikely(!list_empty(&ctx->flc_flock) ||
+ !list_empty(&ctx->flc_posix) ||
+ !list_empty(&ctx->flc_lease))) {
+ pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
+ MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+ inode->i_ino);
+ locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
+ locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
+ locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
+ }
}
void
-locks_free_lock_context(struct file_lock_context *ctx)
+locks_free_lock_context(struct inode *inode)
{
- if (ctx) {
- WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
- WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
- WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+ struct file_lock_context *ctx = inode->i_flctx;
+
+ if (unlikely(ctx)) {
+ locks_check_ctx_lists(inode);
kmem_cache_free(flctx_cache, ctx);
}
}
@@ -762,7 +793,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock_context *ctx;
struct inode *inode = file_inode(filp);
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
@@ -930,7 +961,8 @@ out:
return error;
}
-static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ struct file_lock *conflock)
{
struct file_lock *fl, *tmp;
struct file_lock *new_fl = NULL;
@@ -1138,6 +1170,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
+ trace_posix_lock_inode(inode, request, error);
+
return error;
}
@@ -1158,7 +1192,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
int posix_lock_file(struct file *filp, struct file_lock *fl,
struct file_lock *conflock)
{
- return __posix_lock_file(file_inode(filp), fl, conflock);
+ return posix_lock_inode(file_inode(filp), fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);
@@ -1167,15 +1201,14 @@ EXPORT_SYMBOL(posix_lock_file);
* @inode: inode of file to which lock request should be applied
* @fl: The lock to be applied
*
- * Variant of posix_lock_file_wait that does not take a filp, and so can be
- * used after the filp has already been torn down.
+ * Apply a POSIX style lock request to an inode.
*/
-int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep ();
for (;;) {
- error = __posix_lock_file(inode, fl, NULL);
+ error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1187,8 +1220,8 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(posix_lock_inode_wait);
+#ifdef CONFIG_MANDATORY_FILE_LOCKING
/**
* locks_mandatory_locked - Check for an active lock
* @file: the file to check
@@ -1203,7 +1236,7 @@ int locks_mandatory_locked(struct file *file)
struct file_lock_context *ctx;
struct file_lock *fl;
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix))
return 0;
@@ -1225,20 +1258,16 @@ int locks_mandatory_locked(struct file *file)
/**
* locks_mandatory_area - Check for a conflicting lock
- * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ
- * for shared
- * @inode: the file to check
+ * @inode: the file to check
* @filp: how the file was opened (if it was)
- * @offset: start of area to check
- * @count: length of area to check
+ * @start: first byte in the file to check
+ * @end: lastbyte in the file to check
+ * @type: %F_WRLCK for a write lock, else %F_RDLCK
*
* Searches the inode's list of locks to find any POSIX locks which conflict.
- * This function is called from rw_verify_area() and
- * locks_verify_truncate().
*/
-int locks_mandatory_area(int read_write, struct inode *inode,
- struct file *filp, loff_t offset,
- size_t count)
+int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
+ loff_t end, unsigned char type)
{
struct file_lock fl;
int error;
@@ -1250,15 +1279,15 @@ int locks_mandatory_area(int read_write, struct inode *inode,
fl.fl_flags = FL_POSIX | FL_ACCESS;
if (filp && !(filp->f_flags & O_NONBLOCK))
sleep = true;
- fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
- fl.fl_start = offset;
- fl.fl_end = offset + count - 1;
+ fl.fl_type = type;
+ fl.fl_start = start;
+ fl.fl_end = end;
for (;;) {
if (filp) {
fl.fl_owner = filp;
fl.fl_flags &= ~FL_SLEEP;
- error = __posix_lock_file(inode, &fl, NULL);
+ error = posix_lock_inode(inode, &fl, NULL);
if (!error)
break;
}
@@ -1266,7 +1295,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
if (sleep)
fl.fl_flags |= FL_SLEEP;
fl.fl_owner = current->files;
- error = __posix_lock_file(inode, &fl, NULL);
+ error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
@@ -1287,6 +1316,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
}
EXPORT_SYMBOL(locks_mandatory_area);
+#endif /* CONFIG_MANDATORY_FILE_LOCKING */
static void lease_clear_pending(struct file_lock *fl, int arg)
{
@@ -1388,7 +1418,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
struct file_lock *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
@@ -1400,6 +1430,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
WARN_ON_ONCE(1);
return error;
@@ -1494,17 +1525,16 @@ EXPORT_SYMBOL(__break_lease);
void lease_get_mtime(struct inode *inode, struct timespec *time)
{
bool has_lease = false;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
struct file_lock *fl;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
- if (!list_empty(&ctx->flc_lease)) {
- fl = list_first_entry(&ctx->flc_lease,
- struct file_lock, fl_list);
- if (fl->fl_type == F_WRLCK)
- has_lease = true;
- }
+ fl = list_first_entry_or_null(&ctx->flc_lease,
+ struct file_lock, fl_list);
+ if (fl && (fl->fl_type == F_WRLCK))
+ has_lease = true;
spin_unlock(&ctx->flc_lock);
}
@@ -1543,10 +1573,11 @@ int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
struct inode *inode = file_inode(filp);
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
int type = F_UNLCK;
LIST_HEAD(dispose);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
time_out_leases(file_inode(filp), &dispose);
@@ -1568,6 +1599,7 @@ int fcntl_getlease(struct file *filp)
* desired lease.
* @dentry: dentry to check
* @arg: type of lease that we're trying to acquire
+ * @flags: current lock flags
*
* Check to see if there's an existing open fd on this file that would
* conflict with the lease we're trying to set.
@@ -1710,11 +1742,11 @@ static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
struct file_lock *fl, *victim = NULL;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx;
LIST_HEAD(dispose);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
@@ -1750,8 +1782,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
void **priv)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int error;
if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
@@ -1855,7 +1886,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
*
* Apply a FLOCK style lock request to an inode.
*/
-int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep();
@@ -1872,7 +1903,30 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(flock_lock_inode_wait);
+
+/**
+ * locks_lock_inode_wait - Apply a lock to an inode
+ * @inode: inode of the file to apply to
+ * @fl: The lock to be applied
+ *
+ * Apply a POSIX or FLOCK style lock request to an inode.
+ */
+int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+{
+ int res = 0;
+ switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+ case FL_POSIX:
+ res = posix_lock_inode_wait(inode, fl);
+ break;
+ case FL_FLOCK:
+ res = flock_lock_inode_wait(inode, fl);
+ break;
+ default:
+ BUG();
+ }
+ return res;
+}
+EXPORT_SYMBOL(locks_lock_inode_wait);
/**
* sys_flock: - flock() system call.
@@ -1930,7 +1984,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
(can_sleep) ? F_SETLKW : F_SETLK,
lock);
else
- error = flock_lock_file_wait(f.file, lock);
+ error = locks_lock_file_wait(f.file, lock);
out_free:
locks_free_lock(lock);
@@ -2106,7 +2160,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
-/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
@@ -2137,6 +2191,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
+ inode = file_inode(filp);
+
/*
* This might block, so we do it before checking the inode.
*/
@@ -2144,8 +2200,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (copy_from_user(&flock, l, sizeof(flock)))
goto out;
- inode = file_inode(filp);
-
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
@@ -2154,7 +2208,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2193,23 +2246,29 @@ again:
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by
- * releasing the lock that was just acquired.
+ * Attempt to detect a close/fcntl race and recover by releasing the
+ * lock that was just acquired. There is no need to do that when we're
+ * unlocking though, or for OFD locks.
*/
- /*
- * we need that spin_lock here - it prevents reordering between
- * update of i_flctx->flc_posix and check for it done in close().
- * rcu_read_lock() wouldn't do.
- */
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
+ trace_fcntl_setlk(inode, file_lock, error);
locks_free_lock(file_lock);
return error;
}
@@ -2294,7 +2353,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock64_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2333,17 +2391,27 @@ again:
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by
- * releasing the lock that was just acquired.
+ * Attempt to detect a close/fcntl race and recover by releasing the
+ * lock that was just acquired. There is no need to do that when we're
+ * unlocking though, or for OFD locks.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
locks_free_lock(file_lock);
return error;
@@ -2357,14 +2425,16 @@ out:
*/
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
+ int error;
struct file_lock lock;
- struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ struct file_lock_context *ctx;
/*
* If there are no locks held on this file, we don't need to call
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
+ ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2378,17 +2448,18 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
lock.fl_ops = NULL;
lock.fl_lmops = NULL;
- vfs_lock_file(filp, F_SETLK, &lock, NULL);
+ error = vfs_lock_file(filp, F_SETLK, &lock, NULL);
if (lock.fl_ops && lock.fl_ops->fl_release_private)
lock.fl_ops->fl_release_private(&lock);
+ trace_locks_remove_posix(file_inode(filp), &lock, error);
}
EXPORT_SYMBOL(locks_remove_posix);
/* The i_flctx must be valid when calling into here */
static void
-locks_remove_flock(struct file *filp)
+locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
struct file_lock fl = {
.fl_owner = filp,
@@ -2399,7 +2470,6 @@ locks_remove_flock(struct file *filp)
.fl_end = OFFSET_MAX,
};
struct inode *inode = file_inode(filp);
- struct file_lock_context *flctx = inode->i_flctx;
if (list_empty(&flctx->flc_flock))
return;
@@ -2415,10 +2485,8 @@ locks_remove_flock(struct file *filp)
/* The i_flctx must be valid when calling into here */
static void
-locks_remove_lease(struct file *filp)
+locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
- struct inode *inode = file_inode(filp);
- struct file_lock_context *ctx = inode->i_flctx;
struct file_lock *fl, *tmp;
LIST_HEAD(dispose);
@@ -2438,17 +2506,20 @@ locks_remove_lease(struct file *filp)
*/
void locks_remove_file(struct file *filp)
{
- if (!file_inode(filp)->i_flctx)
+ struct file_lock_context *ctx;
+
+ ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+ if (!ctx)
return;
/* remove any OFD locks */
locks_remove_posix(filp, filp);
/* remove flock locks */
- locks_remove_flock(filp);
+ locks_remove_flock(filp, ctx);
/* remove any leases */
- locks_remove_lease(filp);
+ locks_remove_lease(filp, ctx);
}
/**
@@ -2615,7 +2686,7 @@ void show_fd_locks(struct seq_file *f,
struct file_lock_context *ctx;
int id = 0;
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx)
return;
@@ -2677,7 +2748,7 @@ static int __init proc_locks_init(void)
proc_create("locks", 0, NULL, &proc_locks_operations);
return 0;
}
-module_init(proc_locks_init);
+fs_initcall(proc_locks_init);
#endif
static int __init filelock_init(void)
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066..2b45031 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
config LOGFS
tristate "LogFS file system"
- depends on (MTD || BLOCK)
+ depends on MTD || (!MTD && BLOCK)
select ZLIB_INFLATE
select ZLIB_DEFLATE
select CRC32
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 76279e1..a709d80 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -53,16 +53,14 @@ static int bdev_readpage(void *_sb, struct page *page)
static DECLARE_WAIT_QUEUE_HEAD(wq);
-static void writeseg_end_io(struct bio *bio, int err)
+static void writeseg_end_io(struct bio *bio)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct bio_vec *bvec;
int i;
struct super_block *sb = bio->bi_private;
struct logfs_super *super = logfs_super(sb);
- BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
- BUG_ON(err);
+ BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
bio_for_each_segment_all(bvec, bio, i) {
end_page_writeback(bvec->bv_page);
@@ -83,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
@@ -153,14 +151,12 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
}
-static void erase_end_io(struct bio *bio, int err)
+static void erase_end_io(struct bio *bio)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct super_block *sb = bio->bi_private;
struct logfs_super *super = logfs_super(sb);
- BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
- BUG_ON(err);
+ BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
BUG_ON(bio->bi_vcnt == 0);
bio_put(bio);
if (atomic_dec_and_test(&super->s_pending_writes))
@@ -175,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f9b45d4..542468e 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -528,7 +528,8 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
return __logfs_create(dir, dentry, inode, target, destlen);
@@ -776,12 +777,6 @@ fail:
return -EIO;
}
-const struct inode_operations logfs_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
-
const struct inode_operations logfs_dir_iops = {
.create = logfs_create,
.link = logfs_link,
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index af49e2d..db9cfc5 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -64,7 +64,8 @@ static void logfs_inode_setops(struct inode *inode)
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFLNK:
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFSOCK: /* fall through */
@@ -408,7 +409,8 @@ const struct super_operations logfs_super_operations = {
int logfs_init_inode_cache(void)
{
logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
- sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+ sizeof(struct logfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
logfs_init_once);
if (!logfs_inode_cache)
return -ENOMEM;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 5f09376..27d040e 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -302,7 +302,7 @@ struct logfs_block {
struct inode *inode;
struct logfs_transaction *ta;
unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
- struct logfs_block_ops *ops;
+ const struct logfs_block_ops *ops;
int full;
int partial;
int reserved_bytes;
@@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
#endif
/* dev_mtd.c */
-#ifdef CONFIG_MTD
+#if IS_ENABLED(CONFIG_MTD)
int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
#else
static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
@@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
#endif
/* dir.c */
-extern const struct inode_operations logfs_symlink_iops;
extern const struct inode_operations logfs_dir_iops;
extern const struct file_operations logfs_dir_fops;
int logfs_replay_journal(struct super_block *sb);
@@ -579,7 +578,7 @@ int logfs_exist_block(struct inode *inode, u64 bix);
int get_page_reserve(struct inode *inode, struct page *page);
void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
-extern struct logfs_block_ops indirect_block_ops;
+extern const struct logfs_block_ops indirect_block_ops;
/* segment.c */
int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 380d86e..20973c9 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -569,13 +569,13 @@ static void indirect_free_block(struct super_block *sb,
}
-static struct logfs_block_ops inode_block_ops = {
+static const struct logfs_block_ops inode_block_ops = {
.write_block = inode_write_block,
.free_block = inode_free_block,
.write_alias = inode_write_alias,
};
-struct logfs_block_ops indirect_block_ops = {
+const struct logfs_block_ops indirect_block_ops = {
.write_block = indirect_write_block,
.free_block = indirect_free_block,
.write_alias = indirect_write_alias,
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 7f9b096..d270e4b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
filler_t *filler = super->s_devops->readpage;
struct page *page;
- BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+ BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
if (use_filler)
page = read_cache_page(mapping, index, filler, sb);
else {
@@ -197,7 +197,7 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
return 0;
}
-static struct logfs_block_ops btree_block_ops = {
+static const struct logfs_block_ops btree_block_ops = {
.write_block = btree_write_block,
.free_block = __free_block,
.write_alias = btree_write_alias,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 086cd0a..f975d66 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
@@ -435,8 +435,7 @@ static const struct address_space_operations minix_aops = {
static const struct inode_operations minix_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = minix_getattr,
};
@@ -452,6 +451,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_mapping->a_ops = &minix_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &minix_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 282e15a..46ca39d 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -24,16 +24,15 @@ static inline block_t *i_data(struct inode *inode)
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
- char b[BDEVNAME_SIZE];
if (block < 0) {
- printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
- block, bdevname(inode->i_sb->s_bdev, b));
+ printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+ block, inode->i_sb->s_bdev);
} else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
if (printk_ratelimit())
printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %s\n",
- block, bdevname(inode->i_sb->s_bdev, b));
+ "block %ld too big on dev %pg\n",
+ block, inode->i_sb->s_bdev);
} else if (block < 7) {
offsets[n++] = block;
} else if ((block -= 7) < 512) {
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index 78e2d93..1ee1013 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -26,18 +26,17 @@ static inline block_t *i_data(struct inode *inode)
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
- char b[BDEVNAME_SIZE];
struct super_block *sb = inode->i_sb;
if (block < 0) {
- printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
- block, bdevname(sb->s_bdev, b));
+ printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+ block, sb->s_bdev);
} else if ((u64)block * (u64)sb->s_blocksize >=
minix_sb(sb)->s_max_size) {
if (printk_ratelimit())
printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %s\n",
- block, bdevname(sb->s_bdev, b));
+ "block %ld too big on dev %pg\n",
+ block, sb->s_bdev);
} else if (block < DIRCOUNT) {
offsets[n++] = block;
} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
diff --git a/fs/mpage.c b/fs/mpage.c
index ca0244b..1480d3a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -42,14 +42,14 @@
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
-static void mpage_end_io(struct bio *bio, int err)
+static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
int i;
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- page_endio(page, bio_data_dir(bio), err);
+ page_endio(page, bio_data_dir(bio), bio->bi_error);
}
bio_put(bio);
@@ -139,7 +139,8 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
sector_t *last_block_in_bio, struct buffer_head *map_bh,
- unsigned long *first_logical_block, get_block_t get_block)
+ unsigned long *first_logical_block, get_block_t get_block,
+ gfp_t gfp)
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
@@ -277,8 +278,7 @@ alloc_new:
goto out;
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
- GFP_KERNEL);
+ min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
if (bio == NULL)
goto confused;
}
@@ -361,6 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -370,12 +371,13 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
prefetchw(&page->flags);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL)) {
+ page->index,
+ gfp)) {
bio = do_mpage_readpage(bio, page,
nr_pages - page_idx,
&last_block_in_bio, &map_bh,
&first_logical_block,
- get_block);
+ get_block, gfp);
}
page_cache_release(page);
}
@@ -395,11 +397,12 @@ int mpage_readpage(struct page *page, get_block_t get_block)
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
- &map_bh, &first_logical_block, get_block);
+ &map_bh, &first_logical_block, get_block, gfp);
if (bio)
mpage_bio_submit(READ, bio);
return 0;
@@ -482,6 +485,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -590,7 +594,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -602,7 +606,7 @@ alloc_new:
}
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
+ BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
@@ -617,7 +621,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -627,7 +631,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -639,7 +643,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +699,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -713,8 +720,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namei.c b/fs/namei.c
index 1c2105e..bceefd5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,13 +505,13 @@ struct nameidata {
int total_link_count;
struct saved {
struct path link;
- void *cookie;
+ struct delayed_call done;
const char *name;
- struct inode *inode;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
+ struct inode *link_inode;
unsigned root_seq;
int dfd;
};
@@ -534,10 +534,8 @@ static void restore_nameidata(void)
current->nameidata = old;
if (old)
old->total_link_count = now->total_link_count;
- if (now->stack != now->internal) {
+ if (now->stack != now->internal)
kfree(now->stack);
- now->stack = now->internal;
- }
}
static int __nd_alloc_stack(struct nameidata *nd)
@@ -560,6 +558,24 @@ static int __nd_alloc_stack(struct nameidata *nd)
return 0;
}
+/**
+ * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
+ * @path: nameidate to verify
+ *
+ * Rename can sometimes move a file or directory outside of a bind
+ * mount, path_connected allows those cases to be detected.
+ */
+static bool path_connected(const struct path *path)
+{
+ struct vfsmount *mnt = path->mnt;
+
+ /* Only bind mounts can have disconnected paths */
+ if (mnt->mnt_root == mnt->mnt_sb->s_root)
+ return true;
+
+ return is_subdir(path->dentry, mnt->mnt_root);
+}
+
static inline int nd_alloc_stack(struct nameidata *nd)
{
if (likely(nd->depth != EMBEDDED_LEVELS))
@@ -574,11 +590,8 @@ static void drop_links(struct nameidata *nd)
int i = nd->depth;
while (i--) {
struct saved *last = nd->stack + i;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link) {
- inode->i_op->put_link(inode, last->cookie);
- last->cookie = NULL;
- }
+ do_delayed_call(&last->done);
+ clear_delayed_call(&last->done);
}
}
@@ -639,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd)
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
- * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * normal reference counts on dentries and vfsmounts to transition to ref-walk
* mode. Refcounts are grabbed at the last known good point before rcu-walk
* got stuck, so ref-walk may continue from there. If this is not successful
* (eg. a seqcount has changed), then failure is returned and it's up to caller
@@ -789,19 +802,19 @@ static int complete_walk(struct nameidata *nd)
static void set_root(struct nameidata *nd)
{
- get_fs_root(current->fs, &nd->root);
-}
-
-static void set_root_rcu(struct nameidata *nd)
-{
struct fs_struct *fs = current->fs;
- unsigned seq;
- do {
- seq = read_seqcount_begin(&fs->seq);
- nd->root = fs->root;
- nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ if (nd->flags & LOOKUP_RCU) {
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ nd->root = fs->root;
+ nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ } while (read_seqcount_retry(&fs->seq, seq));
+ } else {
+ get_fs_root(fs, &nd->root);
+ }
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -823,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path,
nd->path.dentry = path->dentry;
}
+static int nd_jump_root(struct nameidata *nd)
+{
+ if (nd->flags & LOOKUP_RCU) {
+ struct dentry *d;
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ nd->inode = d->d_inode;
+ nd->seq = nd->root_seq;
+ if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+ return -ECHILD;
+ } else {
+ path_put(&nd->path);
+ nd->path = nd->root;
+ path_get(&nd->path);
+ nd->inode = nd->path.dentry->d_inode;
+ }
+ nd->flags |= LOOKUP_JUMPED;
+ return 0;
+}
+
/*
- * Helper to directly jump to a known parsed path from ->follow_link,
+ * Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
void nd_jump_link(struct path *path)
@@ -840,9 +873,7 @@ void nd_jump_link(struct path *path)
static inline void put_link(struct nameidata *nd)
{
struct saved *last = nd->stack + --nd->depth;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link)
- inode->i_op->put_link(inode, last->cookie);
+ do_delayed_call(&last->done);
if (!(nd->flags & LOOKUP_RCU))
path_put(&last->link);
}
@@ -874,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd)
return 0;
/* Allowed if owner and follower match. */
- inode = nd->stack[0].inode;
+ inode = nd->link_inode;
if (uid_eq(current_cred()->fsuid, inode->i_uid))
return 0;
@@ -937,26 +968,23 @@ static bool safe_hardlink_source(struct inode *inode)
* - sysctl_protected_hardlinks enabled
* - fsuid does not match inode
* - hardlink source is unsafe (see safe_hardlink_source() above)
- * - not CAP_FOWNER
+ * - not CAP_FOWNER in a namespace with the inode owner uid mapped
*
* Returns 0 if successful, -ve on error.
*/
static int may_linkat(struct path *link)
{
- const struct cred *cred;
struct inode *inode;
if (!sysctl_protected_hardlinks)
return 0;
- cred = current_cred();
inode = link->dentry->d_inode;
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
- if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
- capable(CAP_FOWNER))
+ if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
return 0;
audit_log_link_denied("linkat", link);
@@ -968,7 +996,7 @@ const char *get_link(struct nameidata *nd)
{
struct saved *last = nd->stack + nd->depth - 1;
struct dentry *dentry = last->link.dentry;
- struct inode *inode = last->inode;
+ struct inode *inode = nd->link_inode;
int error;
const char *res;
@@ -989,36 +1017,27 @@ const char *get_link(struct nameidata *nd)
nd->last_type = LAST_BIND;
res = inode->i_link;
if (!res) {
+ const char * (*get)(struct dentry *, struct inode *,
+ struct delayed_call *);
+ get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
- if (unlikely(unlazy_walk(nd, NULL, 0)))
- return ERR_PTR(-ECHILD);
+ res = get(NULL, inode, &last->done);
+ if (res == ERR_PTR(-ECHILD)) {
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
+ return ERR_PTR(-ECHILD);
+ res = get(dentry, inode, &last->done);
+ }
+ } else {
+ res = get(dentry, inode, &last->done);
}
- res = inode->i_op->follow_link(dentry, &last->cookie);
- if (IS_ERR_OR_NULL(res)) {
- last->cookie = NULL;
+ if (IS_ERR_OR_NULL(res))
return res;
- }
}
if (*res == '/') {
- if (nd->flags & LOOKUP_RCU) {
- struct dentry *d;
- if (!nd->root.mnt)
- set_root_rcu(nd);
- nd->path = nd->root;
- d = nd->path.dentry;
- nd->inode = d->d_inode;
- nd->seq = nd->root_seq;
- if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
- return ERR_PTR(-ECHILD);
- } else {
- if (!nd->root.mnt)
- set_root(nd);
- path_put(&nd->path);
- nd->path = nd->root;
- path_get(&nd->root);
- nd->inode = nd->path.dentry->d_inode;
- }
- nd->flags |= LOOKUP_JUMPED;
+ if (!nd->root.mnt)
+ set_root(nd);
+ if (unlikely(nd_jump_root(nd)))
+ return ERR_PTR(-ECHILD);
while (unlikely(*++res == '/'))
;
}
@@ -1279,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
static int follow_dotdot_rcu(struct nameidata *nd)
{
struct inode *inode = nd->inode;
- if (!nd->root.mnt)
- set_root_rcu(nd);
while (1) {
if (path_equal(&nd->path, &nd->root))
@@ -1296,6 +1313,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
return -ECHILD;
nd->path.dentry = parent;
nd->seq = seq;
+ if (unlikely(!path_connected(&nd->path)))
+ return -ENOENT;
break;
} else {
struct mount *mnt = real_mount(nd->path.mnt);
@@ -1396,11 +1415,8 @@ static void follow_mount(struct path *path)
}
}
-static void follow_dotdot(struct nameidata *nd)
+static int follow_dotdot(struct nameidata *nd)
{
- if (!nd->root.mnt)
- set_root(nd);
-
while(1) {
struct dentry *old = nd->path.dentry;
@@ -1412,6 +1428,8 @@ static void follow_dotdot(struct nameidata *nd)
/* rare case of legitimate dget_parent()... */
nd->path.dentry = dget_parent(nd->path.dentry);
dput(old);
+ if (unlikely(!path_connected(&nd->path)))
+ return -ENOENT;
break;
}
if (!follow_up(&nd->path))
@@ -1419,6 +1437,7 @@ static void follow_dotdot(struct nameidata *nd)
}
follow_mount(&nd->path);
nd->inode = nd->path.dentry->d_inode;
+ return 0;
}
/*
@@ -1535,8 +1554,6 @@ static int lookup_fast(struct nameidata *nd,
negative = d_is_negative(dentry);
if (read_seqcount_retry(&dentry->d_seq, seq))
return -ECHILD;
- if (negative)
- return -ENOENT;
/*
* This sequence count validates that the parent had no
@@ -1557,6 +1574,12 @@ static int lookup_fast(struct nameidata *nd,
goto unlazy;
}
}
+ /*
+ * Note: do negative dentry check after revalidation in
+ * case that drops it.
+ */
+ if (negative)
+ return -ENOENT;
path->mnt = mnt;
path->dentry = dentry;
if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
@@ -1631,10 +1654,12 @@ static inline int may_lookup(struct nameidata *nd)
static inline int handle_dots(struct nameidata *nd, int type)
{
if (type == LAST_DOTDOT) {
+ if (!nd->root.mnt)
+ set_root(nd);
if (nd->flags & LOOKUP_RCU) {
return follow_dotdot_rcu(nd);
} else
- follow_dotdot(nd);
+ return follow_dotdot(nd);
}
return 0;
}
@@ -1667,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link,
last = nd->stack + nd->depth++;
last->link = *link;
- last->cookie = NULL;
- last->inode = inode;
+ clear_delayed_call(&last->done);
+ nd->link_inode = inode;
last->seq = seq;
return 1;
}
@@ -1942,7 +1967,7 @@ OK:
if (err) {
const char *s = get_link(nd);
- if (unlikely(IS_ERR(s)))
+ if (IS_ERR(s))
return PTR_ERR(s);
err = 0;
if (unlikely(!s)) {
@@ -1972,7 +1997,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
- nd->total_link_count = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
@@ -1997,18 +2021,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
nd->root.mnt = NULL;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
nd->m_seq = read_seqbegin(&mount_lock);
if (*s == '/') {
- if (flags & LOOKUP_RCU) {
+ if (flags & LOOKUP_RCU)
rcu_read_lock();
- set_root_rcu(nd);
- nd->seq = nd->root_seq;
- } else {
- set_root(nd);
- path_get(&nd->root);
- }
- nd->path = nd->root;
+ set_root(nd);
+ if (likely(!nd_jump_root(nd)))
+ return s;
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ return ERR_PTR(-ECHILD);
} else if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
@@ -2019,11 +2044,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
+ nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
+ nd->inode = nd->path.dentry->d_inode;
}
+ return s;
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
@@ -2053,16 +2081,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
fdput(f);
return s;
}
-
- nd->inode = nd->path.dentry->d_inode;
- if (!(flags & LOOKUP_RCU))
- return s;
- if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
- return s;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- return ERR_PTR(-ECHILD);
}
static const char *trailing_symlink(struct nameidata *nd)
@@ -2255,6 +2273,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2298,6 +2318,75 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ /*
+ * __d_lookup() is used to try to get a quick answer and avoid the
+ * mutex. A false-negative does no harm.
+ */
+ ret = __d_lookup(base, &this);
+ if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
+ dput(ret);
+ ret = NULL;
+ }
+ if (ret)
+ return ret;
+
+ mutex_lock(&base->d_inode->i_mutex);
+ ret = __lookup_hash(&this, base, 0);
+ mutex_unlock(&base->d_inode->i_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
@@ -2415,7 +2504,7 @@ done:
/**
* path_mountpoint - look up a path to be umounted
- * @nameidata: lookup context
+ * @nd: lookup context
* @flags: lookup flags
* @path: pointer to container for result
*
@@ -2646,10 +2735,6 @@ static int may_open(struct path *path, int acc_mode, int flag)
struct inode *inode = dentry->d_inode;
int error;
- /* O_PATH? */
- if (!acc_mode)
- return 0;
-
if (!inode)
return -ENOENT;
@@ -2671,7 +2756,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
}
- error = inode_permission(inode, acc_mode);
+ error = inode_permission(inode, MAY_OPEN | acc_mode);
if (error)
return error;
@@ -2863,7 +2948,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
if (*opened & FILE_CREATED) {
WARN_ON(!(open_flag & O_CREAT));
fsnotify_create(dir, dentry);
- acc_mode = MAY_OPEN;
+ acc_mode = 0;
}
error = may_open(&file->f_path, acc_mode, open_flag);
if (error)
@@ -3076,7 +3161,7 @@ retry_lookup:
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
will_truncate = false;
- acc_mode = MAY_OPEN;
+ acc_mode = 0;
path_to_nameidata(&path, nd);
goto finish_open_created;
}
@@ -3160,10 +3245,11 @@ finish_open:
got_write = true;
}
finish_open_created:
- error = may_open(&nd->path, acc_mode, open_flag);
- if (error)
- goto out;
-
+ if (likely(!(open_flag & O_PATH))) {
+ error = may_open(&nd->path, acc_mode, open_flag);
+ if (error)
+ goto out;
+ }
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
if (!error) {
@@ -3250,7 +3336,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
goto out2;
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
- error = may_open(&path, MAY_OPEN, op->open_flag);
+ error = may_open(&path, 0, op->open_flag);
if (error)
goto out2;
file->f_path.mnt = path.mnt;
@@ -3356,7 +3442,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return ERR_PTR(-ELOOP);
filename = getname_kernel(name);
- if (unlikely(IS_ERR(filename)))
+ if (IS_ERR(filename))
return ERR_CAST(filename);
set_nameidata(&nd, -1, filename);
@@ -4472,72 +4558,73 @@ EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
- * have ->follow_link() touching nd only in nd_set_link(). Using (or not
- * using) it for any given inode is up to filesystem.
+ * have ->get_link() not calling nd_jump_link(). Using (or not using) it
+ * for any given inode is up to filesystem.
*/
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- void *cookie;
+ DEFINE_DELAYED_CALL(done);
struct inode *inode = d_inode(dentry);
const char *link = inode->i_link;
int res;
if (!link) {
- link = inode->i_op->follow_link(dentry, &cookie);
+ link = inode->i_op->get_link(dentry, inode, &done);
if (IS_ERR(link))
return PTR_ERR(link);
}
res = readlink_copy(buffer, buflen, link);
- if (inode->i_op->put_link)
- inode->i_op->put_link(inode, cookie);
+ do_delayed_call(&done);
return res;
}
EXPORT_SYMBOL(generic_readlink);
/* get the link contents into pagecache */
-static char *page_getlink(struct dentry * dentry, struct page **ppage)
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
char *kaddr;
struct page *page;
- struct address_space *mapping = dentry->d_inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- return (char*)page;
- *ppage = page;
- kaddr = kmap(page);
- nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+ struct address_space *mapping = inode->i_mapping;
+
+ if (!dentry) {
+ page = find_get_page(mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ page = read_mapping_page(mapping, 0, NULL);
+ if (IS_ERR(page))
+ return (char*)page;
+ }
+ set_delayed_call(callback, page_put_link, page);
+ BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+ kaddr = page_address(page);
+ nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
-int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
- struct page *page = NULL;
- int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
- return res;
-}
-EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_get_link);
-const char *page_follow_link_light(struct dentry *dentry, void **cookie)
+void page_put_link(void *arg)
{
- struct page *page = NULL;
- char *res = page_getlink(dentry, &page);
- if (!IS_ERR(res))
- *cookie = page;
- return res;
+ put_page(arg);
}
-EXPORT_SYMBOL(page_follow_link_light);
+EXPORT_SYMBOL(page_put_link);
-void page_put_link(struct inode *unused, void *cookie)
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- struct page *page = cookie;
- kunmap(page);
- page_cache_release(page);
+ DEFINE_DELAYED_CALL(done);
+ int res = readlink_copy(buffer, buflen,
+ page_get_link(dentry, d_inode(dentry),
+ &done));
+ do_delayed_call(&done);
+ return res;
}
-EXPORT_SYMBOL(page_put_link);
+EXPORT_SYMBOL(page_readlink);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4548,7 +4635,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
struct page *page;
void *fsdata;
int err;
- char *kaddr;
unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
if (nofs)
flags |= AOP_FLAG_NOFS;
@@ -4559,9 +4645,7 @@ retry:
if (err)
goto fail;
- kaddr = kmap_atomic(page);
- memcpy(kaddr, symname, len-1);
- kunmap_atomic(kaddr);
+ memcpy(page_address(page), symname, len-1);
err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
@@ -4580,13 +4664,12 @@ EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+ !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
}
EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2b8aa15..a830e14 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1584,6 +1584,14 @@ static inline bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
+static inline bool may_mandlock(void)
+{
+#ifndef CONFIG_MANDATORY_FILE_LOCKING
+ return false;
+#endif
+ return capable(CAP_SYS_ADMIN);
+}
+
/*
* Now umount can handle mount points as well as block devices.
* This is important for filesystems which use unnamed block devices.
@@ -2601,18 +2609,18 @@ static long exact_copy_from_user(void *to, const void __user * from,
return n;
}
-int copy_mount_options(const void __user * data, unsigned long *where)
+void *copy_mount_options(const void __user * data)
{
int i;
- unsigned long page;
unsigned long size;
+ char *copy;
- *where = 0;
if (!data)
- return 0;
+ return NULL;
- if (!(page = __get_free_page(GFP_KERNEL)))
- return -ENOMEM;
+ copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!copy)
+ return ERR_PTR(-ENOMEM);
/* We only care that *some* data at the address the user
* gave us is valid. Just in case, we'll zero
@@ -2623,15 +2631,14 @@ int copy_mount_options(const void __user * data, unsigned long *where)
if (size > PAGE_SIZE)
size = PAGE_SIZE;
- i = size - exact_copy_from_user((void *)page, data, size);
+ i = size - exact_copy_from_user(copy, data, size);
if (!i) {
- free_page(page);
- return -EFAULT;
+ kfree(copy);
+ return ERR_PTR(-EFAULT);
}
if (i != PAGE_SIZE)
- memset((char *)page + i, 0, PAGE_SIZE - i);
- *where = page;
- return 0;
+ memset(copy + i, 0, PAGE_SIZE - i);
+ return copy;
}
char *copy_mount_string(const void __user *data)
@@ -2677,6 +2684,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
type_page, flags, data_page);
if (!retval && !may_mount())
retval = -EPERM;
+ if (!retval && (flags & MS_MANDLOCK) && !may_mandlock())
+ retval = -EPERM;
if (retval)
goto dput_out;
@@ -2896,7 +2905,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
int ret;
char *kernel_type;
char *kernel_dev;
- unsigned long data_page;
+ void *options;
kernel_type = copy_mount_string(type);
ret = PTR_ERR(kernel_type);
@@ -2908,14 +2917,14 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
if (IS_ERR(kernel_dev))
goto out_dev;
- ret = copy_mount_options(data, &data_page);
- if (ret < 0)
+ options = copy_mount_options(data);
+ ret = PTR_ERR(options);
+ if (IS_ERR(options))
goto out_data;
- ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
- (void *) data_page);
+ ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
- free_page(data_page);
+ kfree(options);
out_data:
kfree(kernel_dev);
out_dev:
@@ -2939,9 +2948,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
-int path_is_under(struct path *path1, struct path *path2)
+bool path_is_under(struct path *path1, struct path *path2)
{
- int res;
+ bool res;
read_seqlock_excl(&mount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
read_sequnlock_excl(&mount_lock);
@@ -3218,6 +3227,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
+ int mnt_flags;
+
if (mnt->mnt.mnt_sb->s_type != type)
continue;
@@ -3227,17 +3238,30 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;
+ /* Read the mount flags and filter out flags that
+ * may safely be ignored.
+ */
+ mnt_flags = mnt->mnt.mnt_flags;
+ if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
+ mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
+
/* Verify the mount flags are equal to or more permissive
* than the proposed new mount.
*/
- if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+ if ((mnt_flags & MNT_LOCK_READONLY) &&
!(new_flags & MNT_READONLY))
continue;
- if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+ if ((mnt_flags & MNT_LOCK_NODEV) &&
!(new_flags & MNT_NODEV))
continue;
- if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
- ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
+ if ((mnt_flags & MNT_LOCK_NOSUID) &&
+ !(new_flags & MNT_NOSUID))
+ continue;
+ if ((mnt_flags & MNT_LOCK_NOEXEC) &&
+ !(new_flags & MNT_NOEXEC))
+ continue;
+ if ((mnt_flags & MNT_LOCK_ATIME) &&
+ ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;
/* This mount is not fully visible if there are any
@@ -3247,16 +3271,18 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
struct inode *inode = child->mnt_mountpoint->d_inode;
/* Only worry about locked mounts */
- if (!(mnt->mnt.mnt_flags & MNT_LOCKED))
+ if (!(mnt_flags & MNT_LOCKED))
continue;
/* Is the directory permanetly empty? */
if (!is_empty_dir_inode(inode))
goto next;
}
/* Preserve the locked attributes */
- *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
- MNT_LOCK_NODEV | \
- MNT_LOCK_ATIME);
+ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
+ MNT_LOCK_NODEV | \
+ MNT_LOCK_NOSUID | \
+ MNT_LOCK_NOEXEC | \
+ MNT_LOCK_ATIME);
visible = true;
goto found;
next: ;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 93575e9..f0e3e9e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -597,7 +597,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
qname.name = __name;
newdent = d_hash_and_lookup(dentry, &qname);
- if (unlikely(IS_ERR(newdent)))
+ if (IS_ERR(newdent))
goto end_advance;
if (!newdent) {
newdent = d_alloc(dentry, &qname);
@@ -1165,8 +1165,6 @@ out:
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- if (!new_valid_dev(rdev))
- return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
ncp_dbg(1, "mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9605a2f..1af15fc 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
sizeof(struct ncp_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ncp_inode_cachep == NULL)
return -ENOMEM;
@@ -244,8 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
static const struct inode_operations ncp_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ncp_notify_change,
};
#endif
@@ -283,6 +282,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &ncp_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &ncp_symlink_aops;
#endif
} else {
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 79b1130..0a3f9b5 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -525,6 +525,8 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
switch (rqdata.cmd) {
case NCP_LOCK_EX:
case NCP_LOCK_SH:
+ if (rqdata.timeout < 0)
+ return -EINVAL;
if (rqdata.timeout == 0)
rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT;
else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index d2554fe..ddd0138 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -116,7 +116,7 @@ bl_submit_bio(int rw, struct bio *bio)
static struct bio *
bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
- void (*end_io)(struct bio *, int err), struct parallel_io *par)
+ bio_end_io_t end_io, struct parallel_io *par)
{
struct bio *bio;
@@ -139,8 +139,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
static struct bio *
do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
struct page *page, struct pnfs_block_dev_map *map,
- struct pnfs_block_extent *be,
- void (*end_io)(struct bio *, int err),
+ struct pnfs_block_extent *be, bio_end_io_t end_io,
struct parallel_io *par, unsigned int offset, int *len)
{
struct pnfs_block_dev *dev =
@@ -183,11 +182,11 @@ retry:
return bio;
}
-static void bl_end_io_read(struct bio *bio, int err)
+static void bl_end_io_read(struct bio *bio)
{
struct parallel_io *par = bio->bi_private;
- if (err) {
+ if (bio->bi_error) {
struct nfs_pgio_header *header = par->data;
if (!header->pnfs_error)
@@ -230,7 +229,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
struct parallel_io *par;
loff_t f_offset = header->args.offset;
size_t bytes_left = header->args.count;
- unsigned int pg_offset, pg_len;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
struct page **pages = header->args.pages;
int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
@@ -263,7 +262,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
@@ -274,9 +272,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
pg_len = PAGE_CACHE_SIZE;
}
- isect += (pg_offset >> SECTOR_SHIFT);
- extent_length -= (pg_offset >> SECTOR_SHIFT);
-
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
@@ -302,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
+ pg_offset = 0;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
@@ -316,13 +312,12 @@ out:
return PNFS_ATTEMPTED;
}
-static void bl_end_io_write(struct bio *bio, int err)
+static void bl_end_io_write(struct bio *bio)
{
struct parallel_io *par = bio->bi_private;
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nfs_pgio_header *header = par->data;
- if (!uptodate) {
+ if (bio->bi_error) {
if (!header->pnfs_error)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e..c556640 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
struct pnfs_block_dev;
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
-
#define PNFS_BLOCK_MAX_UUIDS 4
#define PNFS_BLOCK_MAX_DEVICES 64
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
struct pnfs_block_dev_map *map);
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
-};
-
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
- enum exstate4 be_state; /* the state of this extent */
+ enum pnfs_block_extent_state be_state; /* the state of this extent */
#define EXTENT_WRITTEN 1
#define EXTENT_COMMITTING 2
unsigned int be_tag;
};
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct rb_root bl_ext_rw;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e535599..a861bbd 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
kfree(dev->children);
} else {
if (dev->bdev)
- blkdev_put(dev->bdev, FMODE_READ);
+ blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
}
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
return -EIO;
p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+ if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+ pr_info("signature too long: %d\n",
+ b->simple.sigs[i].sig_len);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
if (!p)
@@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
if (IS_ERR(d->bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 31d0b5e..c59a59c 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,6 +462,12 @@ out:
return err;
}
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+ return sizeof(__be32) /* number of entries */ +
+ PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
size_t buffer_size)
{
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
(*count)++;
- if (*count * BL_EXTENT_SIZE > buffer_size) {
+ if (ext_tree_layoutupdate_size(*count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
@@ -530,7 +536,7 @@ retry:
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ buffer_size = ext_tree_layoutupdate_size(count);
count = 0;
arg->layoutupdate_pages =
@@ -549,17 +555,14 @@ retry:
}
*start_p = cpu_to_be32(count);
- arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
- __be32 *p = start_p;
+ void *p = start_p, *end = p + arg->layoutupdate_len;
int i = 0;
- for (p = start_p;
- p < start_p + arg->layoutupdate_len;
- p += PAGE_SIZE) {
+ for ( ; p < end; p += PAGE_SIZE)
arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
- }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 682529c..a7f2e6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,17 +99,6 @@ nfs4_callback_up(struct svc_serv *serv)
}
#if defined(CONFIG_NFS_V4_1)
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- /*
- * Create an svc_sock for the back channel service that shares the
- * fore channel connection.
- * Returns the input port (0) and sets the svc_serv bc_xprt on success
- */
- return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
- SVC_SOCK_ANONYMOUS);
-}
-
/*
* The callback service for NFSv4.1 callbacks
*/
@@ -162,10 +151,6 @@ nfs41_callback_up(struct svc_serv *serv)
spin_lock_init(&serv->sv_cb_lock);
init_waitqueue_head(&serv->sv_cb_waitq);
rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
- if (IS_ERR(rqstp)) {
- svc_xprt_put(serv->sv_bc_xprt);
- serv->sv_bc_xprt = NULL;
- }
dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
return rqstp;
}
@@ -188,11 +173,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
xprt->bc_serv = serv;
}
#else
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- return 0;
-}
-
static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
{
@@ -263,7 +243,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
svc_shutdown_net(serv, net);
}
-static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ struct net *net, struct rpc_xprt *xprt)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
int ret;
@@ -279,20 +260,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
goto err_bind;
}
- switch (minorversion) {
- case 0:
- ret = nfs4_callback_up_net(serv, net);
- break;
- case 1:
- case 2:
- ret = nfs41_callback_up_net(serv, net);
- break;
- default:
- printk(KERN_ERR "NFS: unknown callback version: %d\n",
- minorversion);
- ret = -EINVAL;
- break;
- }
+ ret = -EPROTONOSUPPORT;
+ if (minorversion == 0)
+ ret = nfs4_callback_up_net(serv, net);
+ else if (xprt->ops->bc_up)
+ ret = xprt->ops->bc_up(serv, net);
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
@@ -308,6 +280,10 @@ err_bind:
return ret;
}
+static struct svc_serv_ops nfs_cb_sv_ops = {
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+};
+
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
@@ -333,7 +309,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
@@ -364,7 +340,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
goto err_create;
}
- ret = nfs_callback_up_net(minorversion, serv, net);
+ ret = nfs_callback_up_net(minorversion, serv, net, xprt);
if (ret < 0)
goto err_net;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 84326e9..ff8195b 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -61,7 +61,6 @@ struct cb_compound_hdr_res {
};
struct cb_getattrargs {
- struct sockaddr *addr;
struct nfs_fh fh;
uint32_t bitmap[2];
};
@@ -76,7 +75,6 @@ struct cb_getattrres {
};
struct cb_recallargs {
- struct sockaddr *addr;
struct nfs_fh fh;
nfs4_stateid stateid;
uint32_t truncate;
@@ -119,9 +117,6 @@ extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_sequenceres *res,
struct cb_process_state *cps);
-extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
#define RCA4_TYPE_MASK_DIR_DLG 2
@@ -134,7 +129,6 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_ALL 0xf31f
struct cb_recallanyargs {
- struct sockaddr *craa_addr;
uint32_t craa_objs_to_keep;
uint32_t craa_type_mask;
};
@@ -144,7 +138,6 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
struct cb_process_state *cps);
struct cb_recallslotargs {
- struct sockaddr *crsa_addr;
uint32_t crsa_target_highest_slotid;
};
extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
@@ -152,7 +145,6 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
struct cb_process_state *cps);
struct cb_layoutrecallargs {
- struct sockaddr *cbl_addr;
uint32_t cbl_recall_type;
uint32_t cbl_layout_type;
uint32_t cbl_layoutchanged;
@@ -196,9 +188,6 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
#if IS_ENABLED(CONFIG_NFS_V4)
extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
extern void nfs_callback_down(int minorversion, struct net *net);
-extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#endif /* CONFIG_NFS_V4 */
/*
* nfs41: Callbacks are expected to not cause substantial latency,
@@ -209,6 +198,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#define NFS41_BC_MAX_CALLBACKS 1
extern unsigned int nfs_callback_set_tcpport;
-extern unsigned short nfs_callback_tcpport;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 29e3c1b..f0939d0 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -17,9 +17,7 @@
#include "nfs4session.h"
#include "nfs4trace.h"
-#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-#endif
__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
struct cb_getattrres *res,
@@ -40,8 +38,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+ -ntohl(res->status));
goto out;
+ }
nfsi = NFS_I(inode);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +61,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
res->status = 0;
out_iput:
rcu_read_unlock();
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
@@ -81,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
res = htonl(NFS4ERR_BADHANDLE);
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+ &args->stateid, -ntohl(res));
goto out;
+ }
/* Set up a helper thread to actually return the delegation */
switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
case 0:
@@ -94,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
default:
res = htonl(NFS4ERR_RESOURCE);
}
- trace_nfs4_recall_delegation(inode, -ntohl(res));
+ trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+ &args->stateid, -ntohl(res));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -158,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
return lo;
}
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new)
+{
+ u32 oldseq, newseq;
+
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ newseq = be32_to_cpu(new->seqid);
+
+ if (newseq > oldseq + 1)
+ return false;
+ return true;
+}
+
static u32 initiate_file_draining(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
@@ -167,33 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp,
LIST_HEAD(free_me_list);
lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
- if (!lo)
+ if (!lo) {
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
+ &args->cbl_stateid, -rv);
goto out;
+ }
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
+ if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
+ rv = NFS4ERR_DELAY;
+ goto unlock;
+ }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
spin_unlock(&ino->i_lock);
pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range)) {
+ /*
+ * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+ */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
rv = NFS4ERR_DELAY;
goto unlock;
}
+ if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+ &args->cbl_range)) {
+ rv = NFS4_OK;
+ goto unlock;
+ }
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
+ pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(ino, 0);
pnfs_put_layout_hdr(lo);
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+ &args->cbl_stateid, -rv);
iput(ino);
out:
return rv;
@@ -554,7 +595,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
status = htonl(NFS4_OK);
nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
- nfs41_server_notify_target_slotid_update(cps->clp);
+ nfs41_notify_server(cps->clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 6b1697a..646cdac 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -18,19 +18,21 @@
#include "internal.h"
#include "nfs4session.h"
-#define CB_OP_TAGLEN_MAXSZ (512)
-#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
-#define CB_OP_GETATTR_BITMAP_MAXSZ (4)
-#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- CB_OP_GETATTR_BITMAP_MAXSZ + \
- 2 + 2 + 3 + 3)
-#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ /* change, size, ctime, mtime */\
+ (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- 4 + 1 + 3)
+ NFS4_MAX_SESSIONID_LEN + \
+ (1 + 3) * 4) // seqid, 3 slotids
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#endif /* CONFIG_NFS_V4_1 */
@@ -157,7 +159,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(status != 0))
return status;
/* We do not like overly long tags! */
- if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) {
printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
__func__, hdr->taglen);
return htonl(NFS4ERR_RESOURCE);
@@ -198,7 +200,6 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
goto out;
- args->addr = svc_addr(rqstp);
status = decode_bitmap(xdr, args->bitmap);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
@@ -210,7 +211,6 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
__be32 *p;
__be32 status;
- args->addr = svc_addr(rqstp);
status = decode_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
goto out;
@@ -236,7 +236,6 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- args->cbl_addr = svc_addr(rqstp);
p = read_buf(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
@@ -383,13 +382,12 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = read_buf(xdr, len);
+ p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(sid->data, p, len);
+ memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -500,7 +498,6 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- args->craa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -519,7 +516,6 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
{
__be32 *p;
- args->crsa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -684,13 +680,12 @@ static __be32 encode_sessionid(struct xdr_stream *xdr,
const struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = xdr_reserve_space(xdr, len);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(p, sid, len);
+ memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -704,7 +699,9 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
if (unlikely(status != 0))
goto out;
- encode_sessionid(xdr, &res->csr_sessionid);
+ status = encode_sessionid(xdr, &res->csr_sessionid);
+ if (status)
+ goto out;
p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a90c9b..d6d5d2a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -20,6 +20,7 @@
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs_put_client);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
- return 0;
- else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
- return 1;
-}
-#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
- (sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
- (sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_match_ipaddr4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_match_ipaddr6(sa1, sa2);
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_cmp_ip4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_cmp_ip6(sa1, sa2);
- }
- return 0;
-}
-
/*
* Find an nfs_client on the list that matches the initialisation data
* that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
if (clp->cl_minorversion != data->minorversion)
continue;
/* Match the full socket address */
- if (!nfs_sockaddr_cmp(sap, clap))
+ if (!rpc_cmp_addr_port(sap, clap))
continue;
atomic_inc(&clp->cl_count);
@@ -873,6 +764,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->time_delta = fsinfo->time_delta;
+ server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 029d688..5166adc 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -113,7 +113,8 @@ out:
return status;
}
-static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_opens(struct inode *inode,
+ const nfs4_stateid *stateid, fmode_t type)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_context *ctx;
@@ -140,7 +141,7 @@ again:
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
- err = nfs4_open_delegation_recall(ctx, state, stateid);
+ err = nfs4_open_delegation_recall(ctx, state, stateid, type);
if (!err)
err = nfs_delegation_claim_locks(ctx, state, stateid);
if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
@@ -175,7 +176,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
if (delegation->inode != NULL) {
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
- delegation->maxsize = res->maxsize;
+ delegation->pagemod_limit = res->pagemod_limit;
oldcred = delegation->cred;
delegation->cred = get_rpccred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +338,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
- delegation->maxsize = res->maxsize;
+ delegation->pagemod_limit = res->pagemod_limit;
delegation->change_attr = inode->i_version;
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
@@ -411,7 +412,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
do {
if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
break;
- err = nfs_delegation_claim_opens(inode, &delegation->stateid);
+ err = nfs_delegation_claim_opens(inode, &delegation->stateid,
+ delegation->type);
if (!issync || err != -EAGAIN)
break;
/*
@@ -719,14 +721,12 @@ int nfs_async_inode_return_delegation(struct inode *inode,
struct nfs_client *clp = server->nfs_client;
struct nfs_delegation *delegation;
- filemap_flush(inode->i_mapping);
-
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation == NULL)
goto out_enoent;
-
- if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+ if (stateid != NULL &&
+ !clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
goto out_enoent;
nfs_mark_return_delegation(server, delegation);
rcu_read_unlock();
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
rcu_read_unlock();
return ret;
}
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ bool ret = true;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+ goto out;
+ if (nfsi->nrequests < delegation->pagemod_limit)
+ ret = false;
+out:
+ rcu_read_unlock();
+ return ret;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e3c20a3..333063e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -18,7 +18,7 @@ struct nfs_delegation {
struct inode *inode;
nfs4_stateid stateid;
fmode_t type;
- loff_t maxsize;
+ unsigned long pagemod_limit;
__u64 change_attr;
unsigned long flags;
spinlock_t lock;
@@ -54,13 +54,14 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
/* NFSv4 delegation-related procedures */
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
-int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 547308a..c82a212 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -583,26 +583,19 @@ out_nopages:
}
static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++)
put_page(pages[i]);
}
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
- unsigned int npages)
-{
- nfs_readdir_free_pagearray(pages, npages);
-}
-
/*
* nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pagearray
*/
static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
@@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
return 0;
out_freepages:
- nfs_readdir_free_pagearray(pages, i);
+ nfs_readdir_free_pages(pages, i);
return -ENOMEM;
}
@@ -623,7 +616,6 @@ static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
struct page *pages[NFS_MAX_READDIR_PAGES];
- void *pages_ptr = NULL;
struct nfs_entry entry;
struct file *file = desc->file;
struct nfs_cache_array *array;
@@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
- status = nfs_readdir_large_page(pages, array_size);
+ status = nfs_readdir_alloc_pages(pages, array_size);
if (status < 0)
goto out_release_array;
do {
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
}
} while (array->eof_index < 0);
- nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+ nfs_readdir_free_pages(pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
out_label_free:
@@ -1722,9 +1714,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
@@ -1905,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_USER);
if (!page)
return -ENOMEM;
- kaddr = kmap_atomic(page);
+ kaddr = page_address(page);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
- kunmap_atomic(kaddr);
trace_nfs_symlink_enter(dir, dentry);
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
@@ -2443,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
}
EXPORT_SYMBOL_GPL(nfs_may_open);
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ if (mask & MAY_NOT_BLOCK)
+ ret = nfs_revalidate_inode_rcu(server, inode);
+ else
+ ret = nfs_revalidate_inode(server, inode);
+ if (ret == 0 && !execute_ok(inode))
+ ret = -EACCES;
+ return ret;
+}
+
int nfs_permission(struct inode *inode, int mask)
{
struct rpc_cred *cred;
@@ -2460,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask)
case S_IFLNK:
goto out;
case S_IFREG:
+ if ((mask & MAY_OPEN) &&
+ nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+ return 0;
break;
case S_IFDIR:
/*
@@ -2492,8 +2497,8 @@ force_lookup:
res = PTR_ERR(cred);
}
out:
- if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
- res = -EACCES;
+ if (!res && (mask & MAY_EXEC))
+ res = nfs_execute_ok(inode, mask);
dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 38678d9..7ab7ec9 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
-void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
-{
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-}
-EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
-
static void
nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
{
@@ -166,8 +160,11 @@ nfs_direct_select_verf(struct nfs_direct_req *dreq,
struct nfs_writeverf *verfp = &dreq->verf;
#ifdef CONFIG_NFS_V4_1
- if (ds_clp) {
- /* pNFS is in use, use the DS verf */
+ /*
+ * pNFS is in use, use the DS verf except commit_through_mds is set
+ * for layout segment where nbuckets is zero.
+ */
+ if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
else
@@ -667,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
req = nfs_list_entry(reqs.next);
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ list_splice_init(&reqs, &failed);
+ goto out_failed;
+ }
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
@@ -674,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_list_add_request(req, &failed);
spin_lock(cinfo.lock);
dreq->flags = 0;
- dreq->error = -EIO;
+ if (desc.pg_error < 0)
+ dreq->error = desc.pg_error;
+ else
+ dreq->error = -EIO;
spin_unlock(cinfo.lock);
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
+out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -724,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_direct_write_complete(dreq, data->inode);
}
-static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
- /* There is no lock to clear */
+ struct nfs_direct_req *dreq = cinfo->dreq;
+
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_mark_request_commit(req, NULL, cinfo, 0);
}
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
.completion = nfs_direct_commit_complete,
- .error_cleanup = nfs_direct_error_cleanup,
+ .resched_write = nfs_direct_resched_write,
};
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -836,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
}
}
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+
+ spin_lock(&dreq->lock);
+ if (dreq->error == 0) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ /* fake unstable write to let common nfs resend pages */
+ hdr->verf.committed = NFS_UNSTABLE;
+ hdr->good_bytes = hdr->args.count;
+ }
+ spin_unlock(&dreq->lock);
+}
+
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.error_cleanup = nfs_write_sync_pgio_error,
.init_hdr = nfs_direct_pgio_init,
.completion = nfs_direct_write_completion,
+ .reschedule_io = nfs_direct_write_reschedule_io,
};
@@ -897,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
nfs_lock_request(req);
req->wb_index = pos >> PAGE_SHIFT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1e..4ef8f5a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
dprintk("NFS: release(%pD2)\n", filp);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return nfs_release(inode, filp);
+ nfs_file_clear_open_context(filp);
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_file_release);
@@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
/*
* Flush all dirty pages, and check for write errors.
*/
-int
+static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
@@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
- /*
- * If we're holding a write delegation, then just start the i/o
- * but don't wait for completion (or send a commit).
- */
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
- return filemap_fdatawrite(file->f_mapping);
-
/* Flush writes to the server and return any errors */
return vfs_fsync(file, 0);
}
-EXPORT_SYMBOL_GPL(nfs_file_flush);
ssize_t
nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
@@ -480,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Always try to initiate a 'commit' if relevant, but only
- * wait for it if __GFP_WAIT is set. Even then, only wait 1
- * second and only if the 'bdi' is not congested.
+ * wait for it if the caller allows blocking. Even then,
+ * only wait 1 second and only if the 'bdi' is not congested.
* Waiting indefinitely can cause deadlocks when the NFS
* server is on this machine, when a new TCP connection is
* needed and in other rare cases. There is no particular
@@ -491,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
if (mapping) {
struct nfs_server *nfss = NFS_SERVER(mapping->host);
nfs_commit_inode(mapping->host, 0);
- if ((gfp & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(gfp) &&
!bdi_write_congested(&nfss->backing_dev_info)) {
wait_on_page_bit_killable_timeout(page, PG_private,
HZ);
@@ -521,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page,
* so it will not block due to pages that will shortly be freeable.
*/
nfsi = NFS_I(mapping->host);
- if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+ if (atomic_read(&nfsi->commit_info.rpcs_out)) {
*writeback = true;
return;
}
@@ -552,7 +545,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_page(inode, page);
+ return nfs_wb_launder_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode)
{
struct nfs_open_context *ctx;
- if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
- return 1;
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
nfs_ctx_key_to_expire(ctx))
@@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (result > 0)
written = result;
- /* Return error values for O_DSYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(file, inode)) {
+ /* Return error values */
+ if (result >= 0 && nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
@@ -747,18 +738,7 @@ out_noconflict:
static int do_vfs_lock(struct file *file, struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_file_wait(file, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_file_wait(file, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_file_wait(file, fl);
}
static int
@@ -776,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
if (!IS_ERR(l_ctx)) {
- status = nfs_iocounter_wait(&l_ctx->io_count);
+ status = nfs_iocounter_wait(l_ctx);
nfs_put_lock_context(l_ctx);
if (status < 0)
return status;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index b34f2e2..bb1f4e7 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
pnfs_error_mark_layout_for_return(inode, lseg);
+ pnfs_set_lo_fail(lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -629,23 +630,18 @@ out_put:
goto out;
}
-static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
{
int i;
- for (i = 0; i < fl->num_fh; i++) {
- if (!fl->fh_array[i])
- break;
- kfree(fl->fh_array[i]);
+ if (fl->fh_array) {
+ for (i = 0; i < fl->num_fh; i++) {
+ if (!fl->fh_array[i])
+ break;
+ kfree(fl->fh_array[i]);
+ }
+ kfree(fl->fh_array);
}
- kfree(fl->fh_array);
- fl->fh_array = NULL;
-}
-
-static void
-_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
-{
- filelayout_free_fh_array(fl);
kfree(fl);
}
@@ -716,21 +712,21 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
/* Do we want to use a mempool here? */
fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags);
if (!fl->fh_array[i])
- goto out_err_free;
+ goto out_err;
p = xdr_inline_decode(&stream, 4);
if (unlikely(!p))
- goto out_err_free;
+ goto out_err;
fl->fh_array[i]->size = be32_to_cpup(p++);
if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
printk(KERN_ERR "NFS: Too big fh %d received %d\n",
i, fl->fh_array[i]->size);
- goto out_err_free;
+ goto out_err;
}
p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
if (unlikely(!p))
- goto out_err_free;
+ goto out_err;
memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
dprintk("DEBUG: %s: fh len %d\n", __func__,
fl->fh_array[i]->size);
@@ -739,8 +735,6 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
__free_page(scratch);
return 0;
-out_err_free:
- filelayout_free_fh_array(fl);
out_err:
__free_page(scratch);
return -EIO;
@@ -890,13 +884,19 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_read_mds(pgio);
@@ -909,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b3289d7..6594e9f 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
ffl = kzalloc(sizeof(*ffl), gfp_flags);
if (ffl) {
INIT_LIST_HEAD(&ffl->error_list);
+ INIT_LIST_HEAD(&ffl->mirrors);
return &ffl->generic_hdr;
} else
return NULL;
@@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
return 0;
}
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ int i, j;
+
+ if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ return false;
+ for (i = 0; i < m1->fh_versions_cnt; i++) {
+ bool found_fh = false;
+ for (j = 0; j < m2->fh_versions_cnt; j++) {
+ if (nfs_compare_fh(&m1->fh_versions[i],
+ &m2->fh_versions[j]) == 0) {
+ found_fh = true;
+ break;
+ }
+ }
+ if (!found_fh)
+ return false;
+ }
+ return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *pos;
+ struct inode *inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+ if (mirror->mirror_ds != pos->mirror_ds)
+ continue;
+ if (!ff_mirror_match_fh(mirror, pos))
+ continue;
+ if (atomic_inc_not_zero(&pos->ref)) {
+ spin_unlock(&inode->i_lock);
+ return pos;
+ }
+ }
+ list_add(&mirror->mirrors, &ff_layout->mirrors);
+ mirror->layout = lo;
+ spin_unlock(&inode->i_lock);
+ return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ struct inode *inode;
+ if (mirror->layout == NULL)
+ return;
+ inode = mirror->layout->plh_inode;
+ spin_lock(&inode->i_lock);
+ list_del(&mirror->mirrors);
+ spin_unlock(&inode->i_lock);
+ mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+
+ mirror = kzalloc(sizeof(*mirror), gfp_flags);
+ if (mirror != NULL) {
+ spin_lock_init(&mirror->lock);
+ atomic_set(&mirror->ref, 1);
+ INIT_LIST_HEAD(&mirror->mirrors);
+ }
+ return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ ff_layout_remove_mirror(mirror);
+ kfree(mirror->fh_versions);
+ if (mirror->cred)
+ put_rpccred(mirror->cred);
+ nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+ kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+ ff_layout_free_mirror(mirror);
+}
+
static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
int i;
@@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
/* normally mirror_ds is freed in
* .free_deviceid_node but we still do it here
* for .alloc_lseg error path */
- if (fls->mirror_array[i]) {
- kfree(fls->mirror_array[i]->fh_versions);
- nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
- kfree(fls->mirror_array[i]);
- }
+ ff_layout_put_mirror(fls->mirror_array[i]);
}
kfree(fls->mirror_array);
fls->mirror_array = NULL;
@@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
}
}
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1, end2;
+
+ if (l1->iomode != l2->iomode)
+ return l1->iomode != IOMODE_READ;
+ end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+ end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+ if (end1 < l2->offset)
+ return false;
+ if (end2 < l1->offset)
+ return true;
+ return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+ struct pnfs_layout_segment *old)
+{
+ u64 new_end, old_end;
+
+ if (new->pls_range.iomode != old->pls_range.iomode)
+ return false;
+ old_end = pnfs_calc_offset_end(old->pls_range.offset,
+ old->pls_range.length);
+ if (old_end < new->pls_range.offset)
+ return false;
+ new_end = pnfs_calc_offset_end(new->pls_range.offset,
+ new->pls_range.length);
+ if (new_end < old->pls_range.offset)
+ return false;
+
+ /* Mergeable: copy info from 'old' to 'new' */
+ if (new_end < old_end)
+ new_end = old_end;
+ if (new->pls_range.offset < old->pls_range.offset)
+ new->pls_range.offset = old->pls_range.offset;
+ new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+ new_end);
+ if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+ set_bit(NFS_LSEG_ROC, &new->pls_flags);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+ return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ ff_lseg_range_is_after,
+ ff_lseg_merge,
+ free_me);
+}
+
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
int i, j;
@@ -194,6 +339,19 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
}
}
+static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
+{
+ struct nfs4_deviceid_node *node;
+ int i;
+
+ if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
+ return;
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ node = &fls->mirror_array[i]->mirror_ds->id_node;
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ }
+}
+
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_layoutget_res *lgr,
@@ -246,6 +404,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
struct nfs4_deviceid_node *idnode;
u32 ds_count;
@@ -262,17 +421,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (ds_count != 1)
goto out_err_free;
- fls->mirror_array[i] =
- kzalloc(sizeof(struct nfs4_ff_layout_mirror),
- gfp_flags);
+ fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
if (fls->mirror_array[i] == NULL) {
rc = -ENOMEM;
goto out_err_free;
}
- spin_lock_init(&fls->mirror_array[i]->lock);
fls->mirror_array[i]->ds_count = ds_count;
- fls->mirror_array[i]->lseg = &fls->generic_hdr;
/* deviceid */
rc = decode_deviceid(&stream, &devid);
@@ -338,19 +493,34 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (rc)
goto out_err_free;
+ mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+ if (mirror != fls->mirror_array[i]) {
+ ff_layout_free_mirror(fls->mirror_array[i]);
+ fls->mirror_array[i] = mirror;
+ }
+
dprintk("%s: uid %d gid %d\n", __func__,
fls->mirror_array[i]->uid,
fls->mirror_array[i]->gid);
}
p = xdr_inline_decode(&stream, 4);
- if (p)
- fls->flags = be32_to_cpup(p);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ for (i=0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+
+out_sort_mirrors:
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
goto out_err_free;
+ ff_layout_mark_devices_valid(fls);
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
@@ -379,21 +549,9 @@ static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
- int i;
dprintk("--> %s\n", __func__);
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- if (fls->mirror_array[i]) {
- nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
- fls->mirror_array[i]->mirror_ds = NULL;
- if (fls->mirror_array[i]->cred) {
- put_rpccred(fls->mirror_array[i]->cred);
- fls->mirror_array[i]->cred = NULL;
- }
- }
- }
-
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_flexfile_layout *ffl;
struct inode *inode;
@@ -419,48 +577,46 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
}
static void
-nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
/* first IO request? */
if (atomic_inc_return(&timer->n_ops) == 1) {
- timer->start_time = ktime_get();
+ timer->start_time = now;
}
}
static ktime_t
-nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
- ktime_t start, now;
+ ktime_t start;
if (atomic_dec_return(&timer->n_ops) < 0)
WARN_ON_ONCE(1);
- now = ktime_get();
start = timer->start_time;
timer->start_time = now;
return ktime_sub(now, start);
}
-static ktime_t
-nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
-{
- return ktime_sub(ktime_get(), task->tk_start);
-}
-
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
- struct nfs4_ff_layoutstat *layoutstat)
+ struct nfs4_ff_layoutstat *layoutstat,
+ ktime_t now)
{
static const ktime_t notime = {0};
- ktime_t now = ktime_get();
+ s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
- nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+ nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
if (ktime_equal(mirror->start_time, notime))
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
+ report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
- FF_LAYOUTSTATS_REPORT_INTERVAL) {
+ report_interval) {
mirror->last_report_time = now;
return true;
}
@@ -482,35 +638,39 @@ static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
__u64 requested,
__u64 completed,
- ktime_t time_completed)
+ ktime_t time_completed,
+ ktime_t time_started)
{
struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+ ktime_t completion_time = ktime_sub(time_completed, time_started);
ktime_t timer;
iostat->ops_completed++;
iostat->bytes_completed += completed;
iostat->bytes_not_delivered += requested - completed;
- timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+ timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
iostat->total_busy_time =
ktime_add(iostat->total_busy_time, timer);
iostat->aggregate_completion_time =
- ktime_add(iostat->aggregate_completion_time, time_completed);
+ ktime_add(iostat->aggregate_completion_time,
+ completion_time);
}
static void
-nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
- __u64 requested)
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+ report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+ pnfs_report_layoutstat(inode, GFP_KERNEL);
}
static void
@@ -522,23 +682,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
requested, completed,
- nfs4_ff_layout_calc_completion_time(task));
+ ktime_get(), task->tk_start);
spin_unlock(&mirror->lock);
}
static void
-nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
- __u64 requested)
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+ report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+ pnfs_report_layoutstat(inode, GFP_NOIO);
}
static void
@@ -553,8 +714,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
- requested, completed,
- nfs4_ff_layout_calc_completion_time(task));
+ requested, completed, ktime_get(), task->tk_start);
spin_unlock(&mirror->lock);
}
@@ -605,17 +765,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx,
int *best_idx)
{
- struct nfs4_ff_layout_segment *fls;
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_pnfs_ds *ds;
int idx;
- fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
/* mirrors are sorted by efficiency */
- for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
if (ds) {
*best_idx = idx;
return ds;
@@ -635,18 +795,24 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
- ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
if (!ds)
goto out_mds;
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -675,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int i;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -717,20 +889,25 @@ static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
if (pgio->pg_lseg)
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
+out:
return 1;
}
@@ -764,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
hdr->args.count,
(unsigned long long)hdr->args.offset);
- if (!hdr->dreq) {
- struct nfs_open_context *ctx;
-
- ctx = nfs_list_entry(hdr->pages.next)->wb_context;
- set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
- hdr->completion_ops->error_cleanup(&hdr->pages);
- } else {
- nfs_direct_set_resched_writes(hdr->dreq);
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.count;
- }
+ hdr->completion_ops->reschedule_io(hdr);
return;
}
@@ -901,7 +1067,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
- if (ff_layout_has_available_ds(lseg))
+ if (ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -931,20 +1098,28 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
if (task->tk_status >= 0)
return 0;
- if (task->tk_status != -EJUKEBOX) {
+ switch (task->tk_status) {
+ /* File access problems. Don't mark the device as unavailable */
+ case -EACCES:
+ case -ESTALE:
+ case -EISDIR:
+ case -EBADHANDLE:
+ case -ELOOP:
+ case -ENOSPC:
+ break;
+ case -EJUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
- if (ff_layout_has_available_ds(lseg))
- return -NFS4ERR_RESET_TO_PNFS;
- else
- return -NFS4ERR_RESET_TO_MDS;
}
-
- if (task->tk_status == -EJUKEBOX)
- nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ /* FIXME: Need to prevent infinite looping here. */
+ return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
task->tk_status = 0;
- rpc_restart_call(task);
+ rpc_restart_call_prepare(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
return -EAGAIN;
}
@@ -972,54 +1147,89 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
int idx, u64 offset, u64 length,
- u32 status, int opnum)
+ u32 status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
int err;
+ if (status == 0) {
+ switch (error) {
+ case -ETIMEDOUT:
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ case -EOPNOTSUPP:
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ case -EPIPE:
+ case -EPERM:
+ status = NFS4ERR_NXIO;
+ break;
+ case -EACCES:
+ status = NFS4ERR_ACCESS;
+ break;
+ default:
+ return;
+ }
+ }
+
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ return;
+ default:
+ break;
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
GFP_NOIO);
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
/* NFS_PROTO call done callback routines */
-
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_read(hdr, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
- hdr->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && hdr->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
hdr->args.offset, hdr->args.count,
- hdr->res.op_status, OP_READ);
+ hdr->res.op_status, OP_READ,
+ task->tk_status);
err = ff_layout_async_handle_error(task, hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ if (ff_layout_choose_best_ds_for_read(hdr->lseg,
+ hdr->pgio_mirror_idx + 1,
+ &hdr->pgio_mirror_idx))
+ goto out_eagain;
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- inode = hdr->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, hdr->lseg);
ff_layout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
- return -EAGAIN;
+ goto out_eagain;
}
return 0;
+out_eagain:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
}
static bool
@@ -1058,13 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
return ff_layout_test_devid_unavailable(node);
}
-static int ff_layout_read_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- nfs4_ff_layout_stat_io_start_read(
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count);
+ hdr->args.count,
+ task->tk_start);
+}
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ hdr->res.count);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1080,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
}
hdr->pgio_done_cb = ff_layout_read_done_cb;
+ ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1138,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1156,42 +1381,43 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_read_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}
+static void ff_layout_read_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_write(hdr, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
- hdr->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && hdr->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
hdr->args.offset, hdr->args.count,
- hdr->res.op_status, OP_WRITE);
+ hdr->res.op_status, OP_WRITE,
+ task->tk_status);
err = ff_layout_async_handle_error(task, hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ ff_layout_reset_write(hdr, true);
+ return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- inode = hdr->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, hdr->lseg);
- if (err == -NFS4ERR_RESET_TO_PNFS) {
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
- ff_layout_reset_write(hdr, true);
- } else {
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
- ff_layout_reset_write(hdr, false);
- }
+ ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -1199,34 +1425,33 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
hdr->res.verf->committed == NFS_DATA_SYNC)
ff_layout_set_layoutcommit(hdr);
+ /* zero out fattr since we don't care DS attr at all */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
return 0;
}
static int ff_layout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_commit_ds(data, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
- data->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && data->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
data->args.offset, data->args.count,
- data->res.op_status, OP_COMMIT);
+ data->res.op_status, OP_COMMIT,
+ task->tk_status);
err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
data->lseg, data->ds_commit_index);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- inode = data->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, data->lseg);
- if (err == -NFS4ERR_RESET_TO_PNFS)
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1241,13 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return 0;
}
-static int ff_layout_write_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_start_write(hdr->inode,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
- nfs4_ff_layout_stat_io_start_write(
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_write(task,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count);
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+}
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1264,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EAGAIN;
}
+ ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1299,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1318,16 +1557,51 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_write_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+ nfs4_ff_layout_stat_io_start_write(cdata->inode,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ 0, task->tk_start);
+}
+
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
- nfs4_ff_layout_stat_io_start_write(
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+ nfs4_ff_layout_stat_io_end_write(task,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- 0);
+ count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ ff_layout_commit_record_layoutstats_start(task, cdata);
}
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
@@ -1350,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
- struct nfs_commit_data *cdata = data;
- struct nfs_page *req;
- __u64 count = 0;
-
- if (task->tk_status == 0) {
- list_for_each_entry(req, &cdata->pages, wb_list)
- count += req->wb_bytes;
- }
-
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- count, count, NFS_FILE_SYNC);
-
pnfs_generic_write_commit_done(task, data);
}
@@ -1370,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
+ ff_layout_commit_record_layoutstats_done(task, cdata);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static enum pnfs_try_status
@@ -1842,53 +2112,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
*start = cpu_to_be32((xdr->p - start - 1) * 4);
}
-static bool
+static int
ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
- struct pnfs_layout_segment *pls,
- int *dev_count, int dev_limit)
+ struct pnfs_layout_hdr *lo,
+ int dev_limit)
{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *dev;
struct nfs42_layoutstat_devinfo *devinfo;
- int i;
+ int i = 0;
- for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
- if (*dev_count >= dev_limit)
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (i >= dev_limit)
break;
- mirror = FF_LAYOUT_COMP(pls, i);
- if (!mirror || !mirror->mirror_ds)
+ if (!mirror->mirror_ds)
continue;
- dev = FF_LAYOUT_DEVID_NODE(pls, i);
- devinfo = &args->devinfo[*dev_count];
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!atomic_inc_not_zero(&mirror->ref))
+ continue;
+ dev = &mirror->mirror_ds->id_node;
+ devinfo = &args->devinfo[i];
memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
- devinfo->offset = pls->pls_range.offset;
- devinfo->length = pls->pls_range.length;
- /* well, we don't really know if IO is continuous or not! */
- devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
- devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+ devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
devinfo->layout_type = LAYOUT_FLEX_FILES;
devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
devinfo->layout_private = mirror;
- /* lseg refcount put in cleanup_layoutstats */
- pnfs_get_lseg(pls);
- ++(*dev_count);
+ i++;
}
-
- return *dev_count < dev_limit;
+ return i;
}
static int
ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
- struct pnfs_layout_segment *pls;
+ struct nfs4_flexfile_layout *ff_layout;
+ struct nfs4_ff_layout_mirror *mirror;
int dev_count = 0;
spin_lock(&args->inode->i_lock);
- list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
- dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+ ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (atomic_read(&mirror->ref) != 0)
+ dev_count ++;
}
spin_unlock(&args->inode->i_lock);
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1897,20 +2169,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
dev_count = PNFS_LAYOUTSTATS_MAXDEV;
}
- args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+ args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
if (!args->devinfo)
return -ENOMEM;
- dev_count = 0;
spin_lock(&args->inode->i_lock);
- list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
- if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
- PNFS_LAYOUTSTATS_MAXDEV)) {
- break;
- }
- }
+ args->num_dev = ff_layout_mirror_prepare_stats(args,
+ &ff_layout->generic_hdr, dev_count);
spin_unlock(&args->inode->i_lock);
- args->num_dev = dev_count;
return 0;
}
@@ -1924,7 +2190,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
for (i = 0; i < data->args.num_dev; i++) {
mirror = data->args.devinfo[i].layout_private;
data->args.devinfo[i].layout_private = NULL;
- pnfs_put_lseg(mirror->lseg);
+ ff_layout_put_mirror(mirror);
}
}
@@ -1936,6 +2202,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.free_layout_hdr = ff_layout_free_layout_hdr,
.alloc_lseg = ff_layout_alloc_lseg,
.free_lseg = ff_layout_free_lseg,
+ .add_lseg = ff_layout_add_lseg,
.pg_read_ops = &ff_layout_pg_read_ops,
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index f92f9a0..dd353bb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,6 +10,7 @@
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
#include "../pnfs.h"
@@ -67,7 +68,8 @@ struct nfs4_ff_layoutstat {
};
struct nfs4_ff_layout_mirror {
- struct pnfs_layout_segment *lseg; /* back pointer */
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
u32 ds_count;
u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
@@ -77,11 +79,13 @@ struct nfs4_ff_layout_mirror {
u32 uid;
u32 gid;
struct rpc_cred *cred;
+ atomic_t ref;
spinlock_t lock;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
ktime_t last_report_time;
+ u32 report_interval;
};
struct nfs4_ff_layout_segment {
@@ -95,6 +99,7 @@ struct nfs4_ff_layout_segment {
struct nfs4_flexfile_layout {
struct pnfs_layout_hdr generic_hdr;
struct pnfs_ds_commit_info commit_info;
+ struct list_head mirrors;
struct list_head error_list; /* nfs4_ff_layout_ds_err */
};
@@ -143,6 +148,12 @@ FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
}
static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+static inline bool
ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
{
return nfs4_test_deviceid_unavailable(node);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f13e196..bd03275 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -172,6 +172,32 @@ out_err:
return NULL;
}
+static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
+ struct nfs4_deviceid_node *devid)
+{
+ nfs4_mark_deviceid_unavailable(devid);
+ if (!ff_layout_has_available_ds(lseg))
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+}
+
+static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL || mirror->mirror_ds == NULL) {
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+ return false;
+ }
+ if (mirror->mirror_ds->ds == NULL) {
+ struct nfs4_deviceid_node *devid;
+ devid = &mirror->mirror_ds->id_node;
+ ff_layout_mark_devid_invalid(lseg, devid);
+ return false;
+ }
+ return true;
+}
+
static u64
end_offset(u64 start, u64 len)
{
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
struct nfs_fh *fh = NULL;
- struct nfs4_deviceid_node *devid;
- if (mirror == NULL || mirror->mirror_ds == NULL ||
- mirror->mirror_ds->ds == NULL) {
- printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+ if (!ff_layout_mirror_valid(lseg, mirror)) {
+ pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
__func__, mirror_idx);
- if (mirror && mirror->mirror_ds) {
- devid = &mirror->mirror_ds->id_node;
- pnfs_generic_mark_devid_invalid(devid);
- }
goto out;
}
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
unsigned int max_payload;
rpc_authflavor_t flavor;
- if (mirror == NULL || mirror->mirror_ds == NULL ||
- mirror->mirror_ds->ds == NULL) {
- printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+ if (!ff_layout_mirror_valid(lseg, mirror)) {
+ pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
- if (mirror && mirror->mirror_ds) {
- devid = &mirror->mirror_ds->id_node;
- pnfs_generic_mark_devid_invalid(devid);
- }
goto out;
}
@@ -414,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (fail_return) {
- pnfs_error_mark_layout_for_return(ino, lseg);
- if (ff_layout_has_available_ds(lseg))
- pnfs_set_retry_layoutget(lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(lseg->pls_layout);
-
- } else {
+ if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&lseg->pls_layout->plh_flags);
- else {
+ else
pnfs_error_mark_layout_for_return(ino, lseg);
- pnfs_clear_retry_layoutget(lseg->pls_layout);
- }
- }
+ } else
+ pnfs_error_mark_layout_for_return(ino, lseg);
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
@@ -500,16 +507,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
range->offset, range->length))
continue;
/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
- * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+ * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+ * + status(4) + opnum(4)
*/
p = xdr_reserve_space(xdr,
- 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+ 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
if (unlikely(!p))
return -ENOBUFS;
p = xdr_encode_hyper(p, err->offset);
p = xdr_encode_hyper(p, err->length);
p = xdr_encode_opaque_fixed(p, &err->stateid,
NFS4_STATEID_SIZE);
+ /* Encode 1 error */
+ *p++ = cpu_to_be32(1);
p = xdr_encode_opaque_fixed(p, &err->deviceid,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(err->status);
@@ -525,11 +535,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
return 0;
}
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- int idx;
+ u32 idx;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -543,6 +553,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return false;
}
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *devid;
+ u32 idx;
+
+ for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ if (!mirror || !mirror->mirror_ds)
+ return false;
+ devid = &mirror->mirror_ds->id_node;
+ if (ff_layout_test_devid_unavailable(devid))
+ return false;
+ }
+
+ return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_READ)
+ return ff_read_layout_has_available_ds(lseg);
+ /* Note: RW layout needs all mirrors available */
+ return ff_rw_layout_has_available_ds(lseg);
+}
+
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
"retries a request before it attempts further "
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d2..8e24d88 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
return nfs_fileid_to_ino_t(fattr->fileid);
}
-/**
- * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
- * @word: long word containing the bit lock
- */
-int nfs_wait_bit_killable(struct wait_bit_key *key)
+static int nfs_wait_killable(int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+ return nfs_wait_killable(mode);
+}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+int nfs_wait_atomic_killable(atomic_t *p)
+{
+ return nfs_wait_killable(TASK_KILLABLE);
+}
+
/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
@@ -408,9 +414,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_fop = NULL;
inode->i_flags |= S_AUTOMOUNT;
}
- } else if (S_ISLNK(inode->i_mode))
+ } else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nfs_symlink_inode_operations;
- else
+ inode_nohighmem(inode);
+ } else
init_special_inode(inode, inode->i_mode, fattr->rdev);
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
@@ -504,7 +511,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
- int error = -ENOMEM;
+ int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -513,15 +520,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
- loff_t i_size;
-
BUG_ON(!S_ISREG(inode->i_mode));
- i_size = i_size_read(inode);
- if (attr->ia_size == i_size)
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (error)
+ return error;
+
+ if (attr->ia_size == i_size_read(inode))
attr->ia_valid &= ~ATTR_SIZE;
- else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
- return -ETXTBSY;
}
/* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +542,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
nfs_sync_inode(inode);
fattr = nfs_alloc_fattr();
- if (fattr == NULL)
+ if (fattr == NULL) {
+ error = -ENOMEM;
goto out;
+ }
+
/*
* Return any delegations if we're going to change ACLs
*/
@@ -616,7 +625,10 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
- nfs_update_inode(inode, fattr);
+ if (fattr->valid)
+ nfs_update_inode(inode, fattr);
+ else
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
@@ -694,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
l_ctx->lockowner.l_owner = current->files;
l_ctx->lockowner.l_pid = current->tgid;
INIT_LIST_HEAD(&l_ctx->list);
- nfs_iocounter_init(&l_ctx->io_count);
+ atomic_set(&l_ctx->io_count, 0);
}
static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
@@ -759,11 +771,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
* @ctx: pointer to context
* @is_sync: is this a synchronous close
*
- * always ensure that the attributes are up to date if we're mounted
- * with close-to-open semantics
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
*/
void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
{
+ struct nfs_inode *nfsi;
struct inode *inode;
struct nfs_server *server;
@@ -772,7 +786,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
if (!is_sync)
return;
inode = d_inode(ctx->dentry);
- if (!list_empty(&NFS_I(inode)->open_files))
+ nfsi = NFS_I(inode);
+ if (inode->i_mapping->nrpages == 0)
+ return;
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return;
+ if (!list_empty(&nfsi->open_files))
return;
server = NFS_SERVER(inode);
if (server->flags & NFS_MOUNT_NOCTO)
@@ -844,6 +863,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
}
EXPORT_SYMBOL_GPL(put_nfs_open_context);
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 1);
+}
+
/*
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
@@ -888,18 +912,24 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
return ctx;
}
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
{
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ /*
+ * We fatal error on write before. Try to writeback
+ * every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
spin_unlock(&inode->i_lock);
- __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+ put_nfs_open_context_sync(ctx);
}
}
@@ -919,12 +949,6 @@ int nfs_open(struct inode *inode, struct file *filp)
return 0;
}
-int nfs_release(struct inode *inode, struct file *filp)
-{
- nfs_file_clear_open_context(filp);
- return 0;
-}
-
/*
* This function is called whenever some part of NFS notices that
* the cached attributes have to be refreshed.
@@ -1075,6 +1099,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
|| NFS_STALE(inode);
}
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
/**
* __nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
@@ -1273,13 +1318,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return 0;
}
-static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
- if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
- return 0;
- return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
-}
-
static atomic_long_t nfs_attr_generation_counter;
static unsigned long nfs_read_attr_generation_counter(void)
@@ -1428,7 +1466,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
const struct nfs_inode *nfsi = NFS_I(inode);
return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
- nfs_ctime_need_update(inode, fattr) ||
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
@@ -1491,6 +1528,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
{
unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ /*
+ * Don't revalidate the pagecache if we hold a delegation, but do
+ * force an attribute update
+ */
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
+
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
nfs_set_cache_invalid(inode, invalid);
@@ -1631,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1692,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
inode->i_version = fattr->change_attr;
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity;
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- } else if (server->caps & NFS_CAP_MTIME)
+ } else if (server->caps & NFS_CAP_MTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- } else if (server->caps & NFS_CAP_CTIME)
+ } else if (server->caps & NFS_CAP_CTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1727,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)cur_isize,
(long long)new_isize);
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- else if (server->caps & NFS_CAP_ATIME)
+ else if (server->caps & NFS_CAP_ATIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -1748,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_mode = newmode;
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
- } else if (server->caps & NFS_CAP_MODE)
+ } else if (server->caps & NFS_CAP_MODE) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- } else if (server->caps & NFS_CAP_OWNER)
+ } else if (server->caps & NFS_CAP_OWNER) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ } else if (server->caps & NFS_CAP_OWNER_GROUP) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1786,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
}
- } else if (server->caps & NFS_CAP_NLINK)
+ } else if (server->caps & NFS_CAP_NLINK) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- }
- if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
+ else
+ cache_revalidated = false;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1808,16 +1872,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
- if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
- if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
- nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ if (cache_revalidated) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+ nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ nfsi->attrtimeo <<= 1;
+ if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ }
nfsi->attrtimeo_timestamp = now;
}
/* Set the barrier to be more recent than this fattr */
if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
nfsi->attr_gencount = fattr->gencount;
}
- invalid &= ~NFS_INO_INVALID_ATTR;
+
+ /* Don't declare attrcache up to date if there were no attrs! */
+ if (cache_revalidated)
+ invalid &= ~NFS_INO_INVALID_ATTR;
+
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
@@ -1897,7 +1969,7 @@ static int __init nfs_init_inodecache(void)
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
sizeof(struct nfs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (nfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9b372b8..4e8cc94 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
}
#endif
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
@@ -242,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
-int nfs_iocounter_wait(struct nfs_io_counter *c);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
@@ -256,18 +252,18 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline void nfs_iocounter_init(struct nfs_io_counter *c)
-{
- c->flags = 0;
- atomic_set(&c->io_count, 0);
-}
-
static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
{
WARN_ON_ONCE(desc->pg_mirror_count < 1);
return desc->pg_mirror_count > 1;
}
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+ const struct nfs_open_context *ctx2)
+{
+ return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
/* file.c */
int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
loff_t nfs_file_llseek(struct file *, loff_t, int);
-int nfs_file_flush(struct file *, fl_owner_t);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
@@ -384,7 +379,8 @@ extern int nfs_drop_inode(struct inode *);
extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
-extern int nfs_wait_bit_killable(struct wait_bit_key *key);
+extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p);
/* super.c */
extern const struct super_operations nfs_sops;
@@ -490,6 +486,9 @@ void nfs_retry_commit(struct list_head *page_list,
void nfs_commitdata_release(struct nfs_commit_data *data);
void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+ struct list_head *dst,
+ struct nfs_commit_info *cinfo);
void nfs_request_remove_commit_list(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -521,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
inode_dio_wait(inode);
}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
-extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
* Record the page as unstable and mark its inode as dirty.
*/
static inline
-void nfs_mark_page_unstable(struct page *page)
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
{
- struct inode *inode = page_file_mapping(page)->host;
+ if (!cinfo->dreq) {
+ struct inode *inode = page_file_mapping(page)->host;
- inc_zone_page_state(page, NR_UNSTABLE_NFS);
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ }
}
/*
@@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+ NFS4_STATEID_OTHER_SIZE);
+}
#else
static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return 0;
}
+static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+{
+ return 0;
+}
#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -E2BIG:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 99a4528..09b1900 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,9 +16,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
-#ifdef NFS_DEBUG
-# define NFSDBG_FACILITY NFSDBG_MOUNT
-#endif
+#define NFSDBG_FACILITY NFSDBG_MOUNT
/*
* Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 1ebe2fc..17c0fa1 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
int error;
error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
- POSIX_ACL_XATTR_ACCESS, data, size, &result);
+ XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result);
if (error)
return error;
error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
- POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+ XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result);
if (error)
return error;
return result;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e..267126d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
{
encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
encode_symlinkdata3(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
}
/*
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index ff66ae7..b587ccd 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,7 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index d731bbf..6e81749 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -14,7 +14,7 @@
#include "pnfs.h"
#include "internal.h"
-#define NFSDBG_FACILITY NFSDBG_PNFS
+#define NFSDBG_FACILITY NFSDBG_PROC
static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
fmode_t fmode)
@@ -175,10 +175,12 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
- int err;
+ loff_t err;
do {
err = _nfs42_proc_llseek(filep, offset, whence);
+ if (err >= 0)
+ break;
if (err == -ENOTSUPP)
return -EOPNOTSUPP;
err = nfs4_handle_exception(server, err, &exception);
@@ -202,6 +204,8 @@ static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo;
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -209,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match(&data->args.stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
case -ENOTSUPP:
case -EOPNOTSUPP:
- NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
default:
- dprintk("%s server returns %d\n", __func__, task->tk_status);
+ break;
}
+
+ dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
@@ -269,3 +296,75 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return PTR_ERR(task);
return 0;
}
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+ struct file *dst_f, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
+{
+ struct inode *src_inode = file_inode(src_f);
+ struct inode *dst_inode = file_inode(dst_f);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct nfs42_clone_args args = {
+ .src_fh = NFS_FH(src_inode),
+ .dst_fh = NFS_FH(dst_inode),
+ .src_offset = src_offset,
+ .dst_offset = dst_offset,
+ .count = count,
+ .dst_bitmask = server->cache_consistency_bitmask,
+ };
+ struct nfs42_clone_res res = {
+ .server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+ if (status)
+ return status;
+
+ res.dst_fattr = nfs_alloc_fattr();
+ if (!res.dst_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0)
+ status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+
+ kfree(res.dst_fattr);
+ return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+ loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+ };
+ struct inode *inode = file_inode(src_f);
+ struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+ return -EOPNOTSUPP;
+
+ do {
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
+ dst_offset, count);
+ if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+ return -EOPNOTSUPP;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ return err;
+
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index a6bd27d..0ca482a 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -34,6 +34,12 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
@@ -65,7 +71,20 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
-
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)
static void encode_fallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args)
@@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr,
encode_uint32(xdr, 0);
}
+static void encode_clone(struct xdr_stream *xdr,
+ struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -238,12 +293,16 @@ out_overflow:
return -EIO;
}
-static int decode_layoutstats(struct xdr_stream *xdr,
- struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
{
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -343,7 +402,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
goto out;
WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
for (i = 0; i < res->num_dev; i++) {
- status = decode_layoutstats(xdr, res);
+ status = decode_layoutstats(xdr);
if (status)
goto out;
}
@@ -352,4 +411,39 @@ out:
return status;
}
+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_clone(xdr);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->dst_fattr, res->server);
+
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea3bee9..4afdee4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -183,10 +183,12 @@ struct nfs4_state {
struct nfs4_exception {
- long timeout;
- int retry;
struct nfs4_state *state;
struct inode *inode;
+ long timeout;
+ unsigned char delay : 1,
+ recovering : 1,
+ retry : 1;
};
struct nfs4_state_recovery_ops {
@@ -405,9 +407,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
#else
static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
{
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3aa6a9b..10410e8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
return ret;
idr_preload(GFP_KERNEL);
spin_lock(&nn->nfs_client_lock);
- ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
+ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
if (ret >= 0)
clp->cl_cb_ident = ret;
spin_unlock(&nn->nfs_client_lock);
@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
return false;
/* Match only the IP address, not the port number */
- if (!nfs_sockaddr_match_ipaddr(addr, clap))
- return false;
-
- return true;
+ return rpc_cmp_addr(addr, clap);
}
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dcd39d4..26f9a23 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,9 +4,13 @@
* Copyright (C) 1992 Rick Sladkey
*/
#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
+#include <uapi/linux/btrfs.h> /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */
+#include "delegation.h"
#include "internal.h"
+#include "iostat.h"
#include "fscache.h"
#include "pnfs.h"
@@ -27,7 +31,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct inode *dir;
unsigned openflags = filp->f_flags;
struct iattr attr;
- int opened = 0;
int err;
/*
@@ -66,7 +69,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_sync_inode(inode);
}
- inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
switch (err) {
@@ -100,6 +103,31 @@ out_drop:
goto out_put_ctx;
}
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+
+ dprintk("NFS: flush(%pD2)\n", file);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+ if ((file->f_mode & FMODE_WRITE) == 0)
+ return 0;
+
+ /*
+ * If we're holding a write delegation, then check if we're required
+ * to flush the i/o on close. If not, then just start the i/o now.
+ */
+ if (!nfs4_delegation_flush_on_close(inode))
+ return filemap_fdatawrite(file->f_mapping);
+
+ /* Flush writes to the server and return any errors */
+ return vfs_fsync(file, 0);
+}
+
static int
nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
@@ -166,28 +194,90 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_deallocate(filep, offset, len);
return nfs42_proc_allocate(filep, offset, len);
}
+
+static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, u64 count)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct inode *src_inode = file_inode(src_file);
+ unsigned int bs = server->clone_blksize;
+ bool same_inode = false;
+ int ret;
+
+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out;
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out;
+ }
+
+ if (src_inode == dst_inode)
+ same_inode = true;
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ if (same_inode) {
+ mutex_lock(&src_inode->i_mutex);
+ } else if (dst_inode < src_inode) {
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ if (same_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ } else if (dst_inode < src_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ mutex_unlock(&dst_inode->i_mutex);
+ } else {
+ mutex_unlock(&dst_inode->i_mutex);
+ mutex_unlock(&src_inode->i_mutex);
+ }
+out:
+ return ret;
+}
#endif /* CONFIG_NFS_V4_2 */
const struct file_operations nfs4_file_operations = {
-#ifdef CONFIG_NFS_V4_2
- .llseek = nfs4_file_llseek,
-#else
- .llseek = nfs_file_llseek,
-#endif
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
- .flush = nfs_file_flush,
+ .flush = nfs4_file_flush,
.release = nfs_file_release,
.fsync = nfs4_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
-#ifdef CONFIG_NFS_V4_2
- .fallocate = nfs42_fallocate,
-#endif /* CONFIG_NFS_V4_2 */
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+#ifdef CONFIG_NFS_V4_2
+ .llseek = nfs4_file_llseek,
+ .fallocate = nfs42_fallocate,
+ .clone_file_range = nfs42_clone_file_range,
+#else
+ .llseek = nfs_file_llseek,
+#endif
};
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 535dfc6..5ba22c6 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
.read = user_read,
};
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
{
struct cred *cred;
struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
return ret;
}
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
{
key_revoke(id_resolver_cache->thread_keyring);
unregister_key_type(&key_type_id_resolver);
@@ -297,7 +297,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
{
const struct cred *saved_cred;
struct key *rkey;
- struct user_key_payload *payload;
+ const struct user_key_payload *payload;
ssize_t ret;
saved_cred = override_creds(id_resolver_cache);
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
if (ret < 0)
goto out_up;
- payload = rcu_dereference(rkey->payload.rcudata);
+ payload = user_key_payload(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
kfree(idmap);
}
-int nfs_idmap_init(void)
-{
- return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
- nfs_idmap_quit_keyring();
-}
-
static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
struct idmap_msg *im,
struct rpc_pipe_msg *msg)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3acb1eb..4bfc33a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -78,7 +78,6 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -209,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY,
FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ | FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_open_noattr_bitmap[3] = {
@@ -239,6 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD1_TIME_DELTA
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
};
const u32 nfs4_fs_locations_bitmap[3] = {
@@ -344,13 +347,16 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_do_handle_exception(struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
struct inode *inode = exception->inode;
int ret = errorcode;
+ exception->delay = 0;
+ exception->recovering = 0;
exception->retry = 0;
switch(errorcode) {
case 0:
@@ -359,11 +365,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs4_inode_return_delegation(inode);
- exception->retry = 1;
- return 0;
- }
+ if (inode && nfs_async_inode_return_delegation(inode,
+ NULL) == 0)
+ goto wait_on_recovery;
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -409,11 +413,12 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = -EBUSY;
break;
}
- case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
- ret = nfs4_delay(server->client, &exception->timeout);
- if (ret != 0)
- break;
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ case -NFS4ERR_GRACE:
+ exception->delay = 1;
+ return 0;
+
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_OLD_STATEID:
exception->retry = 1;
@@ -434,14 +439,85 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
/* We failed to handle the error */
return nfs4_map_errors(ret);
wait_on_recovery:
- ret = nfs4_wait_clnt_recover(clp);
+ exception->recovering = 1;
+ return 0;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ ret = nfs4_delay(server->client, &exception->timeout);
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ ret = nfs4_wait_clnt_recover(clp);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -EIO;
+ goto out_retry;
+ }
+ return ret;
+out_retry:
+ if (ret == 0)
+ exception->retry = 1;
+ return ret;
+}
+
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ rpc_delay(task, nfs4_update_delay(&exception->timeout));
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+ rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+ goto out_retry;
+ }
if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- return -EIO;
+ ret = -EIO;
+ return ret;
+out_retry:
if (ret == 0)
exception->retry = 1;
return ret;
}
+static int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
+{
+ struct nfs4_exception exception = {
+ .state = state,
+ };
+
+ if (task->tk_status >= 0)
+ return 0;
+ if (timeout)
+ exception.timeout = *timeout;
+ task->tk_status = nfs4_async_handle_exception(task, server,
+ task->tk_status,
+ &exception);
+ if (exception.delay && timeout)
+ *timeout = exception.timeout;
+ if (exception.retry)
+ return -EAGAIN;
+ return 0;
+}
+
/*
* Return 'true' if 'clp' is using an rpc_client that is integrity protected
* or 'false' otherwise.
@@ -586,7 +662,7 @@ out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
if (send_new_highest_used_slotid)
- nfs41_server_notify_highest_slotid_update(session->clp);
+ nfs41_notify_server(session->clp);
}
int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -1127,6 +1203,21 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
return ret;
}
+static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
+ fmode_t fmode)
+{
+ switch(fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ|FMODE_WRITE:
+ return state->n_rdwr != 0;
+ case FMODE_WRITE:
+ return state->n_wronly != 0;
+ case FMODE_READ:
+ return state->n_rdonly != 0;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
{
int ret = 0;
@@ -1150,7 +1241,8 @@ out:
return ret;
}
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+ enum open_claim_type4 claim)
{
if (delegation == NULL)
return 0;
@@ -1158,6 +1250,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
return 0;
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
return 0;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+ break;
+ default:
+ return 0;
+ }
nfs_mark_delegation_referenced(delegation);
return 1;
}
@@ -1220,6 +1322,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
}
static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
nfs4_stateid *stateid, fmode_t fmode)
{
clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1238,8 +1341,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
if (stateid == NULL)
return;
/* Handle races with OPEN */
- if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
- !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
+ (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
nfs_resync_open_stateid_locked(state);
return;
}
@@ -1248,10 +1352,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
nfs4_stateid_copy(&state->open_stateid, stateid);
}
-static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
+ nfs4_stateid *stateid, fmode_t fmode)
{
write_seqlock(&state->seqlock);
- nfs_clear_open_stateid_locked(state, stateid, fmode);
+ nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
write_sequnlock(&state->seqlock);
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -1282,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
+ spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
if (deleg_stateid != NULL) {
nfs4_stateid_copy(&state->stateid, deleg_stateid);
@@ -1290,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
if (open_stateid != NULL)
nfs_set_open_stateid_locked(state, open_stateid, fmode);
write_sequnlock(&state->seqlock);
- spin_lock(&state->owner->so_lock);
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
@@ -1376,6 +1482,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
+ enum open_claim_type4 claim = opendata->o_arg.claim;
nfs4_stateid stateid;
int ret = -EAGAIN;
@@ -1389,7 +1496,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
- if (!can_open_delegated(delegation, fmode)) {
+ if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
}
@@ -1427,12 +1534,18 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
if (delegation)
delegation_flags = delegation->flags;
rcu_read_unlock();
- if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+ switch (data->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
"returning a delegation for "
"OPEN(CLAIM_DELEGATE_CUR)\n",
clp->cl_hostname);
- } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+ return;
+ }
+ if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
nfs_inode_set_delegation(state->inode,
data->owner->so_cred,
&data->o_res);
@@ -1488,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
if (!data->rpc_done) {
state = nfs4_try_open_cached(data);
+ trace_nfs4_cached_open(data->state);
goto out;
}
@@ -1555,17 +1669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
return opendata;
}
-static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
+ fmode_t fmode)
{
struct nfs4_state *newstate;
int ret;
- if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
- opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
- (opendata->o_arg.u.delegation_type & fmode) != fmode)
- /* This mode can't have been delegated, so we must have
- * a valid open_stateid to cover it - not need to reclaim.
- */
+ if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
return 0;
opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
@@ -1581,14 +1691,14 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
newstate = nfs4_opendata_to_nfs4_state(opendata);
if (IS_ERR(newstate))
return PTR_ERR(newstate);
+ if (newstate != opendata->state)
+ ret = -ESTALE;
nfs4_close_state(newstate, fmode);
- *res = newstate;
- return 0;
+ return ret;
}
static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
{
- struct nfs4_state *newstate;
int ret;
/* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
@@ -1599,27 +1709,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
clear_bit(NFS_DELEGATED_STATE, &state->flags);
clear_bit(NFS_OPEN_STATE, &state->flags);
smp_rmb();
- if (state->n_rdwr != 0) {
- ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
- if (ret != 0)
- return ret;
- if (newstate != state)
- return -ESTALE;
- }
- if (state->n_wronly != 0) {
- ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
- if (ret != 0)
- return ret;
- if (newstate != state)
- return -ESTALE;
- }
- if (state->n_rdonly != 0) {
- ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
- if (ret != 0)
- return ret;
- if (newstate != state)
- return -ESTALE;
- }
+ ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (ret != 0)
+ return ret;
+ ret = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (ret != 0)
+ return ret;
+ ret = nfs4_open_recover_helper(opendata, FMODE_READ);
+ if (ret != 0)
+ return ret;
/*
* We may have performed cached opens for all three recoveries.
* Check if we need to update the current stateid.
@@ -1743,18 +1841,35 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
return err;
}
-int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
+ struct nfs4_state *state, const nfs4_stateid *stateid,
+ fmode_t type)
{
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs4_opendata *opendata;
- int err;
+ int err = 0;
opendata = nfs4_open_recoverdata_alloc(ctx, state,
NFS4_OPEN_CLAIM_DELEG_CUR_FH);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
- err = nfs4_open_recover(opendata, state);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ switch (type & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ|FMODE_WRITE:
+ case FMODE_WRITE:
+ err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (err)
+ break;
+ err = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (err)
+ break;
+ case FMODE_READ:
+ err = nfs4_open_recover_helper(opendata, FMODE_READ);
+ }
nfs4_opendata_put(opendata);
return nfs4_handle_delegation_recall_error(server, state, stateid, err);
}
@@ -1834,6 +1949,8 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
data->rpc_done = 0;
data->rpc_status = 0;
data->timestamp = jiffies;
+ if (data->is_recover)
+ nfs4_set_sequence_privileged(&data->c_arg.seq_args);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1852,6 +1969,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
struct nfs4_opendata *data = calldata;
struct nfs4_state_owner *sp = data->owner;
struct nfs_client *clp = sp->so_server->nfs_client;
+ enum open_claim_type4 claim = data->o_arg.claim;
if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
goto out_wait;
@@ -1866,15 +1984,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
goto out_no_action;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
- if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
- data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
- can_open_delegated(delegation, data->o_arg.fmode))
+ if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
}
/* Update client id. */
data->o_arg.clientid = clp->cl_clientid;
- switch (data->o_arg.claim) {
+ switch (claim) {
+ default:
+ break;
case NFS4_OPEN_CLAIM_PREVIOUS:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
@@ -1901,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
}
return;
unlock_no_action:
+ trace_nfs4_cached_open(data->state);
rcu_read_unlock();
out_no_action:
task->tk_action = NULL;
@@ -2294,15 +2413,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
* fields corresponding to attributes that were used to store the verifier.
* Make sure we clobber those fields in the later setattr call
*/
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+ struct iattr *sattr, struct nfs4_label **label)
{
- if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+ const u32 *attrset = opendata->o_res.attrset;
+
+ if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
!(sattr->ia_valid & ATTR_ATIME_SET))
sattr->ia_valid |= ATTR_ATIME;
- if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+ if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
!(sattr->ia_valid & ATTR_MTIME_SET))
sattr->ia_valid |= ATTR_MTIME;
+
+ /* Except MODE, it seems harmless of setting twice. */
+ if ((attrset[1] & FATTR4_WORD1_MODE))
+ sattr->ia_valid &= ~ATTR_MODE;
+
+ if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+ *label = NULL;
}
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2425,9 +2554,9 @@ static int _nfs4_do_open(struct inode *dir,
goto err_free_label;
state = ctx->state;
- if ((opendata->o_arg.open_flags & O_EXCL) &&
+ if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
- nfs4_exclusive_attrset(opendata, sattr);
+ nfs4_exclusive_attrset(opendata, sattr, &label);
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
@@ -2439,7 +2568,7 @@ static int _nfs4_do_open(struct inode *dir,
nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
}
}
- if (opendata->file_created)
+ if (opened && opendata->file_created)
*opened |= FILE_CREATED;
if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -2579,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
+ trace_nfs4_setattr(inode, &arg.stateid, status);
return status;
}
@@ -2595,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
int err;
do {
err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
- trace_nfs4_setattr(inode, err);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2618,6 +2747,15 @@ out:
return err;
}
+static bool
+nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task)
+{
+ if (inode == NULL || !nfs_have_layout(inode))
+ return false;
+
+ return pnfs_wait_on_layoutreturn(inode, task);
+}
+
struct nfs4_closedata {
struct inode *inode;
struct nfs4_state *state;
@@ -2661,7 +2799,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
switch (task->tk_status) {
case 0:
res_stateid = &calldata->res.stateid;
- if (calldata->arg.fmode == 0 && calldata->roc)
+ if (calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
renew_lease(server, calldata->timestamp);
@@ -2684,7 +2822,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
goto out_release;
}
}
- nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, &calldata->arg.stateid,
+ res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2735,14 +2874,16 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_no_action;
}
- if (calldata->arg.fmode == 0) {
- task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
- if (calldata->roc &&
- pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
- nfs_release_seqid(calldata->arg.seqid);
- goto out_wait;
- }
+ if (nfs4_wait_on_layoutreturn(inode, task)) {
+ nfs_release_seqid(calldata->arg.seqid);
+ goto out_wait;
}
+
+ if (calldata->arg.fmode == 0)
+ task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
+ if (calldata->roc)
+ pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
calldata->arg.share_access =
nfs4_map_atomic_open_share(NFS_SERVER(inode),
calldata->arg.fmode, 0);
@@ -2883,8 +3024,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
+ u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
struct nfs4_server_caps_arg args = {
.fhandle = fhandle,
+ .bitmask = bitmask,
};
struct nfs4_server_caps_res res = {};
struct rpc_message msg = {
@@ -2894,10 +3037,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
};
int status;
+ bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+ FATTR4_WORD0_FH_EXPIRE_TYPE |
+ FATTR4_WORD0_LINK_SUPPORT |
+ FATTR4_WORD0_SYMLINK_SUPPORT |
+ FATTR4_WORD0_ACLSUPPORT;
+ if (minorversion)
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
/* Sanity check the server answers */
- switch (server->nfs_client->cl_minorversion) {
+ switch (minorversion) {
case 0:
res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
res.attr_bitmask[2] = 0;
@@ -2950,6 +3101,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->cache_consistency_bitmask[2] = 0;
+ memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+ sizeof(server->exclcreat_bitmask));
server->acl_bitmask = res.acl_bitmask;
server->fh_expire_type = res.fh_expire_type;
}
@@ -3552,7 +3705,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
- int opened = 0;
int status = 0;
ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3562,7 +3714,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
if (IS_ERR(state)) {
status = PTR_ERR(state);
goto out;
@@ -4456,7 +4608,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
static int buf_to_pages_noslab(const void *buf, size_t buflen,
- struct page **pages, unsigned int *pgbase)
+ struct page **pages)
{
struct page *newpage, **spages;
int rc = 0;
@@ -4600,7 +4752,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
goto out_free;
args.acl_len = npages * PAGE_SIZE;
- args.acl_pgbase = 0;
dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
__func__, buf, buflen, npages, args.acl_len);
@@ -4692,7 +4843,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
- i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+ i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
nfs4_inode_return_delegation(inode);
@@ -4881,79 +5032,6 @@ out:
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
- struct nfs4_state *state, long *timeout)
-{
- struct nfs_client *clp = server->nfs_client;
-
- if (task->tk_status >= 0)
- return 0;
- switch(task->tk_status) {
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OPENMODE:
- if (state == NULL)
- break;
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_EXPIRED:
- if (state != NULL) {
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- }
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_STALE_CLIENTID:
- nfs4_schedule_lease_recovery(clp);
- goto wait_on_recovery;
- case -NFS4ERR_MOVED:
- if (nfs4_schedule_migration_recovery(server) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_LEASE_MOVED:
- nfs4_schedule_lease_moved_recovery(clp);
- goto wait_on_recovery;
-#if defined(CONFIG_NFS_V4_1)
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
- dprintk("%s ERROR %d, Reset session\n", __func__,
- task->tk_status);
- nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- goto wait_on_recovery;
-#endif /* CONFIG_NFS_V4_1 */
- case -NFS4ERR_DELAY:
- nfs_inc_server_stats(server, NFSIOS_DELAY);
- rpc_delay(task, nfs4_update_delay(timeout));
- goto restart_call;
- case -NFS4ERR_GRACE:
- rpc_delay(task, NFS4_POLL_RETRY_MAX);
- case -NFS4ERR_RETRY_UNCACHED_REP:
- case -NFS4ERR_OLD_STATEID:
- goto restart_call;
- }
- task->tk_status = nfs4_map_errors(task->tk_status);
- return 0;
-recovery_failed:
- task->tk_status = -EIO;
- return 0;
-wait_on_recovery:
- rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
- if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
- rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
- if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- goto recovery_failed;
-restart_call:
- task->tk_status = 0;
- return -EAGAIN;
-}
-
static void nfs4_init_boot_verifier(const struct nfs_client *clp,
nfs4_verifier *bootverf)
{
@@ -4975,16 +5053,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
- bool retried = false;
if (clp->cl_owner_id != NULL)
return 0;
-retry:
+
rcu_read_lock();
- len = 10 + strlen(clp->cl_ipaddr) + 1 +
+ len = 14 + strlen(clp->cl_ipaddr) + 1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
@@ -5004,20 +5080,12 @@ retry:
return -ENOMEM;
rcu_read_lock();
- result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+ scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
rcu_read_unlock();
- /* Did something change? */
- if (result >= len) {
- kfree(str);
- if (retried)
- return -EINVAL;
- retried = true;
- goto retry;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5025,7 +5093,6 @@ retry:
static int
nfs4_init_uniquifier_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5045,14 +5112,10 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
- if (result >= len) {
- kfree(str);
- return -EINVAL;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5060,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5085,13 +5147,9 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
- if (result >= len) {
- kfree(str);
- return -EINVAL;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5289,10 +5347,12 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
- if (d_data->roc &&
- pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
+ if (nfs4_wait_on_layoutreturn(d_data->inode, task))
return;
+ if (d_data->roc)
+ pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
+
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
&d_data->res.seq_res,
@@ -5326,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
if (data == NULL)
return -ENOMEM;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
@@ -5368,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
int err;
do {
err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
- trace_nfs4_delegreturn(inode, err);
+ trace_nfs4_delegreturn(inode, stateid, err);
switch (err) {
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -5454,18 +5519,7 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_inode_wait(inode, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_inode_wait(inode, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_inode_wait(inode, fl);
}
struct nfs4_unlockdata {
@@ -5474,7 +5528,7 @@ struct nfs4_unlockdata {
struct nfs4_lock_state *lsp;
struct nfs_open_context *ctx;
struct file_lock fl;
- const struct nfs_server *server;
+ struct nfs_server *server;
unsigned long timestamp;
};
@@ -5889,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->cancelled = 1;
rpc_put_task(task);
dprintk("%s: done, ret = %d!\n", __func__, ret);
+ trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
return ret;
}
@@ -5905,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
- trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -5932,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
- trace_nfs4_lock_expired(request, state, F_SETLK, err);
switch (err) {
default:
goto out;
@@ -6040,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
do {
err = _nfs4_proc_setlk(state, cmd, request);
- trace_nfs4_set_lock(request, state, cmd, err);
if (err == -NFS4ERR_DENIED)
err = -EAGAIN;
err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -6201,48 +6253,32 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
-static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
const void *buf, size_t buflen,
- int flags, int type)
+ int flags)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
}
-static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
}
-static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
{
- size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
-
- if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
- return 0;
-
- if (list && len <= list_len)
- memcpy(list, XATTR_NAME_NFSV4_ACL, len);
- return len;
+ return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
}
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
-static inline int nfs4_server_supports_labels(struct nfs_server *server)
-{
- return server->caps & NFS_CAP_SECURITY_LABEL;
-}
-static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags, int type)
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags)
{
if (security_ismaclabel(key))
return nfs4_set_security_label(dentry, buf, buflen);
@@ -6250,36 +6286,43 @@ static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
return -EOPNOTSUPP;
}
-static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
if (security_ismaclabel(key))
return nfs4_get_security_label(d_inode(dentry), buf, buflen);
return -EOPNOTSUPP;
}
-static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
{
- size_t len = 0;
+ int len = 0;
- if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
- len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
- if (list && len <= list_len)
- security_inode_listsecurity(d_inode(dentry), list, len);
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(inode, list, list_len);
+ if (list_len && len > list_len)
+ return -ERANGE;
}
return len;
}
static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = nfs4_xattr_list_nfs4_label,
.get = nfs4_xattr_get_nfs4_label,
.set = nfs4_xattr_set_nfs4_label,
};
-#endif
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+
+#endif
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
@@ -6809,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
},
.allow.u.words = {
[0] = 1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN) |
1 << (OP_COMMIT),
[1] = 1 << (OP_SECINFO - 32) |
1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_LAYOUTRETURN - 32) |
1 << (OP_TEST_STATEID - 32) |
1 << (OP_FREE_STATEID - 32) |
1 << (OP_WRITE - 32)
@@ -6877,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
}
if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+ test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
test_bit(OP_LOCKU, sp->allow.u.longs)) {
dfprintk(MOUNT, " cleanup mode enabled\n");
set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
}
+ if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &clp->cl_sp4_flags);
+ }
+
if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
dfprintk(MOUNT, " secinfo mode enabled\n");
@@ -7710,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
struct nfs4_session *session = nfs4_get_session(server);
+ int ret;
dprintk("--> %s\n", __func__);
/* Note the is a race here, where a CB_LAYOUTRECALL can come in
@@ -7720,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
if (nfs41_setup_sequence(session, &lgp->args.seq_args,
&lgp->res.seq_res, task))
return;
- if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
NFS_I(lgp->args.inode)->layout,
&lgp->args.range,
- lgp->args.ctx->state)) {
- rpc_exit(task, NFS4_OK);
- }
+ lgp->args.ctx->state);
+ if (ret < 0)
+ rpc_exit(task, ret);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -7745,11 +7800,29 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
goto out;
+
+ /*
+ * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+ * on the file. set tk_status to -ENODATA to tell upper layer to
+ * retry go inband.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ task->tk_status = -ENODATA;
+ goto out;
+ /*
+ * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+ * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+ */
+ case -NFS4ERR_BADLAYOUT:
+ goto out_overflow;
/*
* NFS4ERR_LAYOUTTRYLATER is a conflict with another client
- * (or clients) writing to the same RAID stripe
+ * (or clients) writing to the same RAID stripe except when
+ * the minlength argument is 0 (see RFC5661 section 18.43.3).
*/
case -NFS4ERR_LAYOUTTRYLATER:
+ if (lgp->args.minlength == 0)
+ goto out_overflow;
/*
* NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
* existing layout before getting a new one).
@@ -7773,38 +7846,49 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
__func__, delay);
rpc_delay(task, delay);
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- goto out; /* Do not call nfs4_async_handle_error() */
+ /* Do not call nfs4_async_handle_error() */
+ goto out_restart;
}
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
spin_lock(&inode->i_lock);
- lo = NFS_I(inode)->layout;
- if (!lo || list_empty(&lo->plh_segs)) {
+ if (nfs4_stateid_match(&lgp->args.stateid,
+ &lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
/* If the open stateid was bad, then recover it. */
state = lgp->args.ctx->state;
- } else {
+ break;
+ }
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match(&lgp->args.stateid,
+ &lo->plh_stateid)) {
LIST_HEAD(head);
/*
* Mark the bad layout state as invalid, then retry
* with the current stateid.
*/
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
-
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- }
+ } else
+ spin_unlock(&inode->i_lock);
+ goto out_restart;
}
- if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
- rpc_restart_call_prepare(task);
+ if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
+ goto out_restart;
out:
dprintk("<-- %s\n", __func__);
+ return;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ return;
+out_overflow:
+ task->tk_status = -EOVERFLOW;
+ goto out;
}
static size_t max_response_pages(struct nfs_server *server)
@@ -7921,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
trace_nfs4_layoutget(lgp->args.ctx,
&lgp->args.range,
&lgp->res.range,
+ &lgp->res.stateid,
status);
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
@@ -7977,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+ pnfs_mark_layout_returned_if_empty(lo);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
pnfs_clear_layoutreturn_waitbit(lo);
- lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
@@ -8013,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
};
int status = 0;
+ nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+ NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
dprintk("--> %s\n", __func__);
if (!sync) {
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
@@ -8028,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutreturn(lrp->args.inode, status);
+ trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8176,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutcommit(data->args.inode, status);
+ trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8650,7 +8739,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
- | NFS_CAP_LAYOUTSTATS,
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -8661,6 +8751,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
};
#endif
@@ -8674,6 +8765,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
+ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ ssize_t error, error2;
+
+ error = generic_listxattr(dentry, list, size);
+ if (error < 0)
+ return error;
+ if (list) {
+ list += error;
+ size -= error;
+ }
+
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ if (error2 < 0)
+ return error2;
+ return error + error2;
+}
+
static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -8690,7 +8799,7 @@ static const struct inode_operations nfs4_dir_inode_operations = {
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
@@ -8700,7 +8809,7 @@ static const struct inode_operations nfs4_file_inode_operations = {
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
@@ -8759,7 +8868,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
- .prefix = XATTR_NAME_NFSV4_ACL,
+ .name = XATTR_NAME_NFSV4_ACL,
.list = nfs4_xattr_list_nfs4_acl,
.get = nfs4_xattr_get_nfs4_acl,
.set = nfs4_xattr_set_nfs4_acl,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f2e2ad8..d854693 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1481,7 +1481,7 @@ restart:
spin_unlock(&state->state_lock);
}
nfs4_put_open_state(state);
- clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
+ clear_bit(NFS_STATE_RECLAIM_NOGRACE,
&state->flags);
spin_lock(&sp->so_lock);
goto restart;
@@ -1725,7 +1725,8 @@ restart:
if (!test_and_clear_bit(ops->owner_flag_bit,
&sp->so_flags))
continue;
- atomic_inc(&sp->so_count);
+ if (!atomic_inc_not_zero(&sp->so_count))
+ continue;
spin_unlock(&clp->cl_lock);
rcu_read_unlock();
@@ -2152,23 +2153,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
{
/* Use CHECK_LEASE to ping the server with a SEQUENCE */
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
nfs4_schedule_state_manager(clp);
}
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
static void nfs4_reset_all_state(struct nfs_client *clp)
{
if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0fbd3ab..8693d77 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -12,7 +12,7 @@
#include "nfs4idmap.h"
#include "callback.h"
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d774335..2850bce 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -6,6 +6,7 @@
#include "internal.h"
#include "nfs4session.h"
#include "callback.h"
+#include "pnfs.h"
#define CREATE_TRACE_POINTS
#include "nfs4trace.h"
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 470af1a..2c8d05d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done,
__entry->highest_slotid = res->sr_highest_slotid;
__entry->target_highest_slotid =
res->sr_target_highest_slotid;
+ __entry->status_flags = res->sr_status_flags;
__entry->error = res->sr_status;
),
TP_printk(
@@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__field(u64, fileid)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, openstateid_seq)
+ __field(u32, openstateid_hash)
),
TP_fast_assign(
@@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->flags = flags;
__entry->fmode = (__force unsigned int)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
- if (!IS_ERR(state))
+ if (!IS_ERR_OR_NULL(state)) {
inode = state->inode;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->openstateid_seq =
+ be32_to_cpu(state->open_stateid.seqid);
+ __entry->openstateid_hash =
+ nfs_stateid_hash(&state->open_stateid);
+ } else {
+ __entry->stateid_seq = 0;
+ __entry->stateid_hash = 0;
+ __entry->openstateid_seq = 0;
+ __entry->openstateid_hash = 0;
+ }
if (inode != NULL) {
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
TP_printk(
"error=%d (%s) flags=%d (%s) fmode=%s "
"fileid=%02x:%02x:%llu fhandle=0x%08x "
- "name=%02x:%02x:%llu/%s",
+ "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+ "openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->flags,
@@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fhandle,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->openstateid_seq, __entry->openstateid_hash
)
);
@@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+TRACE_EVENT(nfs4_cached_open,
+ TP_PROTO(
+ const struct nfs4_state *state
+ ),
+ TP_ARGS(state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x stateid=%d:0x%08x",
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
TRACE_EVENT(nfs4_close,
TP_PROTO(
const struct nfs4_state *state,
@@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close,
__field(u64, fileid)
__field(unsigned int, fmode)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close,
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->fmode = (__force unsigned int)state->state;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->stateid);
),
TP_printk(
"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
- "fhandle=0x%08x",
+ "fhandle=0x%08x openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->fmode ? show_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) cmd=%s:%s range=%lld:%lld "
- "fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
show_lock_cmd(__entry->cmd),
@@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
), \
TP_ARGS(request, state, cmd, error))
DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+TRACE_EVENT(nfs4_set_lock,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ const nfs4_stateid *lockstateid,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, lockstateid, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, lockstateid_seq)
+ __field(u32, lockstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->lockstateid_seq =
+ be32_to_cpu(lockstateid->seqid);
+ __entry->lockstateid_hash =
+ nfs_stateid_hash(lockstateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->lockstateid_seq, __entry->lockstateid_hash
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_PROTO(
const struct inode *inode,
@@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit,
__field(dev_t, dev)
__field(u32, fhandle)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
__entry->dev = res->server->s_dev;
__entry->fhandle = nfs_fhandle_hash(args->fhandle);
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(args->stateid);
),
TP_printk(
- "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+ "error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
- "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
), \
TP_ARGS(inode, error))
-DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_EVENT(nfs4_access);
DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
@@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
-DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
@@ -884,6 +1088,132 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
+
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
const char *name,
@@ -945,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
#define DEFINE_NFS4_READ_EVENT(name) \
@@ -996,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -1094,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget,
const struct nfs_open_context *ctx,
const struct pnfs_layout_range *args,
const struct pnfs_layout_range *res,
+ const nfs4_stateid *layout_stateid,
int error
),
- TP_ARGS(ctx, args, res, error),
+ TP_ARGS(ctx, args, res, layout_stateid, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1107,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget,
__field(u64, offset)
__field(u64, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
const struct inode *inode = d_inode(ctx->dentry);
+ const struct nfs4_state *state = ctx->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -1118,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget,
__entry->offset = args->offset;
__entry->count = args->length;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ if (!error) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(layout_stateid->seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(layout_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "iomode=%s offset=%llu count=%llu",
+ "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1130,12 +1498,82 @@ TRACE_EVENT(nfs4_layoutget,
__entry->fhandle,
show_pnfs_iomode(__entry->iomode),
(unsigned long long)__entry->offset,
- (unsigned long long)__entry->count
+ (unsigned long long)__entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+
+#define show_pnfs_update_layout_reason(reason) \
+ __print_symbolic(reason, \
+ { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
+ { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \
+ { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \
+ { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \
+ { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \
+ { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \
+ { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \
+ { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
+ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
+ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+
+TRACE_EVENT(pnfs_update_layout,
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ enum pnfs_update_layout_reason reason
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, reason),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(enum pnfs_update_layout_reason, reason)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ __entry->reason = reason;
+ if (lo != NULL) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x (%s)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ show_pnfs_update_layout_reason(__entry->reason)
+ )
+);
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65d..4e44412 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
- 1 /* FIXME: opaque lrf_body always empty at the moment */)
+ 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
1 + decode_stateid_maxsz)
#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
@@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
- const struct nfs_server *server)
+ const struct nfs_server *server,
+ bool excl_check)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 4;
}
+
+ if (excl_check) {
+ const u32 *excl_bmval = server->exclcreat_bitmask;
+ bmval[0] &= excl_bmval[0];
+ bmval[1] &= excl_bmval[1];
+ bmval[2] &= excl_bmval[2];
+
+ if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+ label = NULL;
+ }
+
if (label) {
len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
case NF4LNK:
p = reserve_space(xdr, 4);
*p = cpu_to_be32(create->u.symlink.len);
- xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+ xdr_write_pages(xdr, create->u.symlink.pages, 0,
+ create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
break;
case NF4BLK: case NF4CHR:
@@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server);
+ encode_attrs(xdr, create->attrs, create->label, create->server, false);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
- struct iattr dummy;
__be32 *p;
p = reserve_space(xdr, 4);
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- dummy.ia_valid = 0;
- encode_attrs(xdr, &dummy, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
}
}
@@ -1646,7 +1659,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
*p = cpu_to_be32(FATTR4_WORD0_ACL);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(arg->acl_len);
- xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+ xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
}
static void
@@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server);
+ encode_attrs(xdr, arg->iap, arg->label, server, false);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2478,7 +2491,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, args->acl_pgbase, args->acl_len);
+ args->acl_pages, 0, args->acl_len);
encode_nops(&hdr);
}
@@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
struct xdr_stream *xdr,
struct nfs4_server_caps_arg *args)
{
+ const u32 *bitmask = args->bitmask;
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
@@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
- encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
- FATTR4_WORD0_FH_EXPIRE_TYPE|
- FATTR4_WORD0_LINK_SUPPORT|
- FATTR4_WORD0_SYMLINK_SUPPORT|
- FATTR4_WORD0_ACLSUPPORT, &hdr);
+ encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
encode_nops(&hdr);
}
@@ -3368,6 +3378,22 @@ out_overflow:
return -EIO;
}
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+ uint32_t *bitmap, uint32_t *bitmask)
+{
+ if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+ int ret;
+ ret = decode_attr_bitmap(xdr, bitmask);
+ if (unlikely(ret < 0))
+ return ret;
+ bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ } else
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
+ return 0;
+}
+
static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
{
__be32 *p;
@@ -3589,6 +3615,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = 0;
if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
goto out;
+ bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS;
status = -EIO;
/* Ignore borken servers that return unrequested attrs */
if (unlikely(res == NULL))
@@ -4321,6 +4348,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
goto xdr_error;
+ if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+ res->exclcreat_bitmask)) != 0)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -4346,6 +4376,11 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
goto xdr_error;
if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
goto xdr_error;
if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
@@ -4545,6 +4580,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_mode(xdr, bitmap, &fmode);
if (status < 0)
goto xdr_error;
@@ -4598,6 +4637,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
if (status < 0)
goto xdr_error;
@@ -4735,6 +4778,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
return 0;
}
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
unsigned int savep;
@@ -4760,15 +4825,28 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
goto xdr_error;
fsinfo->wtpref = fsinfo->wtmax;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
if (status != 0)
goto xdr_error;
status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
if (status != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
if (status)
goto xdr_error;
+ status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ if (status)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
@@ -4903,24 +4981,28 @@ static int decode_lookup(struct xdr_stream *xdr)
}
/* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+ unsigned long *pagemod_limit)
{
__be32 *p;
uint32_t limit_type, nblocks, blocksize;
+ u64 maxsize = 0;
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
goto out_overflow;
limit_type = be32_to_cpup(p++);
switch (limit_type) {
- case 1:
- xdr_decode_hyper(p, maxsize);
+ case NFS4_LIMIT_SIZE:
+ xdr_decode_hyper(p, &maxsize);
break;
- case 2:
+ case NFS4_LIMIT_BLOCKS:
nblocks = be32_to_cpup(p++);
blocksize = be32_to_cpup(p);
- *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+ maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
+ maxsize >>= PAGE_CACHE_SHIFT;
+ *pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -4948,7 +5030,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
break;
case NFS4_OPEN_DELEGATE_WRITE:
res->delegation_type = FMODE_WRITE|FMODE_READ;
- if (decode_space_limit(xdr, &res->maxsize) < 0)
+ if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
return -EIO;
}
return decode_ace(xdr, NULL, res->server->nfs_client);
@@ -7432,6 +7514,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC(CLONE, enc_clone, dec_clone),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 9bc9f04..89a15db 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -90,7 +90,7 @@
#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
-static char nfs_root_parms[256] __initdata = "";
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
/* Text-based mount options passed to super.c */
static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 59f838c..9f80a08 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
- { 1 << NFS_INO_COMMIT, "COMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5aaed36..9aebffb 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -124,7 +124,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
retry_lookup:
od = osduld_info_lookup(&odi);
- if (unlikely(IS_ERR(od))) {
+ if (IS_ERR(od)) {
err = PTR_ERR(od);
dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
if (err == -ENODEV && retry_flag) {
@@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
return page;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4984bbe..8ce4f61 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
{
spin_lock(&hdr->lock);
- if (pos < hdr->io_start + hdr->good_bytes) {
- set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+ if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+ || pos < hdr->io_start + hdr->good_bytes) {
clear_bit(NFS_IOHDR_EOF, &hdr->flags);
hdr->good_bytes = pos - hdr->io_start;
hdr->error = error;
@@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p)
kmem_cache_free(nfs_page_cachep, p);
}
-static void
-nfs_iocounter_inc(struct nfs_io_counter *c)
-{
- atomic_inc(&c->io_count);
-}
-
-static void
-nfs_iocounter_dec(struct nfs_io_counter *c)
-{
- if (atomic_dec_and_test(&c->io_count)) {
- clear_bit(NFS_IO_INPROGRESS, &c->flags);
- smp_mb__after_atomic();
- wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
- }
-}
-
-static int
-__nfs_iocounter_wait(struct nfs_io_counter *c)
-{
- wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
- DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
- set_bit(NFS_IO_INPROGRESS, &c->flags);
- if (atomic_read(&c->io_count) == 0)
- break;
- ret = nfs_wait_bit_killable(&q.key);
- } while (atomic_read(&c->io_count) != 0 && !ret);
- finish_wait(wq, &q.wait);
- return ret;
-}
-
/**
* nfs_iocounter_wait - wait for i/o to complete
- * @c: nfs_io_counter to use
+ * @l_ctx: nfs_lock_context with io_counter to use
*
* returns -ERESTARTSYS if interrupted by a fatal signal.
* Otherwise returns 0 once the io_count hits 0.
*/
int
-nfs_iocounter_wait(struct nfs_io_counter *c)
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
{
- if (atomic_read(&c->io_count) == 0)
- return 0;
- return __nfs_iocounter_wait(c);
+ return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
+ TASK_KILLABLE);
}
/*
@@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
return ERR_CAST(l_ctx);
}
req->wb_lock_context = l_ctx;
- nfs_iocounter_inc(&l_ctx->io_count);
+ atomic_inc(&l_ctx->io_count);
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
@@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req)
req->wb_page = NULL;
}
if (l_ctx != NULL) {
- nfs_iocounter_dec(&l_ctx->io_count);
+ if (atomic_dec_and_test(&l_ctx->io_count))
+ wake_up_atomic_t(&l_ctx->io_count);
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
@@ -508,7 +474,7 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
* for it without upsetting the slab allocator.
*/
if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
- sizeof(struct page) > PAGE_SIZE)
+ sizeof(struct page *) > PAGE_SIZE)
return 0;
return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
@@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
* @desc: IO descriptor
* @hdr: pageio header
*/
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_mirror *mirror;
- u32 midx;
-
set_bit(NFS_IOHDR_REDO, &hdr->flags);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
- /* TODO: Make sure it's right to clean up all mirrors here
- * and not just hdr->pgio_mirror_idx */
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- }
- return -ENOMEM;
}
/**
@@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
unsigned int pagecount, pageused;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
pages = hdr->page_array.pagevec;
@@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*pages++ = last_page = req->wb_page;
}
}
- if (WARN_ON_ONCE(pageused != pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror;
struct nfs_pgio_header *hdr;
int ret;
- mirror = nfs_pgio_current_mirror(desc);
-
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- /* TODO: make sure this is right with mirroring - or
- * should it back out all mirrors? */
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (pgio->pg_error < 0)
+ return pgio->pg_error;
+
if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
return -EINVAL;
@@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
pgio->pg_mirrors_dynamic = NULL;
}
-static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
- const struct nfs_open_context *ctx2)
-{
- return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
-}
-
static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
const struct nfs_lock_context *l2)
{
@@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
mirror->pg_base = req->wb_pgbase;
}
if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
bytes = req->wb_bytes;
nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
if (midx) {
@@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (IS_ERR(dupreq)) {
nfs_page_group_unlock(req);
- return 0;
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
}
nfs_lock_request(dupreq);
@@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- return 0;
+ goto out_failed;
}
return 1;
+
+out_failed:
+ /*
+ * We might have failed before sending any reqs over wire.
+ * Clean up rest of the reqs in mirror pg_list.
+ */
+ if (desc->pg_error) {
+ struct nfs_pgio_mirror *mirror;
+ void (*func)(struct list_head *);
+
+ /* remember fatal errors */
+ if (nfs_error_is_fatal(desc->pg_error))
+ mapping_set_error(desc->pg_inode->i_mapping,
+ desc->pg_error);
+
+ func = desc->pg_completion_ops->error_cleanup;
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ func(&mirror->pg_list);
+ }
+ }
+ return 0;
}
/*
@@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
list_move(&failed, &hdr->pages);
- return -EIO;
+ return desc->pg_error < 0 ? desc->pg_error : -EIO;
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70bf706..a3592cc 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
static LIST_HEAD(pnfs_modules_tbl);
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync);
/* Return the registered pnfs layout driver module matching given id */
@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
lo->plh_return_iomode = 0;
- lo->plh_block_lgets++;
pnfs_get_layout_hdr(lo);
clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
return true;
@@ -386,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
enum pnfs_iomode iomode;
bool send;
- stateid = lo->plh_stateid;
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
iomode = lo->plh_return_iomode;
send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
}
} else
spin_unlock(&inode->i_lock);
@@ -567,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range)
+ const struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
- int invalid = 0, removed = 0;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -583,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
lseg->pls_range.length);
- invalid++;
- removed += mark_lseg_invalid(lseg, tmp_list);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
}
- dprintk("%s:Return %i\n", __func__, invalid - removed);
- return invalid - removed;
+ dprintk("%s:Return %i\n", __func__, remaining);
+ return remaining;
}
/* note free_me must contain lsegs from a single layout_hdr */
@@ -619,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_get_layout_hdr(lo);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
- pnfs_clear_retry_layoutget(lo);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -704,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
ret = -EAGAIN;
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&lseg_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
iput(inode);
}
@@ -817,37 +817,24 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}
-static bool
-pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range)
-{
- return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
- (lo->plh_return_iomode == IOMODE_ANY ||
- lo->plh_return_iomode == range->iomode);
-}
-
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range, int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
return lo->plh_block_lgets ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- (list_empty(&lo->plh_segs) &&
- (atomic_read(&lo->plh_outstanding) > lget)) ||
- pnfs_layout_returning(lo, range);
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state)
{
int status = 0;
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo, range, 1)) {
+ if (pnfs_layoutgets_blocked(lo)) {
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
@@ -875,46 +862,52 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
struct pnfs_layout_segment *lseg;
+ loff_t i_size;
dprintk("--> %s\n", __func__);
- lgp = kzalloc(sizeof(*lgp), gfp_flags);
- if (lgp == NULL)
- return NULL;
-
- lgp->args.minlength = PAGE_CACHE_SIZE;
- if (lgp->args.minlength > range->length)
- lgp->args.minlength = range->length;
- lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range = *range;
- lgp->args.type = server->pnfs_curr_ld->id;
- lgp->args.inode = ino;
- lgp->args.ctx = get_nfs_open_context(ctx);
- lgp->gfp_flags = gfp_flags;
- lgp->cred = lo->plh_lc_cred;
-
- /* Synchronously retrieve layout information from server and
- * store in lseg.
+ /*
+ * Synchronously retrieve layout information from server and
+ * store in lseg. If we race with a concurrent seqid morphing
+ * op, then re-send the LAYOUTGET.
*/
- lseg = nfs4_proc_layoutget(lgp, gfp_flags);
- if (IS_ERR(lseg)) {
- switch (PTR_ERR(lseg)) {
- case -ENOMEM:
- case -ERESTARTSYS:
- break;
- default:
- /* remember that LAYOUTGET failed and suspend trying */
- pnfs_layout_io_set_failed(lo, range->iomode);
+ do {
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return NULL;
+
+ i_size = i_size_read(ino);
+
+ lgp->args.minlength = PAGE_CACHE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
}
- return NULL;
- } else
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
+
+ lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+ } while (lseg == ERR_PTR(-EAGAIN));
+
+ if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+ lseg = NULL;
+ else
pnfs_layout_clear_fail_bit(lo,
pnfs_iomode_to_fail_bit(range->iomode));
@@ -945,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
}
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
{
struct inode *ino = lo->plh_inode;
@@ -956,15 +949,13 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
- lo->plh_block_lgets--;
pnfs_clear_layoutreturn_waitbit(lo);
- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
spin_unlock(&ino->i_lock);
pnfs_put_layout_hdr(lo);
goto out;
}
- lrp->args.stateid = stateid;
+ nfs4_stateid_copy(&lrp->args.stateid, stateid);
lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
lrp->args.inode = ino;
lrp->args.range.iomode = iomode;
@@ -1007,7 +998,7 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- stateid = nfsi->layout->plh_stateid;
+ nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
@@ -1035,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
- status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_put_layout_hdr(lo);
out:
@@ -1080,15 +1071,14 @@ bool pnfs_roc(struct inode *ino)
struct pnfs_layout_segment *lseg, *tmp;
nfs4_stateid stateid;
LIST_HEAD(tmp_list);
- bool found = false, layoutreturn = false;
+ bool found = false, layoutreturn = false, roc = false;
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
goto out_noroc;
- /* Don't return layout if we hold a delegation */
+ /* no roc if we hold a delegation */
if (nfs4_check_delegation(ino, FMODE_READ))
goto out_noroc;
@@ -1099,34 +1089,35 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- pnfs_clear_retry_layoutget(lo);
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ /* always send layoutreturn if being marked so */
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo);
+
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
- if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ /* If we are sending layoutreturn, invalidate all valid lsegs */
+ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
mark_lseg_invalid(lseg, &tmp_list);
found = true;
}
- if (!found)
- goto out_noroc;
- lo->plh_block_lgets++;
- pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
- spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&tmp_list);
- pnfs_layoutcommit_inode(ino, true);
- return true;
+ /* ROC in two conditions:
+ * 1. there are ROC lsegs
+ * 2. we don't send layoutreturn
+ */
+ if (found && !layoutreturn) {
+ /* lo ref dropped in pnfs_roc_release() */
+ pnfs_get_layout_hdr(lo);
+ roc = true;
+ }
out_noroc:
- if (lo) {
- stateid = lo->plh_stateid;
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
- }
spin_unlock(&ino->i_lock);
- if (layoutreturn) {
- pnfs_layoutcommit_inode(ino, true);
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
- }
- return false;
+ pnfs_free_lseg_list(&tmp_list);
+ pnfs_layoutcommit_inode(ino, true);
+ if (layoutreturn)
+ pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
+ return roc;
}
void pnfs_roc_release(struct inode *ino)
@@ -1135,7 +1126,7 @@ void pnfs_roc_release(struct inode *ino)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
- lo->plh_block_lgets--;
+ pnfs_clear_layoutreturn_waitbit(lo);
if (atomic_dec_and_test(&lo->plh_refcount)) {
pnfs_detach_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -1150,30 +1141,20 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
+ pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
+ trace_nfs4_layoutreturn_on_close(ino, 0);
}
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct pnfs_layout_hdr *lo;
- struct pnfs_layout_segment *lseg;
- nfs4_stateid stateid;
u32 current_seqid;
- bool layoutreturn = false;
spin_lock(&ino->i_lock);
- list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
- if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
- continue;
- if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
- continue;
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
- spin_unlock(&ino->i_lock);
- return true;
- }
lo = nfsi->layout;
current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
@@ -1181,19 +1162,27 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
* a barrier, we choose the worst-case barrier.
*/
*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
- stateid = lo->plh_stateid;
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
- if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
spin_unlock(&ino->i_lock);
- if (layoutreturn) {
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
- return true;
- }
- return false;
+}
+
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_hdr *lo;
+ bool sleep = false;
+
+ /* we might not have grabbed lo reference. so need to check under
+ * i_lock */
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ sleep = true;
+ spin_unlock(&ino->i_lock);
+
+ if (sleep)
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+
+ return sleep;
}
/*
@@ -1221,16 +1210,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
-static void
-pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_segment *lseg)
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
- struct pnfs_layout_segment *lp;
+ return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old)
+{
+ return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *,
+ const struct pnfs_layout_range *),
+ bool (*do_merge)(struct pnfs_layout_segment *,
+ struct pnfs_layout_segment *),
+ struct list_head *free_me)
+{
+ struct pnfs_layout_segment *lp, *tmp;
dprintk("%s:Begin\n", __func__);
- list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+ list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+ continue;
+ if (do_merge(lseg, lp)) {
+ mark_lseg_invalid(lp, free_me);
+ continue;
+ }
+ if (is_after(&lseg->pls_range, &lp->pls_range))
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -1252,6 +1266,24 @@ out:
dprintk("%s:Return\n", __func__);
}
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ struct inode *inode = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld->add_lseg != NULL)
+ ld->add_lseg(lo, lseg, free_me);
+ else
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ pnfs_lseg_range_is_after,
+ pnfs_lseg_no_merge,
+ free_me);
+}
static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
@@ -1344,8 +1376,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
ret = pnfs_get_lseg(lseg);
break;
}
- if (lseg->pls_range.offset > range->offset)
- break;
}
dprintk("%s:Return lseg %p ref %d\n",
@@ -1428,14 +1458,6 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
-{
- if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
- return 1;
- return nfs_wait_bit_killable(key);
-}
-
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
/*
@@ -1444,7 +1466,7 @@ static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
*/
pnfs_layoutcommit_inode(lo->plh_inode, false);
return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
- pnfs_layoutget_retry_bit_wait,
+ nfs_wait_bit_killable,
TASK_UNINTERRUPTIBLE);
}
@@ -1481,11 +1503,23 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
bool first;
- if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NO_PNFS);
goto out;
+ }
+
+ if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
+ goto out;
+ }
- if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_MDSTHRESH);
goto out;
+ }
lookup_again:
first = false;
@@ -1493,19 +1527,25 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
}
/* Do we even need to bother with this? */
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BULK_RECALL);
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
/* if LAYOUTGET already failed once we don't try again */
- if (pnfs_layout_io_test_failed(lo, iomode) &&
- !pnfs_should_retry_layoutget(lo))
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
+ }
first = list_empty(&lo->plh_segs);
if (first) {
@@ -1525,16 +1565,18 @@ lookup_again:
* already exists
*/
lseg = pnfs_find_lseg(lo, &arg);
- if (lseg)
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
goto out_unlock;
+ }
}
/*
* Because we free lsegs before sending LAYOUTRETURN, we need to wait
* for LAYOUTRETURN even if first is true.
*/
- if (!lseg && pnfs_should_retry_layoutget(lo) &&
- test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
dprintk("%s wait for layoutreturn\n", __func__);
if (pnfs_prepare_to_retry_layoutget(lo)) {
@@ -1544,11 +1586,16 @@ lookup_again:
dprintk("%s retrying\n", __func__);
goto lookup_again;
}
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_RETURN);
goto out_put_layout_hdr;
}
- if (pnfs_layoutgets_blocked(lo, &arg, 0))
+ if (pnfs_layoutgets_blocked(lo)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BLOCKED);
goto out_unlock;
+ }
atomic_inc(&lo->plh_outstanding);
spin_unlock(&ino->i_lock);
@@ -1571,8 +1618,9 @@ lookup_again:
arg.length = PAGE_CACHE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- pnfs_clear_retry_layoutget(lo);
atomic_dec(&lo->plh_outstanding);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1582,7 +1630,7 @@ out:
"(%s, offset: %llu, length: %llu)\n",
__func__, ino->i_sb->s_id,
(unsigned long long)NFS_FILEID(ino),
- lseg == NULL ? "not found" : "found",
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
iomode==IOMODE_RW ? "read/write" : "read-only",
(unsigned long long)pos,
(unsigned long long)count);
@@ -1593,6 +1641,26 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+ switch (range->iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ break;
+ default:
+ return false;
+ }
+ if (range->offset == NFS4_MAX_UINT64)
+ return false;
+ if (range->length == 0)
+ return false;
+ if (range->length != NFS4_MAX_UINT64 &&
+ range->length > NFS4_MAX_UINT64 - range->offset)
+ return false;
+ return true;
+}
+
struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
@@ -1601,7 +1669,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
LIST_HEAD(free_me);
- int status = 0;
+ int status = -EINVAL;
+
+ if (!pnfs_sanity_check_layout_range(&res->range))
+ goto out;
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
@@ -1619,12 +1690,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
lseg->pls_range = res->range;
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
- dprintk("%s forget reply due to recall\n", __func__);
- goto out_forget_reply;
- }
-
- if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+ if (pnfs_layoutgets_blocked(lo)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
@@ -1633,6 +1699,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
+ status = -EAGAIN;
goto out_forget_reply;
}
pnfs_set_layout_stateid(lo, &res->stateid, false);
@@ -1651,12 +1718,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
pnfs_get_lseg(lseg);
- pnfs_layout_insert_lseg(lo, lseg);
+ pnfs_layout_insert_lseg(lo, lseg, &free_me);
- if (res->return_on_close) {
+ if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
- set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
- }
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me);
@@ -1672,16 +1737,29 @@ out_forget_reply:
}
static void
+pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+{
+ if (lo->plh_return_iomode == iomode)
+ return;
+ if (lo->plh_return_iomode != 0)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+}
+
+int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *return_range)
+ const struct pnfs_layout_range *return_range)
{
struct pnfs_layout_segment *lseg, *next;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
if (list_empty(&lo->plh_segs))
- return;
+ return 0;
+
+ assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(&lseg->pls_range, return_range)) {
@@ -1691,36 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg->pls_range.offset,
lseg->pls_range.length);
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- mark_lseg_invalid(lseg, tmp_list);
+ pnfs_set_plh_return_iomode(lo, return_range->iomode);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
+ set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags);
}
+ return remaining;
}
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
struct pnfs_layout_range range = {
.iomode = lseg->pls_range.iomode,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
LIST_HEAD(free_me);
+ bool return_now = false;
spin_lock(&inode->i_lock);
- /* set failure bit so that pnfs path will be retried later */
- pnfs_layout_set_fail_bit(lo, iomode);
- if (lo->plh_return_iomode == 0)
- lo->plh_return_iomode = range.iomode;
- else if (lo->plh_return_iomode != range.iomode)
- lo->plh_return_iomode = IOMODE_ANY;
+ pnfs_set_plh_return_iomode(lo, range.iomode);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
- spin_unlock(&inode->i_lock);
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode = lo->plh_return_iomode;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ return_now = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (return_now)
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ } else {
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ }
pnfs_free_lseg_list(&free_me);
}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -1742,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
rd_size,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
}
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
@@ -1754,13 +1848,19 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- if (pgio->pg_lseg == NULL)
+ if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
wb_size,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_write_mds(pgio);
@@ -1858,12 +1958,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
- if (!hdr->pnfs_error) {
+ if (likely(!hdr->pnfs_error)) {
pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
hdr->mds_offset + hdr->res.count);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_write_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -1927,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
@@ -1974,11 +2073,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
__nfs4_read_done_cb(hdr);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_read_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -2057,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
@@ -2267,7 +2365,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
#if IS_ENABLED(CONFIG_NFS_V4_2)
int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2392,7 @@ pnfs_report_layoutstat(struct inode *inode)
pnfs_get_layout_hdr(hdr);
spin_unlock(&inode->i_lock);
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), gfp_flags);
if (!data) {
status = -ENOMEM;
goto out_put;
@@ -2324,3 +2422,7 @@ out_put:
}
EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
#endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3e6ab7b..9f4e2a4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,12 +94,10 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
- NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
- NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
};
enum layoutdriver_policy_flags {
@@ -129,6 +127,9 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me);
void (*return_range) (struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range);
@@ -184,15 +185,15 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
+ atomic_t plh_outstanding; /* number of RPCs out */
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_destroy;
struct list_head plh_segs; /* layout segments list */
- nfs4_stateid plh_stateid;
- atomic_t plh_outstanding; /* number of RPCs out */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
- u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_retry_timestamp;
unsigned long plh_flags;
+ nfs4_stateid plh_stateid;
+ u32 plh_barrier; /* ignore lower seqids */
enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -259,15 +260,19 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task);
void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -286,6 +291,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
gfp_t gfp_flags);
void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *old),
+ bool (*do_merge)(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old),
+ struct list_head *free_me);
+
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
@@ -368,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
return d;
}
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
- atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
- atomic_dec(&lo->plh_refcount);
- /* wake up waiters for LAYOUTRETURN as that is not needed */
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- }
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -398,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
return lseg;
}
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+ return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0;
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
@@ -529,12 +528,51 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
nfss->pnfs_curr_ld->id == src->l_type);
}
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+ if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+ return NFS4_MAX_UINT64;
+ return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+ if (end == NFS4_MAX_UINT64 || end <= offset)
+ return NFS4_MAX_UINT64;
+ return 1 + end - offset;
+}
+
+/**
+ * pnfs_mark_layout_returned_if_empty - marks the layout as returned
+ * @lo: layout header
+ *
+ * Note: Caller must hold inode->i_lock
+ */
+static inline void
+pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
+{
+ if (list_empty(&lo->plh_segs))
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+ const struct pnfs_layout_range *src)
+{
+ memcpy(dst, src, sizeof(*dst));
+}
+
+extern unsigned int layoutstats_timer;
+
#ifdef NFS_DEBUG
void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
#else
static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
{
}
+
#endif /* NFS_DEBUG */
#else /* CONFIG_NFS_V4_1 */
@@ -605,8 +643,13 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
}
+static inline void
+pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
+{
+}
+
static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
{
return false;
}
@@ -691,10 +734,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
#endif /* CONFIG_NFS_V4_1 */
#if IS_ENABLED(CONFIG_NFS_V4_2)
-int pnfs_report_layoutstat(struct inode *inode);
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
#else
static inline int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
return 0;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f37e25b..81ac648 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
- bucket->clseg = bucket->wlseg;
- if (list_empty(src))
+ if (bucket->clseg == NULL)
+ bucket->clseg = pnfs_get_lseg(bucket->wlseg);
+ if (list_empty(src)) {
+ pnfs_put_lseg_locked(bucket->wlseg);
bucket->wlseg = NULL;
- else
- pnfs_get_lseg(bucket->clseg);
+ }
}
return ret;
}
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
+ LIST_HEAD(pages);
int i;
+ spin_lock(cinfo->lock);
for (i = idx; i < fl_cinfo->nbuckets; i++) {
bucket = &fl_cinfo->buckets[i];
if (list_empty(&bucket->committing))
continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
- spin_lock(cinfo->lock);
freeme = bucket->clseg;
bucket->clseg = NULL;
+ list_splice_init(&bucket->committing, &pages);
spin_unlock(cinfo->lock);
+ nfs_retry_commit(&pages, freeme, cinfo, i);
pnfs_put_lseg(freeme);
+ spin_lock(cinfo->lock);
}
+ spin_unlock(cinfo->lock);
}
static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
if (!data)
break;
data->ds_commit_index = i;
- spin_lock(cinfo->lock);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
list_add(&data->pages, list);
nreq++;
}
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
return nreq;
}
+static inline
+void pnfs_fetch_commit_bucket_list(struct list_head *pages,
+ struct nfs_commit_data *data,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket;
+
+ bucket = &cinfo->ds->buckets[data->ds_commit_index];
+ spin_lock(cinfo->lock);
+ list_splice_init(&bucket->committing, pages);
+ data->lseg = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+
+}
+
/* This follows nfs_commit_list pretty closely */
int
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,41 +260,35 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc();
if (data != NULL) {
- data->lseg = NULL;
+ data->ds_commit_index = -1;
list_add(&data->pages, &list);
nreq++;
} else {
nfs_retry_commit(mds_pages, NULL, cinfo, 0);
pnfs_generic_retry_commit(cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
}
nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
- if (nreq == 0) {
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ if (nreq == 0)
goto out;
- }
atomic_add(nreq, &cinfo->mds->rpcs_out);
list_for_each_entry_safe(data, tmp, &list, pages) {
list_del_init(&data->pages);
- if (!data->lseg) {
+ if (data->ds_commit_index < 0) {
nfs_init_commit(data, mds_pages, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how, 0);
} else {
- struct pnfs_commit_bucket *buckets;
+ LIST_HEAD(pages);
- buckets = cinfo->ds->buckets;
- nfs_init_commit(data,
- &buckets[data->ds_commit_index].committing,
- data->lseg,
- cinfo);
+ pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+ nfs_init_commit(data, &pages, data->lseg, cinfo);
initiate_commit(data, how);
}
}
@@ -359,26 +370,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
return false;
}
+/*
+ * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
+ * declare a match.
+ */
static bool
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
const struct list_head *dsaddrs2)
{
struct nfs4_pnfs_ds_addr *da1, *da2;
-
- /* step through both lists, comparing as we go */
- for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
- da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
- da1 != NULL && da2 != NULL;
- da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
- da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
- if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr))
- return false;
+ struct sockaddr *sa1, *sa2;
+ bool match = false;
+
+ list_for_each_entry(da1, dsaddrs1, da_node) {
+ sa1 = (struct sockaddr *)&da1->da_addr;
+ match = false;
+ list_for_each_entry(da2, dsaddrs2, da_node) {
+ sa2 = (struct sockaddr *)&da2->da_addr;
+ match = same_sockaddr(sa1, sa2);
+ if (match)
+ break;
+ }
+ if (!match)
+ break;
}
- if (da1 == NULL && da2 == NULL)
- return true;
-
- return false;
+ return match;
}
/*
@@ -852,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
+ if (!pnfs_is_valid_lseg(lseg)) {
+ spin_unlock(cinfo->lock);
+ cinfo->completion_ops->resched_write(cinfo, req);
+ return;
+ }
/* Non-empty buckets hold a reference on the lseg. That ref
* is normally transferred to the COMMIT call and released
* there. It could also be released if the last req is pulled
@@ -863,9 +884,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
- spin_unlock(cinfo->lock);
- nfs_request_add_commit_list(req, list, cinfo);
+ nfs_request_add_commit_list_locked(req, list, cinfo);
+ spin_unlock(cinfo->lock);
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index ae0ff7a..eb31e23 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -72,6 +72,9 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *mirror;
+ if (pgio->pg_ops && pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+
pgio->pg_ops = &nfs_pgio_rw_ops;
/* read path should never have more than one mirror */
@@ -82,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+static void nfs_readpage_release(struct nfs_page *req)
+{
+ struct inode *inode = d_inode(req->wb_context->dentry);
+
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+ (long long)req_offset(req));
+
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(inode, req->wb_page, 0);
+
+ unlock_page(req->wb_page);
+ }
+ nfs_release_request(req);
+}
+
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
@@ -103,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
- nfs_pageio_add_request(&pgio, new);
+ if (!nfs_pageio_add_request(&pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
+ }
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
@@ -112,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
- struct inode *inode = d_inode(req->wb_context->dentry);
-
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
- (long long)req_offset(req));
-
- if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
- unlock_page(req->wb_page);
- }
- nfs_release_request(req);
+ return pgio.pg_error < 0 ? pgio.pg_error : 0;
}
static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -243,6 +249,13 @@ static void nfs_readpage_retry(struct rpc_task *task,
nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Yes, so retry the read at the end of the hdr */
hdr->mds_offset += resp->count;
argp->offset += resp->count;
@@ -265,7 +278,7 @@ static void nfs_readpage_result(struct rpc_task *task,
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (hdr->res.count != hdr->args.count)
+ } else if (hdr->res.count < hdr->args.count)
nfs_readpage_retry(task, hdr);
}
@@ -351,6 +364,8 @@ readpage_async_filler(void *data, struct page *page)
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
error = desc->pgio->pg_error;
goto out_unlock;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aa62004..f126828 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- register_shrinker(&acl_shrinker);
+ ret = register_shrinker(&acl_shrinker);
+ if (ret < 0)
+ goto error_3;
return 0;
-
+error_3:
+ nfs_unregister_sysctl();
error_2:
unregister_nfs4_fs();
error_1:
@@ -2813,7 +2816,6 @@ out_invalid_transport_udp:
* NFS client for backwards compatibility
*/
unsigned int nfs_callback_set_tcpport;
-unsigned short nfs_callback_tcpport;
/* Default cache timeout is 10 minutes */
unsigned int nfs_idmap_cache_timeout = 600;
/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
@@ -2824,7 +2826,6 @@ char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
-EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
EXPORT_SYMBOL_GPL(max_session_slots);
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index b6de433..4fe3eea 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -42,21 +42,35 @@ error:
return -EIO;
}
-static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *nfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct page *page;
void *err;
- err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
- if (err)
- return err;
- page = read_cache_page(&inode->i_data, 0,
- (filler_t *)nfs_symlink_filler, inode);
- if (IS_ERR(page))
- return ERR_CAST(page);
- *cookie = page;
- return kmap(page);
+ if (!dentry) {
+ err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
+ if (err)
+ return err;
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+ if (err)
+ return err;
+ page = read_cache_page(&inode->i_data, 0,
+ (filler_t *)nfs_symlink_filler, inode);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ }
+ set_delayed_call(done, page_put_link, page);
+ return page_address(page);
}
/*
@@ -64,8 +78,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
*/
const struct inode_operations nfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = nfs_follow_link,
- .put_link = page_put_link,
+ .get_link = nfs_get_link,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
};
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75a35a1..ce43cd6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -21,6 +21,8 @@
#include <linux/nfs_page.h>
#include <linux/backing-dev.h>
#include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
#include <asm/uaccess.h>
@@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc)
{
int ret = 0;
if (wbc->for_reclaim)
- return FLUSH_HIGHPRI | FLUSH_STABLE;
+ return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
if (wbc->sync_mode == WB_SYNC_ALL)
ret = FLUSH_COND_STABLE;
- if (wbc->for_kupdate || wbc->for_background)
- ret |= FLUSH_LOWPRI;
return ret;
}
@@ -545,12 +545,22 @@ try_again:
return head;
}
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+ nfs_unlock_request(req);
+ nfs_end_page_writeback(req);
+ nfs_release_request(req);
+ generic_error_remove_page(page_file_mapping(req->wb_page),
+ req->wb_page);
+}
+
/*
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock)
+ struct page *page, bool nonblock,
+ bool launder)
{
struct nfs_page *req;
int ret = 0;
@@ -567,23 +577,36 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
- nfs_redirty_request(req);
ret = pgio->pg_error;
- }
+ /*
+ * Remove the problematic req upon fatal errors
+ * in launder case, while other dirty pages can
+ * still be around until they get flushed.
+ */
+ if (nfs_error_is_fatal(ret)) {
+ nfs_context_set_write_error(req->wb_context, ret);
+ if (launder) {
+ nfs_write_error_remove_page(req);
+ goto out;
+ }
+ }
+ nfs_redirty_request(req);
+ ret = -EAGAIN;
+ } else
+ nfs_add_stats(page_file_mapping(page)->host,
+ NFSIOS_WRITEPAGES, 1);
out:
return ret;
}
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio, bool launder)
{
- struct inode *inode = page_file_mapping(page)->host;
int ret;
- nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
-
nfs_pageio_cond_complete(pgio, page_file_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+ launder);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -594,14 +617,18 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
/*
* Write an mmapped page to the server.
*/
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+ struct writeback_control *wbc,
+ bool launder)
{
struct nfs_pageio_descriptor pgio;
+ struct inode *inode = page_file_mapping(page)->host;
int err;
- nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+ nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+ nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio);
+ err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc);
+ ret = nfs_writepage_locked(page, wbc, false);
unlock_page(page);
return ret;
}
@@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data);
+ ret = nfs_do_writepage(page, wbc, data, false);
unlock_page(page);
return ret;
}
@@ -768,6 +795,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
}
/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold the cinfo->lock, and the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ set_bit(PG_CLEAN, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ cinfo->mds->ncommit++;
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
+/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
* @dst: commit list head
@@ -784,13 +833,10 @@ void
nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo)
{
- set_bit(PG_CLEAN, &(req)->wb_flags);
spin_lock(cinfo->lock);
- nfs_list_add_request(req, dst);
- cinfo->mds->ncommit++;
+ nfs_request_add_commit_list_locked(req, dst, cinfo);
spin_unlock(cinfo->lock);
- if (!cinfo->dreq)
- nfs_mark_page_unstable(req->wb_page);
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -1109,7 +1155,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
if (req == NULL)
return 0;
l_ctx = req->wb_lock_context;
- do_flush = req->wb_page != page || req->wb_context != ctx;
+ do_flush = req->wb_page != page ||
+ !nfs_match_open_context(req->wb_context, ctx);
/* for now, flush if more than 1 request in page_group */
do_flush |= req->wb_this_page != req;
if (l_ctx && flctx &&
@@ -1204,7 +1251,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino
return 1;
if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
list_empty_careful(&flctx->flc_posix)))
- return 0;
+ return 1;
/* Check to see if there are whole file write locks */
ret = 0;
@@ -1307,9 +1354,15 @@ static void nfs_async_write_error(struct list_head *head)
}
}
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ nfs_async_write_error(&hdr->pages);
+}
+
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
.error_cleanup = nfs_async_write_error,
.completion = nfs_write_completion,
+ .reschedule_io = nfs_async_write_reschedule_io,
};
void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -1332,6 +1385,9 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *mirror;
+ if (pgio->pg_ops && pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+
pgio->pg_ops = &nfs_pgio_rw_ops;
nfs_pageio_stop_mirroring(pgio);
@@ -1483,6 +1539,13 @@ static void nfs_writeback_result(struct rpc_task *task,
task->tk_status = -EIO;
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
@@ -1500,27 +1563,21 @@ static void nfs_writeback_result(struct rpc_task *task,
}
}
-
-static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
{
- int ret;
+ return wait_on_atomic_t(&cinfo->rpcs_out,
+ nfs_wait_atomic_killable, TASK_KILLABLE);
+}
- if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
- return 1;
- if (!may_wait)
- return 0;
- ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- return (ret < 0) ? ret : 1;
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+ atomic_inc(&cinfo->rpcs_out);
}
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
- clear_bit(NFS_INO_COMMIT, &nfsi->flags);
- smp_mb__after_atomic();
- wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+ if (atomic_dec_and_test(&cinfo->rpcs_out))
+ wake_up_atomic_t(&cinfo->rpcs_out);
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1637,6 +1694,13 @@ void nfs_retry_commit(struct list_head *page_list,
}
EXPORT_SYMBOL_GPL(nfs_retry_commit);
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
+{
+ __set_page_dirty_nobuffers(req->wb_page);
+}
+
/*
* Commit dirty pages
*/
@@ -1658,7 +1722,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
data->mds_ops, how, 0);
out_bad:
nfs_retry_commit(head, NULL, cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
@@ -1720,8 +1783,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
- if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
- nfs_commit_clear_lock(NFS_I(data->inode));
+ nfs_commit_end(cinfo.mds);
}
static void nfs_commit_release(void *calldata)
@@ -1740,7 +1802,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
.completion = nfs_commit_release_pages,
- .error_cleanup = nfs_commit_clear_lock,
+ .resched_write = nfs_commit_resched_write,
};
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
@@ -1759,30 +1821,25 @@ int nfs_commit_inode(struct inode *inode, int how)
LIST_HEAD(head);
struct nfs_commit_info cinfo;
int may_wait = how & FLUSH_SYNC;
+ int error = 0;
int res;
- res = nfs_commit_set_lock(NFS_I(inode), may_wait);
- if (res <= 0)
- goto out_mark_dirty;
nfs_init_cinfo_from_inode(&cinfo, inode);
+ nfs_commit_begin(cinfo.mds);
res = nfs_scan_commit(inode, &head, &cinfo);
- if (res) {
- int error;
-
+ if (res)
error = nfs_generic_commit_list(inode, &head, how, &cinfo);
- if (error < 0)
- return error;
- if (!may_wait)
- goto out_mark_dirty;
- error = wait_on_bit_action(&NFS_I(inode)->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- if (error < 0)
- return error;
- } else
- nfs_commit_clear_lock(NFS_I(inode));
+ nfs_commit_end(cinfo.mds);
+ if (error < 0)
+ goto out_error;
+ if (!may_wait)
+ goto out_mark_dirty;
+ error = wait_on_commit(cinfo.mds);
+ if (error < 0)
+ return error;
return res;
+out_error:
+ res = error;
/* Note: If we exit without ensuring that the commit is complete,
* we must mark the inode as dirty. Otherwise, future calls to
* sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
@@ -1792,8 +1849,9 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return res;
}
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct nfs_inode *nfsi = NFS_I(inode);
int flags = FLUSH_SYNC;
@@ -1828,11 +1886,6 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return nfs_commit_unstable_pages(inode, wbc);
-}
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
@@ -1887,7 +1940,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
{
loff_t range_start = page_file_offset(page);
loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
@@ -1904,7 +1957,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc);
+ ret = nfs_writepage_locked(page, &wbc, launder);
if (ret < 0)
goto out_error;
continue;
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index ae6e58e..fd8c9a5 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -63,14 +63,33 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
* lock reclaims.
*/
int
-locks_in_grace(struct net *net)
+__state_in_grace(struct net *net, bool open)
{
struct list_head *grace_list = net_generic(net, grace_net_id);
+ struct lock_manager *lm;
- return !list_empty(grace_list);
+ if (!open)
+ return !list_empty(grace_list);
+
+ list_for_each_entry(lm, grace_list, list) {
+ if (lm->block_opens)
+ return true;
+ }
+ return false;
+}
+
+int locks_in_grace(struct net *net)
+{
+ return __state_in_grace(net, 0);
}
EXPORT_SYMBOL_GPL(locks_in_grace);
+int opens_in_grace(struct net *net)
+{
+ return __state_in_grace(net, 1);
+}
+EXPORT_SYMBOL_GPL(opens_in_grace);
+
static int __net_init
grace_init_net(struct net *net)
{
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index cdefaa3..c29d942 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -56,14 +56,6 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
u32 device_generation = 0;
int error;
- /*
- * We do not attempt to support I/O smaller than the fs block size,
- * or not aligned to it.
- */
- if (args->lg_minlength < block_size) {
- dprintk("pnfsd: I/O too small\n");
- goto out_layoutunavailable;
- }
if (seg->offset & (block_size - 1)) {
dprintk("pnfsd: I/O misaligned\n");
goto out_layoutunavailable;
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9aa2796..6d834dc 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
}
nr_iomaps = be32_to_cpup(p++);
- expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+ expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
__func__, len, expected);
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index fdc7903..6de925f 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -7,13 +7,6 @@
struct iomap;
struct xdr_stream;
-enum pnfs_block_extent_state {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2,
- PNFS_BLOCK_NONE_DATA = 3,
-};
-
struct pnfs_block_extent {
struct nfsd4_deviceid vol_id;
u64 foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
u64 soff;
enum pnfs_block_extent_state es;
};
-#define NFS4_BLOCK_EXTENT_SIZE 44
-
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f79521a..b4d84b5 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1075,73 +1075,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
return rv;
}
-/* Iterator */
-
-static void *e_start(struct seq_file *m, loff_t *pos)
- __acquires(((struct cache_detail *)m->private)->hash_lock)
-{
- loff_t n = *pos;
- unsigned hash, export;
- struct cache_head *ch;
- struct cache_detail *cd = m->private;
- struct cache_head **export_table = cd->hash_table;
-
- read_lock(&cd->hash_lock);
- if (!n--)
- return SEQ_START_TOKEN;
- hash = n >> 32;
- export = n & ((1LL<<32) - 1);
-
-
- for (ch=export_table[hash]; ch; ch=ch->next)
- if (!export--)
- return ch;
- n &= ~((1LL<<32) - 1);
- do {
- hash++;
- n += 1LL<<32;
- } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL);
- if (hash >= EXPORT_HASHMAX)
- return NULL;
- *pos = n+1;
- return export_table[hash];
-}
-
-static void *e_next(struct seq_file *m, void *p, loff_t *pos)
-{
- struct cache_head *ch = p;
- int hash = (*pos >> 32);
- struct cache_detail *cd = m->private;
- struct cache_head **export_table = cd->hash_table;
-
- if (p == SEQ_START_TOKEN)
- hash = 0;
- else if (ch->next == NULL) {
- hash++;
- *pos += 1LL<<32;
- } else {
- ++*pos;
- return ch->next;
- }
- *pos &= ~((1LL<<32) - 1);
- while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) {
- hash++;
- *pos += 1LL<<32;
- }
- if (hash >= EXPORT_HASHMAX)
- return NULL;
- ++*pos;
- return export_table[hash];
-}
-
-static void e_stop(struct seq_file *m, void *p)
- __releases(((struct cache_detail *)m->private)->hash_lock)
-{
- struct cache_detail *cd = m->private;
-
- read_unlock(&cd->hash_lock);
-}
-
static struct flags {
int flag;
char *name[2];
@@ -1270,9 +1203,9 @@ static int e_show(struct seq_file *m, void *p)
}
const struct seq_operations nfs_exports_op = {
- .start = e_start,
- .next = e_next,
- .stop = e_stop,
+ .start = cache_seq_start,
+ .next = cache_seq_next,
+ .stop = cache_seq_stop,
.show = e_show,
};
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 1f52bfc..2e31507 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -6,6 +6,7 @@
#include <linux/sunrpc/cache.h>
#include <uapi/linux/nfsd/export.h>
+#include <linux/nfs4.h>
struct knfsd_fh;
struct svc_fh;
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index a3f3490..23cc85d 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -37,9 +37,7 @@
#include <linux/in.h>
#include <linux/sunrpc/svc.h>
-
-/* XXX from linux/nfs_idmap.h */
-#define IDMAP_NAMESZ 128
+#include <linux/nfs_idmap.h>
#ifdef CONFIG_NFSD_V4
int nfsd_idmap_init(struct net *);
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 77e7a5c..1a03bc3 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -58,7 +58,7 @@ nlm_fclose(struct file *filp)
fput(filp);
}
-static struct nlmsvc_binding nfsd_nlm_ops = {
+static const struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
};
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ea6749a..5fbf3bb 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -92,7 +92,7 @@ struct nfsd_net {
struct file *rec_file;
bool in_grace;
- struct nfsd4_client_tracking_ops *client_tracking_ops;
+ const struct nfsd4_client_tracking_ops *client_tracking_ops;
time_t nfsd4_lease;
time_t nfsd4_grace;
@@ -110,6 +110,7 @@ struct nfsd_net {
unsigned int max_connections;
u32 clientid_counter;
+ u32 clverifier_counter;
struct svc_serv *nfsd_serv;
};
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index d54701f..1580ea6 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -44,13 +44,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
inode = d_inode(fh->fh_dentry);
- if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+ if (argp->mask & ~NFS_ACL_MASK)
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
nfserr = fh_getattr(fh, &resp->stat);
if (nfserr)
- goto fail;
+ RETURN_STATUS(nfserr);
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
acl = get_acl(inode, ACL_TYPE_ACCESS);
@@ -202,7 +202,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
if (!p)
return 0;
argp->mask = ntohl(*p++);
- if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+ if (argp->mask & ~NFS_ACL_MASK ||
!xdr_argsize_check(rqstp, p))
return 0;
@@ -293,9 +293,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
resp->acl_default,
resp->mask & NFS_DFACL,
NFS_ACL_DEFAULT);
- if (n <= 0)
- return 0;
- return 1;
+ return (n > 0);
}
static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 882b1a1..01df4cd 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -41,7 +41,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
inode = d_inode(fh->fh_dentry);
- if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+ if (argp->mask & ~NFS_ACL_MASK)
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
@@ -148,7 +148,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
if (!p)
return 0;
args->mask = ntohl(*p++);
- if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+ if (args->mask & ~NFS_ACL_MASK ||
!xdr_argsize_check(rqstp, p))
return 0;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cba..2246454 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp)
err = fh_getattr(fhp, &fhp->fh_post_attr);
fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
if (err) {
- fhp->fh_post_saved = 0;
+ fhp->fh_post_saved = false;
/* Grab the ctime anyway - set_change_info might use it */
fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
} else
- fhp->fh_post_saved = 1;
+ fhp->fh_post_saved = true;
}
/*
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index eb5accf..6adabd6 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,8 +34,10 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/nfs_fs.h>
+#include <linux/posix_acl.h>
+
#include "nfsfh.h"
#include "nfsd.h"
#include "acl.h"
@@ -100,7 +102,7 @@ deny_mask_from_posix(unsigned short perm, u32 flags)
/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
* side of being more restrictive, so the mode bit mapping below is
* pessimistic. An optimistic version would be needed to handle DENY's,
- * but we espect to coalesce all ALLOWs and DENYs before mapping to mode
+ * but we expect to coalesce all ALLOWs and DENYs before mapping to mode
* bits. */
static void
@@ -458,7 +460,7 @@ init_state(struct posix_acl_state *state, int cnt)
state->empty = 1;
/*
* In the worst case, each individual acl could be for a distinct
- * named user or group, but we don't no which, so we allocate
+ * named user or group, but we don't know which, so we allocate
* enough space for either:
*/
alloc = sizeof(struct posix_ace_state_array)
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a492018..7389cb1 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -435,12 +435,12 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
*/
status = 0;
out:
- if (status)
- nfsd4_mark_cb_fault(cb->cb_clp, status);
+ cb->cb_seq_status = status;
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
- return -EIO;
+ status = -EIO;
+ goto out;
}
static int decode_cb_sequence4res(struct xdr_stream *xdr,
@@ -451,11 +451,10 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
if (cb->cb_minorversion == 0)
return 0;
- status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status);
- if (unlikely(status || cb->cb_status))
+ status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
+ if (unlikely(status || cb->cb_seq_status))
return status;
- cb->cb_update_seq_nr = true;
return decode_cb_sequence4resok(xdr, cb);
}
@@ -527,7 +526,7 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
if (cb != NULL) {
status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_status))
+ if (unlikely(status || cb->cb_seq_status))
return status;
}
@@ -617,7 +616,7 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
if (cb) {
status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_status))
+ if (unlikely(status || cb->cb_seq_status))
return status;
}
return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
@@ -793,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_DOWN;
warn_no_callback_path(clp, reason);
}
static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_FAULT;
warn_no_callback_path(clp, reason);
}
@@ -876,7 +879,11 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
- cb->cb_update_seq_nr = false;
+ /*
+ * cb_seq_status is only set in decode_cb_sequence4res,
+ * and so will remain 1 if an rpc level failure occurs.
+ */
+ cb->cb_seq_status = 1;
cb->cb_status = 0;
if (minorversion) {
if (!nfsd41_cb_get_slot(clp, task))
@@ -885,15 +892,30 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
rpc_call_start(task);
}
-static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb)
{
- struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
+ struct nfsd4_session *session = clp->cl_cb_session;
+ bool ret = true;
- dprintk("%s: minorversion=%d\n", __func__,
- clp->cl_minorversion);
+ if (!clp->cl_minorversion) {
+ /*
+ * If the backchannel connection was shut down while this
+ * task was queued, we need to resubmit it after setting up
+ * a new backchannel connection.
+ *
+ * Note that if we lost our callback connection permanently
+ * the submission code will error out, so we don't need to
+ * handle that case here.
+ */
+ if (task->tk_flags & RPC_TASK_KILLED)
+ goto need_restart;
- if (clp->cl_minorversion) {
+ return true;
+ }
+
+ switch (cb->cb_seq_status) {
+ case 0:
/*
* No need for lock, access serialized in nfsd4_cb_prepare
*
@@ -901,29 +923,63 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
* If CB_SEQUENCE returns an error, then the state of the slot
* (sequence ID, cached reply) MUST NOT change.
*/
- if (cb->cb_update_seq_nr)
- ++clp->cl_cb_session->se_cb_seq_nr;
-
- clear_bit(0, &clp->cl_cb_slot_busy);
- rpc_wake_up_next(&clp->cl_cb_waitq);
- dprintk("%s: freed slot, new seqid=%d\n", __func__,
- clp->cl_cb_session->se_cb_seq_nr);
+ ++session->se_cb_seq_nr;
+ break;
+ case -ESERVERFAULT:
+ ++session->se_cb_seq_nr;
+ case 1:
+ case -NFS4ERR_BADSESSION:
+ nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+ ret = false;
+ break;
+ case -NFS4ERR_DELAY:
+ if (!rpc_restart_call(task))
+ goto out;
+
+ rpc_delay(task, 2 * HZ);
+ return false;
+ case -NFS4ERR_BADSLOT:
+ goto retry_nowait;
+ case -NFS4ERR_SEQ_MISORDERED:
+ if (session->se_cb_seq_nr != 1) {
+ session->se_cb_seq_nr = 1;
+ goto retry_nowait;
+ }
+ break;
+ default:
+ dprintk("%s: unprocessed error %d\n", __func__,
+ cb->cb_seq_status);
}
- /*
- * If the backchannel connection was shut down while this
- * task was queued, we need to resubmit it after setting up
- * a new backchannel connection.
- *
- * Note that if we lost our callback connection permanently
- * the submission code will error out, so we don't need to
- * handle that case here.
- */
- if (task->tk_flags & RPC_TASK_KILLED) {
- task->tk_status = 0;
- cb->cb_need_restart = true;
+ clear_bit(0, &clp->cl_cb_slot_busy);
+ rpc_wake_up_next(&clp->cl_cb_waitq);
+ dprintk("%s: freed slot, new seqid=%d\n", __func__,
+ clp->cl_cb_session->se_cb_seq_nr);
+
+ if (task->tk_flags & RPC_TASK_KILLED)
+ goto need_restart;
+out:
+ return ret;
+retry_nowait:
+ if (rpc_restart_call_prepare(task))
+ ret = false;
+ goto out;
+need_restart:
+ task->tk_status = 0;
+ cb->cb_need_restart = true;
+ return false;
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_client *clp = cb->cb_clp;
+
+ dprintk("%s: minorversion=%d\n", __func__,
+ clp->cl_minorversion);
+
+ if (!nfsd4_cb_sequence_done(task, cb))
return;
- }
if (cb->cb_status) {
WARN_ON_ONCE(task->tk_status);
@@ -1091,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work)
}
void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
cb->cb_clp = clp;
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
@@ -1099,8 +1155,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_resp = cb;
cb->cb_ops = ops;
INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
+ cb->cb_seq_status = 1;
cb->cb_status = 0;
- cb->cb_update_seq_nr = false;
cb->cb_need_restart = false;
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index e1b3d3d..5b20577 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -59,9 +59,6 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
* that.
*/
-#define IDMAP_TYPE_USER 0
-#define IDMAP_TYPE_GROUP 1
-
struct ent {
struct cache_head h;
int type; /* User / Group */
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e4..ce2d010 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -22,7 +22,7 @@ struct nfs4_layout {
static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
@@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
INIT_LIST_HEAD(&ls->ls_perfile);
spin_lock_init(&ls->ls_lock);
INIT_LIST_HEAD(&ls->ls_layouts);
+ mutex_init(&ls->ls_mutex);
ls->ls_layout_type = layout_type;
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
status = nfserr_jukebox;
if (!ls)
goto out;
+ mutex_lock(&ls->ls_mutex);
} else {
ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
status = nfserr_bad_stateid;
+ mutex_lock(&ls->ls_mutex);
if (stateid->si_generation > stid->sc_stateid.si_generation)
- goto out_put_stid;
+ goto out_unlock_stid;
if (layout_type != ls->ls_layout_type)
- goto out_put_stid;
+ goto out_unlock_stid;
}
*lsp = ls;
return 0;
+out_unlock_stid:
+ mutex_unlock(&ls->ls_mutex);
out_put_stid:
nfs4_put_stid(stid);
out:
@@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
trace_layout_recall(&ls->ls_stid.sc_stateid);
atomic_inc(&ls->ls_stid.sc_count);
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
nfsd4_run_cb(&ls->ls_recall);
out_unlock:
@@ -406,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
list_add_tail(&new->lo_perstate, &ls->ls_layouts);
new = NULL;
done:
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
spin_unlock(&ls->ls_lock);
out:
spin_unlock(&fp->fi_lock);
@@ -481,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
}
if (!list_empty(&ls->ls_layouts)) {
- if (found) {
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
- sizeof(stateid_t));
- }
+ if (found)
+ nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
lrp->lrs_present = 1;
} else {
trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
@@ -494,6 +493,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
spin_unlock(&ls->ls_lock);
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
nfsd4_free_layouts(&reaplist);
return nfs_ok;
@@ -608,29 +608,55 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
}
}
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+ mutex_lock(&ls->ls_mutex);
+ nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+ mutex_unlock(&ls->ls_mutex);
+}
+
static int
nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfsd_net *nn;
+ ktime_t now, cutoff;
LIST_HEAD(reaplist);
+
switch (task->tk_status) {
case 0:
- return 1;
+ case -NFS4ERR_DELAY:
+ /*
+ * Anything left? If not, then call it done. Note that we don't
+ * take the spinlock since this is an optimization and nothing
+ * should get added until the cb counter goes to zero.
+ */
+ if (list_empty(&ls->ls_layouts))
+ return 1;
+
+ /* Poll the client until it's done with the layout */
+ now = ktime_get();
+ nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id);
+
+ /* Client gets 2 lease periods to return it */
+ cutoff = ktime_add_ns(task->tk_start,
+ nn->nfsd4_lease * NSEC_PER_SEC * 2);
+
+ if (ktime_before(now, cutoff)) {
+ rpc_delay(task, HZ/100); /* 10 mili-seconds */
+ return 0;
+ }
+ /* Fallthrough */
case -NFS4ERR_NOMATCHING_LAYOUT:
trace_layout_recall_done(&ls->ls_stid.sc_stateid);
task->tk_status = 0;
return 1;
- case -NFS4ERR_DELAY:
- /* Poll the client until it's done with the layout */
- /* FIXME: cap number of retries.
- * The pnfs standard states that we need to only expire
- * the client after at-least "lease time" .eg lease-time * 2
- * when failing to communicate a recall
- */
- rpc_delay(task, HZ/100); /* 10 mili-seconds */
- return 0;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
@@ -654,7 +680,8 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
nfs4_put_stid(&ls->ls_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .prepare = nfsd4_cb_layout_prepare,
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
};
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 90cfda7..819ad81 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -276,13 +276,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
/*
- * Following rfc 3530 14.2.16, use the returned bitmask
- * to indicate which attributes we used to store the
- * verifier:
+ * Following rfc 3530 14.2.16, and rfc 5661 18.16.4
+ * use the returned bitmask to indicate which attributes
+ * we used to store the verifier:
*/
- if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
- open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
- FATTR4_WORD1_TIME_MODIFY);
+ if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
+ open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+ FATTR4_WORD1_TIME_MODIFY);
} else
/*
* Note this may exit with the parent still locked.
@@ -362,7 +362,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
struct svc_fh *resfh = NULL;
- struct nfsd4_compoundres *resp;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -389,8 +388,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy_clientid(&open->op_clientid, cstate->session);
/* check seqid for replay. set nfs4_owner */
- resp = rqstp->rq_resp;
- status = nfsd4_process_open1(&resp->cstate, open, nn);
+ status = nfsd4_process_open1(cstate, open, nn);
if (status == nfserr_replay_me) {
struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
@@ -417,10 +415,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -776,8 +774,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
- status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid,
- RD_STATE, &read->rd_filp, &read->rd_tmp_file);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &read->rd_stateid, RD_STATE,
+ &read->rd_filp, &read->rd_tmp_file);
if (status) {
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
@@ -829,7 +828,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- if (locks_in_grace(SVC_NET(rqstp)))
+ if (opens_in_grace(SVC_NET(rqstp)))
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
@@ -848,7 +847,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!cstate->save_fh.fh_dentry)
return status;
- if (locks_in_grace(SVC_NET(rqstp)) &&
+ if (opens_in_grace(SVC_NET(rqstp)) &&
!(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -923,7 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
status = nfs4_preprocess_stateid_op(rqstp, cstate,
- &setattr->sa_stateid, WR_STATE, NULL, NULL);
+ &cstate->current_fh, &setattr->sa_stateid,
+ WR_STATE, NULL, NULL);
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -987,8 +987,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
- status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE,
- &filp, NULL);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ stateid, WR_STATE, &filp, NULL);
if (status) {
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
@@ -1012,13 +1012,54 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_clone *clone)
+{
+ struct file *src, *dst;
+ __be32 status;
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
+ &clone->cl_src_stateid, RD_STATE,
+ &src, NULL);
+ if (status) {
+ dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+ goto out;
+ }
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &clone->cl_dst_stateid, WR_STATE,
+ &dst, NULL);
+ if (status) {
+ dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+ goto out_put_src;
+ }
+
+ /* fix up for NFS-specific error code */
+ if (!S_ISREG(file_inode(src)->i_mode) ||
+ !S_ISREG(file_inode(dst)->i_mode)) {
+ status = nfserr_wrong_type;
+ goto out_put_dst;
+ }
+
+ status = nfsd4_clone_file_range(src, clone->cl_src_pos,
+ dst, clone->cl_dst_pos, clone->cl_count);
+
+out_put_dst:
+ fput(dst);
+out_put_src:
+ fput(src);
+out:
+ return status;
+}
+
+static __be32
nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
__be32 status = nfserr_notsupp;
struct file *file;
- status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
WR_STATE, &file, NULL);
if (status != nfs_ok) {
@@ -1057,7 +1098,7 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct file *file;
- status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
RD_STATE, &file, NULL);
if (status) {
@@ -1311,6 +1352,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
nfserr = nfsd4_insert_layout(lgp, ls);
out_put_stid:
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
out:
return nfserr;
@@ -1364,9 +1406,8 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
goto out;
}
- nfserr = ops->proc_layoutcommit(inode, lcp);
- if (nfserr)
- goto out_put_stid;
+ /* LAYOUTCOMMIT does not require any serialization */
+ mutex_unlock(&ls->ls_mutex);
if (new_size > i_size_read(inode)) {
lcp->lc_size_chg = 1;
@@ -1375,7 +1416,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
lcp->lc_size_chg = 0;
}
-out_put_stid:
+ nfserr = ops->proc_layoutcommit(inode, lcp);
nfs4_put_stid(&ls->ls_stid);
out:
return nfserr;
@@ -2281,6 +2322,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+ [OP_CLONE] = {
+ .op_func = (nfsd4op_func)nfsd4_clone,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_CLONE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index d88ea7b..79f0307 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -272,6 +272,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
.ctx.actor = nfsd4_build_namelist,
.names = LIST_HEAD_INIT(ctx.names)
};
+ struct name_list *entry, *tmp;
int status;
status = nfs4_save_creds(&original_cred);
@@ -286,9 +287,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
status = iterate_dir(nn->rec_file, &ctx.ctx);
mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
- while (!list_empty(&ctx.names)) {
- struct name_list *entry;
- entry = list_entry(ctx.names.next, struct name_list, list);
+
+ list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
if (!status) {
struct dentry *dentry;
dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
@@ -304,6 +304,12 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
mutex_unlock(&d_inode(dir)->i_mutex);
nfs4_reset_creds(original_cred);
+
+ list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
+ dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name);
+ list_del(&entry->list);
+ kfree(entry);
+ }
return status;
}
@@ -541,8 +547,7 @@ nfsd4_legacy_tracking_init(struct net *net)
/* XXX: The legacy code won't work in a container */
if (net != &init_net) {
- WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
- "tracking in a container!\n");
+ pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n");
return -EINVAL;
}
@@ -626,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
return -ENOENT;
}
-static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
.init = nfsd4_legacy_tracking_init,
.exit = nfsd4_legacy_tracking_exit,
.create = nfsd4_create_clid_dir,
@@ -1045,7 +1050,7 @@ out_err:
printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
}
-static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.init = nfsd4_init_cld_pipe,
.exit = nfsd4_remove_cld_pipe,
.create = nfsd4_cld_create,
@@ -1254,8 +1259,7 @@ nfsd4_umh_cltrack_init(struct net *net)
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
- WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
- "tracking in a container!\n");
+ pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
return -EINVAL;
}
@@ -1390,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
kfree(legacy);
}
-static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
.init = nfsd4_umh_cltrack_init,
.exit = NULL,
.create = nfsd4_umh_cltrack_create,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9520271..c484a2b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab;
static void free_session(struct nfsd4_session *);
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static bool is_session_dead(struct nfsd4_session *ses)
{
@@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
atomic_set(&stid->sc_count, 1);
+ spin_lock_init(&stid->sc_lock);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
@@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s)
put_nfs4_file(fp);
}
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+ stateid_t *src = &stid->sc_stateid;
+
+ spin_lock(&stid->sc_lock);
+ if (unlikely(++src->si_generation == 0))
+ src->si_generation = 1;
+ memcpy(dst, src, sizeof(*dst));
+ spin_unlock(&stid->sc_lock);
+}
+
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
struct file *filp = NULL;
@@ -765,25 +778,80 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
s->sc_type = 0;
}
-static void
+/**
+ * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * @clp: a pointer to the nfs4_client we're granting a delegation to
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: NULL if an existing delegation was not found.
+ *
+ * On error: -EAGAIN if one was previously granted to this nfs4_client
+ * for this nfs4_file.
+ *
+ */
+
+static int
+nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_delegation *searchdp = NULL;
+ struct nfs4_client *searchclp = NULL;
+
+ lockdep_assert_held(&state_lock);
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
+ searchclp = searchdp->dl_stid.sc_client;
+ if (clp == searchclp) {
+ return -EAGAIN;
+ }
+ }
+ return 0;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp: a pointer to the nfs4_delegation we are adding.
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: NULL if the delegation was successfully hashed.
+ *
+ * On error: -EAGAIN if one was previously granted to this
+ * nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
+ int status;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
lockdep_assert_held(&state_lock);
lockdep_assert_held(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ if (status)
+ return status;
+ ++fp->fi_delegees;
atomic_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+ list_add(&dp->dl_perclnt, &clp->cl_delegations);
+ return 0;
}
-static void
+static bool
unhash_delegation_locked(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
lockdep_assert_held(&state_lock);
+ if (list_empty(&dp->dl_perfile))
+ return false;
+
dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
/* Ensure that deleg break won't try to requeue it */
++dp->dl_time;
@@ -792,16 +860,21 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
list_del_init(&dp->dl_recall_lru);
list_del_init(&dp->dl_perfile);
spin_unlock(&fp->fi_lock);
+ return true;
}
static void destroy_delegation(struct nfs4_delegation *dp)
{
+ bool unhashed;
+
spin_lock(&state_lock);
- unhash_delegation_locked(dp);
+ unhashed = unhash_delegation_locked(dp);
spin_unlock(&state_lock);
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_deleg_lease(dp->dl_stid.sc_file);
- nfs4_put_stid(&dp->dl_stid);
+ if (unhashed) {
+ put_clnt_odstate(dp->dl_clnt_odstate);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+ nfs4_put_stid(&dp->dl_stid);
+ }
}
static void revoke_delegation(struct nfs4_delegation *dp)
@@ -990,6 +1063,12 @@ release_all_access(struct nfs4_ol_stateid *stp)
}
}
+static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop)
+{
+ kfree(sop->so_owner.data);
+ sop->so_ops->so_free(sop);
+}
+
static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
{
struct nfs4_client *clp = sop->so_client;
@@ -1000,20 +1079,23 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
return;
sop->so_ops->so_unhash(sop);
spin_unlock(&clp->cl_lock);
- kfree(sop->so_owner.data);
- sop->so_ops->so_free(sop);
+ nfs4_free_stateowner(sop);
}
-static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
+static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_file *fp = stp->st_stid.sc_file;
lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
+ if (list_empty(&stp->st_perfile))
+ return false;
+
spin_lock(&fp->fi_lock);
- list_del(&stp->st_perfile);
+ list_del_init(&stp->st_perfile);
spin_unlock(&fp->fi_lock);
list_del(&stp->st_perstateowner);
+ return true;
}
static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
@@ -1063,25 +1145,27 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
list_add(&stp->st_locks, reaplist);
}
-static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
+static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
list_del_init(&stp->st_locks);
- unhash_ol_stateid(stp);
nfs4_unhash_stid(&stp->st_stid);
+ return unhash_ol_stateid(stp);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+ bool unhashed;
spin_lock(&oo->oo_owner.so_client->cl_lock);
- unhash_lock_stateid(stp);
+ unhashed = unhash_lock_stateid(stp);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
- nfs4_put_stid(&stp->st_stid);
+ if (unhashed)
+ nfs4_put_stid(&stp->st_stid);
}
static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
@@ -1129,7 +1213,7 @@ static void release_lockowner(struct nfs4_lockowner *lo)
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
- unhash_lock_stateid(stp);
+ WARN_ON(!unhash_lock_stateid(stp));
put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
@@ -1142,21 +1226,26 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
{
struct nfs4_ol_stateid *stp;
+ lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock);
+
while (!list_empty(&open_stp->st_locks)) {
stp = list_entry(open_stp->st_locks.next,
struct nfs4_ol_stateid, st_locks);
- unhash_lock_stateid(stp);
+ WARN_ON(!unhash_lock_stateid(stp));
put_ol_stateid_locked(stp, reaplist);
}
}
-static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
+static bool unhash_open_stateid(struct nfs4_ol_stateid *stp,
struct list_head *reaplist)
{
+ bool unhashed;
+
lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
- unhash_ol_stateid(stp);
+ unhashed = unhash_ol_stateid(stp);
release_open_stateid_locks(stp, reaplist);
+ return unhashed;
}
static void release_open_stateid(struct nfs4_ol_stateid *stp)
@@ -1164,8 +1253,8 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp)
LIST_HEAD(reaplist);
spin_lock(&stp->st_stid.sc_client->cl_lock);
- unhash_open_stateid(stp, &reaplist);
- put_ol_stateid_locked(stp, &reaplist);
+ if (unhash_open_stateid(stp, &reaplist))
+ put_ol_stateid_locked(stp, &reaplist);
spin_unlock(&stp->st_stid.sc_client->cl_lock);
free_ol_stateid_reaplist(&reaplist);
}
@@ -1210,8 +1299,8 @@ static void release_openowner(struct nfs4_openowner *oo)
while (!list_empty(&oo->oo_owner.so_stateids)) {
stp = list_first_entry(&oo->oo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
- unhash_open_stateid(stp, &reaplist);
- put_ol_stateid_locked(stp, &reaplist);
+ if (unhash_open_stateid(stp, &reaplist))
+ put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
@@ -1714,7 +1803,7 @@ __destroy_client(struct nfs4_client *clp)
spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -1768,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+int strdup_if_nonnull(char **target, char *source)
{
- if (source->cr_principal) {
- target->cr_principal =
- kstrdup(source->cr_principal, GFP_KERNEL);
- if (target->cr_principal == NULL)
+ if (source) {
+ *target = kstrdup(source, GFP_KERNEL);
+ if (!*target)
return -ENOMEM;
} else
- target->cr_principal = NULL;
+ *target = NULL;
+ return 0;
+}
+
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+ int ret;
+
+ ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
+ if (ret)
+ return ret;
+ ret = strdup_if_nonnull(&target->cr_raw_principal,
+ source->cr_raw_principal);
+ if (ret)
+ return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
@@ -1880,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
return false;
if (!svc_rqst_integrity_protected(rqstp))
return false;
+ if (cl->cl_cred.cr_raw_principal)
+ return 0 == strcmp(cl->cl_cred.cr_raw_principal,
+ cr->cr_raw_principal);
if (!cr->cr_principal)
return false;
return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
@@ -1894,7 +1999,7 @@ static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
* __force to keep sparse happy
*/
verf[0] = (__force __be32)get_seconds();
- verf[1] = (__force __be32)nn->clientid_counter;
+ verf[1] = (__force __be32)nn->clverifier_counter++;
memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
}
@@ -2151,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
base = resp->cstate.data_offset;
slot->sl_datalen = buf->len - base;
if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
- WARN("%s: sessions DRC could not cache compound\n", __func__);
+ WARN(1, "%s: sessions DRC could not cache compound\n",
+ __func__);
return;
}
@@ -2232,15 +2338,23 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
clid->flags = new->cl_exchange_flags;
}
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+ struct nfs4_openowner *oo;
+
+ list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) {
+ if (!list_empty(&oo->oo_owner.so_stateids))
+ return true;
+ }
+ return false;
+}
+
static bool client_has_state(struct nfs4_client *clp)
{
- /*
- * Note clp->cl_openowners check isn't quite right: there's no
- * need to count owners without stateid's.
- *
- * Also note we should probably be using this in 4.0 case too.
- */
- return !list_empty(&clp->cl_openowners)
+ return client_has_openowners(clp)
+#ifdef CONFIG_NFSD_PNFS
+ || !list_empty(&clp->cl_lo_states)
+#endif
|| !list_empty(&clp->cl_delegations)
|| !list_empty(&clp->cl_sessions);
}
@@ -2268,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
+ new = create_client(exid->clname, rqstp, &verf);
+ if (new == NULL)
+ return nfserr_jukebox;
+
switch (exid->spa_how) {
case SP4_MACH_CRED:
- if (!svc_rqst_integrity_protected(rqstp))
- return nfserr_inval;
+ if (!svc_rqst_integrity_protected(rqstp)) {
+ status = nfserr_inval;
+ goto out_nolock;
+ }
+ /*
+ * Sometimes userspace doesn't give us a principal.
+ * Which is a bug, really. Anyway, we can't enforce
+ * MACH_CRED in that case, better to give up now:
+ */
+ if (!new->cl_cred.cr_principal &&
+ !new->cl_cred.cr_raw_principal) {
+ status = nfserr_serverfault;
+ goto out_nolock;
+ }
+ new->cl_mach_cred = true;
case SP4_NONE:
break;
default: /* checked by xdr code */
@@ -2280,10 +2411,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
return nfserr_encr_alg_unsupp;
}
- new = create_client(exid->clname, rqstp, &verf);
- if (new == NULL)
- return nfserr_jukebox;
-
/* Cases below refer to rfc 5661 section 18.35.4: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&exid->clname, nn);
@@ -2345,7 +2472,6 @@ out_new:
goto out;
}
new->cl_minorversion = cstate->minorversion;
- new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -2363,6 +2489,7 @@ out_copy:
out:
spin_unlock(&nn->client_lock);
+out_nolock:
if (new)
expire_client(new);
if (unconf)
@@ -2547,11 +2674,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
- if (status == nfserr_replay_cache) {
- status = nfsd4_replay_create_session(cr_ses, cs_slot);
- goto out_free_conn;
- } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
- status = nfserr_seq_misordered;
+ if (status) {
+ if (status == nfserr_replay_cache)
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
goto out_free_conn;
}
} else if (unconf) {
@@ -3024,7 +3149,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Cases below refer to rfc 3530 section 14.2.33: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&clname, nn);
- if (conf) {
+ if (conf && client_has_state(conf)) {
/* case 0: */
status = nfserr_clid_inuse;
if (clp_used_exchangeid(conf))
@@ -3041,10 +3166,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
unhash_client_locked(unconf);
- if (conf && same_verf(&conf->cl_verifier, &clverifier))
+ if (conf && same_verf(&conf->cl_verifier, &clverifier)) {
/* case 1: probable callback update */
copy_clid(new, conf);
- else /* case 4 (new client) or cases 2, 3 (client reboot): */
+ gen_confirm(new, nn);
+ } else /* case 4 (new client) or cases 2, 3 (client reboot): */
gen_clid(new, nn);
new->cl_minorversion = 0;
gen_callback(new, setclid, rqstp);
@@ -3085,10 +3211,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
/*
* We try hard to give out unique clientid's, so if we get an
* attempt to confirm the same clientid with a different cred,
- * there's a bug somewhere. Let's charitably assume it's our
- * bug.
+ * the client may be buggy; this should never happen.
+ *
+ * Nevertheless, RFC 7530 recommends INUSE for this case:
*/
- status = nfserr_serverfault;
+ status = nfserr_clid_inuse;
if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
goto out;
if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
@@ -3109,6 +3236,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
} else { /* case 3: normal case; new or rebooted client */
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
+ status = nfserr_clid_inuse;
+ if (client_has_state(old)
+ && !same_creds(&unconf->cl_cred,
+ &old->cl_cred))
+ goto out;
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -3290,6 +3422,27 @@ static const struct nfs4_stateowner_operations openowner_ops = {
.so_free = nfs4_free_openowner,
};
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+ struct nfs4_ol_stateid *local, *ret = NULL;
+ struct nfs4_openowner *oo = open->op_openowner;
+
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+ /* ignore lock owners */
+ if (local->st_stateowner->so_is_open_owner == 0)
+ continue;
+ if (local->st_stateowner == &oo->oo_owner) {
+ ret = local;
+ atomic_inc(&ret->st_stid.sc_count);
+ break;
+ }
+ }
+ return ret;
+}
+
static struct nfs4_openowner *
alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
struct nfsd4_compound_state *cstate)
@@ -3315,14 +3468,26 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
hash_openowner(oo, clp, strhashval);
ret = oo;
} else
- nfs4_free_openowner(&oo->oo_owner);
+ nfs4_free_stateowner(&oo->oo_owner);
+
spin_unlock(&clp->cl_lock);
return ret;
}
-static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
+ struct nfsd4_open *open)
+{
+
struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_ol_stateid *retstp = NULL;
+
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ spin_lock(&fp->fi_lock);
+ retstp = nfsd4_find_existing_open(fp, open);
+ if (retstp)
+ goto out_unlock;
atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
@@ -3332,12 +3497,14 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
stp->st_openstp = NULL;
- spin_lock(&oo->oo_owner.so_client->cl_lock);
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
- spin_lock(&fp->fi_lock);
list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ return retstp;
}
/*
@@ -3482,6 +3649,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
+ if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID)
+ return 1;
+
switch (task->tk_status) {
case 0:
return 1;
@@ -3508,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
nfs4_put_stid(&dp->dl_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
.prepare = nfsd4_cb_recall_prepare,
.done = nfsd4_cb_recall_done,
.release = nfsd4_cb_recall_release,
@@ -3745,27 +3915,6 @@ out:
return nfs_ok;
}
-static struct nfs4_ol_stateid *
-nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
-{
- struct nfs4_ol_stateid *local, *ret = NULL;
- struct nfs4_openowner *oo = open->op_openowner;
-
- spin_lock(&fp->fi_lock);
- list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
- /* ignore lock owners */
- if (local->st_stateowner->so_is_open_owner == 0)
- continue;
- if (local->st_stateowner == &oo->oo_owner) {
- ret = local;
- atomic_inc(&ret->st_stid.sc_count);
- break;
- }
- }
- spin_unlock(&fp->fi_lock);
- return ret;
-}
-
static inline int nfs4_access_to_access(u32 nfs4_access)
{
int flags = 0;
@@ -3885,12 +4034,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
return status;
}
-static void
-nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
-{
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
-}
-
/* Should we give out recallable state?: */
static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
{
@@ -3920,10 +4063,22 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
return fl;
}
+/**
+ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
+ * @dp: a pointer to the nfs4_delegation we're adding.
+ *
+ * Return:
+ * On success: Return code will be 0 on success.
+ *
+ * On error: -EAGAIN if there was an existing delegation.
+ * nonzero if there is an error in other cases.
+ *
+ */
+
static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
- struct file_lock *fl, *ret;
+ struct file_lock *fl;
struct file *filp;
int status = 0;
@@ -3934,10 +4089,10 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
if (!filp) {
/* We should always have a readable file here */
WARN_ON_ONCE(1);
+ locks_free_lock(fl);
return -EBADF;
}
fl->fl_file = filp;
- ret = fl;
status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
if (fl)
locks_free_lock(fl);
@@ -3951,16 +4106,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
goto out_unlock;
/* Race breaker */
if (fp->fi_deleg_file) {
- status = 0;
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
+ status = hash_delegation_locked(dp, fp);
goto out_unlock;
}
fp->fi_deleg_file = filp;
- fp->fi_delegees = 1;
- hash_delegation_locked(dp, fp);
+ fp->fi_delegees = 0;
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
+ if (status) {
+ /* Should never happen, this is a new fi_deleg_file */
+ WARN_ON_ONCE(1);
+ goto out_fput;
+ }
return 0;
out_unlock:
spin_unlock(&fp->fi_lock);
@@ -3980,6 +4138,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+ if (status)
+ return ERR_PTR(status);
+
dp = alloc_init_deleg(clp, fh, odstate);
if (!dp)
return ERR_PTR(-ENOMEM);
@@ -3998,9 +4165,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
status = -EAGAIN;
goto out_unlock;
}
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
- status = 0;
+ status = hash_delegation_locked(dp, fp);
out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -4063,7 +4228,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
case NFS4_OPEN_CLAIM_FH:
/*
* Let's not give out any delegations till everyone's
- * had the chance to reclaim theirs....
+ * had the chance to reclaim theirs, *and* until
+ * NLM locks have all been reclaimed:
*/
if (locks_in_grace(clp->net))
goto out_no_deleg;
@@ -4134,6 +4300,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct nfs4_ol_stateid *stp = NULL;
+ struct nfs4_ol_stateid *swapstp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -4147,7 +4314,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ spin_lock(&fp->fi_lock);
stp = nfsd4_find_existing_open(fp, open);
+ spin_unlock(&fp->fi_lock);
} else {
open->op_file = NULL;
status = nfserr_bad_stateid;
@@ -4161,15 +4330,32 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
*/
if (stp) {
/* Stateid was found, this is an OPEN upgrade */
+ down_read(&stp->st_rwsem);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
- if (status)
+ if (status) {
+ up_read(&stp->st_rwsem);
goto out;
+ }
} else {
stp = open->op_stp;
open->op_stp = NULL;
- init_open_stateid(stp, fp, open);
+ swapstp = init_open_stateid(stp, fp, open);
+ if (swapstp) {
+ nfs4_put_stid(&stp->st_stid);
+ stp = swapstp;
+ down_read(&stp->st_rwsem);
+ status = nfs4_upgrade_open(rqstp, fp, current_fh,
+ stp, open);
+ if (status) {
+ up_read(&stp->st_rwsem);
+ goto out;
+ }
+ goto upgrade_out;
+ }
+ down_read(&stp->st_rwsem);
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
+ up_read(&stp->st_rwsem);
release_open_stateid(stp);
goto out;
}
@@ -4179,8 +4365,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (stp->st_clnt_odstate == open->op_odstate)
open->op_odstate = NULL;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+upgrade_out:
+ nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
+ up_read(&stp->st_rwsem);
if (nfsd4_has_session(&resp->cstate)) {
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4209,7 +4396,7 @@ out:
if (fp)
put_nfs4_file(fp);
if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
- nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate));
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
/*
* To finish the open response, we just need to set the rflags.
*/
@@ -4338,14 +4525,12 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
- continue;
if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
t = dp->dl_time - cutoff;
new_timeo = min(new_timeo, t);
break;
}
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -4386,8 +4571,7 @@ static void
laundromat_main(struct work_struct *laundry)
{
time_t t;
- struct delayed_work *dwork = container_of(laundry, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(laundry);
struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
laundromat_work);
@@ -4440,7 +4624,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
{
if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
- else if (locks_in_grace(net)) {
+ else if (opens_in_grace(net)) {
/* Answer in remaining cases depends on existence of
* conflicting state; so we must wait out the grace period. */
return nfserr_grace;
@@ -4459,7 +4643,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
static inline int
grace_disallows_io(struct net *net, struct inode *inode)
{
- return locks_in_grace(net) && mandatory_lock(inode);
+ return opens_in_grace(net) && mandatory_lock(inode);
}
/* Returns true iff a is later than b: */
@@ -4642,10 +4826,9 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
*/
__be32
nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, stateid_t *stateid,
- int flags, struct file **filpp, bool *tmp_file)
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
{
- struct svc_fh *fhp = &cstate->current_fh;
struct inode *ino = d_inode(fhp->fh_dentry);
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4751,7 +4934,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (check_for_locks(stp->st_stid.sc_file,
lockowner(stp->st_stateowner)))
break;
- unhash_lock_stateid(stp);
+ WARN_ON(!unhash_lock_stateid(stp));
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -4795,10 +4978,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
+ down_write(&stp->st_rwsem);
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
- if (status)
- return status;
- return nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status == nfs_ok)
+ status = nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status != nfs_ok)
+ up_write(&stp->st_rwsem);
+ return status;
}
/*
@@ -4845,6 +5031,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
return status;
oo = openowner(stp->st_stateowner);
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
}
@@ -4875,11 +5062,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
- if (oo->oo_flags & NFS4_OO_CONFIRMED)
+ if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+ up_write(&stp->st_rwsem);
goto put_stateid;
+ }
oo->oo_flags |= NFS4_OO_CONFIRMED;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -4951,13 +5140,11 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
goto put_stateid;
}
nfs4_stateid_downgrade(stp, od->od_share_access);
-
reset_union_bmap_deny(od->od_share_deny, stp);
-
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
status = nfs_ok;
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -4967,20 +5154,23 @@ out:
static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
{
struct nfs4_client *clp = s->st_stid.sc_client;
+ bool unhashed;
LIST_HEAD(reaplist);
s->st_stid.sc_type = NFS4_CLOSED_STID;
spin_lock(&clp->cl_lock);
- unhash_open_stateid(s, &reaplist);
+ unhashed = unhash_open_stateid(s, &reaplist);
if (clp->cl_minorversion) {
- put_ol_stateid_locked(s, &reaplist);
+ if (unhashed)
+ put_ol_stateid_locked(s, &reaplist);
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
} else {
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
- move_to_close_lru(s, clp->net);
+ if (unhashed)
+ move_to_close_lru(s, clp->net);
}
}
@@ -5006,8 +5196,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
nfsd4_close_open_stateid(stp);
@@ -5045,9 +5235,6 @@ out:
return status;
}
-
-#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-
static inline u64
end_offset(u64 start, u64 len)
{
@@ -5139,8 +5326,7 @@ nevermind:
}
static struct nfs4_lockowner *
-find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
- struct nfs4_client *clp)
+find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner)
{
unsigned int strhashval = ownerstr_hashval(owner);
struct nfs4_stateowner *so;
@@ -5158,13 +5344,12 @@ find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
}
static struct nfs4_lockowner *
-find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner,
- struct nfs4_client *clp)
+find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner)
{
struct nfs4_lockowner *lo;
spin_lock(&clp->cl_lock);
- lo = find_lockowner_str_locked(clid, owner, clp);
+ lo = find_lockowner_str_locked(clp, owner);
spin_unlock(&clp->cl_lock);
return lo;
}
@@ -5208,14 +5393,14 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
lo->lo_owner.so_ops = &lockowner_ops;
spin_lock(&clp->cl_lock);
- ret = find_lockowner_str_locked(&clp->cl_clientid,
- &lock->lk_new_owner, clp);
+ ret = find_lockowner_str_locked(clp, &lock->lk_new_owner);
if (ret == NULL) {
list_add(&lo->lo_owner.so_strhash,
&clp->cl_ownerstr_hashtbl[strhashval]);
ret = lo;
} else
- nfs4_free_lockowner(&lo->lo_owner);
+ nfs4_free_stateowner(&lo->lo_owner);
+
spin_unlock(&clp->cl_lock);
return ret;
}
@@ -5238,6 +5423,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
spin_lock(&fp->fi_lock);
@@ -5298,8 +5484,8 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
static int
check_lock_length(u64 offset, u64 length)
{
- return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
- LOFF_OVERFLOW(offset, length)));
+ return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
+ (length > ~offset)));
}
static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
@@ -5328,9 +5514,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
struct nfs4_lockowner *lo;
unsigned int strhashval;
- lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl);
+ lo = find_lockowner_str(cl, &lock->lk_new_owner);
if (!lo) {
- strhashval = ownerstr_hashval(&lock->v.new.owner);
+ strhashval = ownerstr_hashval(&lock->lk_new_owner);
lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
if (lo == NULL)
return nfserr_jukebox;
@@ -5391,7 +5577,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
/* See rfc 5661 18.10.3: given clientid is ignored: */
- memcpy(&lock->v.new.clientid,
+ memcpy(&lock->lk_new_clientid,
&cstate->session->se_client->cl_clientid,
sizeof(clientid_t));
@@ -5406,13 +5592,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&open_stp, nn);
if (status)
goto out;
+ up_write(&open_stp->st_rwsem);
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
- &lock->v.new.clientid))
+ &lock->lk_new_clientid))
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
+ if (status == nfs_ok)
+ down_write(&lock_stp->st_rwsem);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
@@ -5490,9 +5679,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
switch (-err) {
case 0: /* success! */
- update_stateid(&lock_stp->st_stid.sc_stateid);
- memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
- sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
status = 0;
break;
case (EAGAIN): /* conflock holds conflicting lock */
@@ -5518,6 +5705,8 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
+ up_write(&lock_stp->st_rwsem);
+
/*
* If this is a new, never-before-used stateid, and we are
* returning an error, then just go ahead and release it.
@@ -5603,8 +5792,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner,
- cstate->clp);
+ lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
if (lo)
file_lock->fl_owner = (fl_owner_t)lo;
file_lock->fl_pid = current->tgid;
@@ -5683,11 +5871,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
goto out_nfserr;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
fput:
fput(filp);
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -6019,7 +6207,7 @@ nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
struct list_head *collect,
- void (*func)(struct nfs4_ol_stateid *))
+ bool (*func)(struct nfs4_ol_stateid *))
{
struct nfs4_openowner *oop;
struct nfs4_ol_stateid *stp, *st_next;
@@ -6033,9 +6221,9 @@ static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
list_for_each_entry_safe(lst, lst_next,
&stp->st_locks, st_locks) {
if (func) {
- func(lst);
- nfsd_inject_add_lock_to_list(lst,
- collect);
+ if (func(lst))
+ nfsd_inject_add_lock_to_list(lst,
+ collect);
}
++count;
/*
@@ -6305,7 +6493,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
continue;
atomic_inc(&clp->cl_refcount);
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, victims);
}
++count;
@@ -6584,6 +6772,7 @@ nfs4_state_start_net(struct net *net)
return ret;
nn->boot_time = get_seconds();
nn->grace_ended = false;
+ nn->nfsd4_manager.block_opens = true;
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
@@ -6602,7 +6791,7 @@ nfs4_state_start(void)
ret = set_callback_cred();
if (ret)
return -ENOMEM;
- laundry_wq = create_singlethread_workqueue("nfsd4");
+ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
if (laundry_wq == NULL) {
ret = -ENOMEM;
goto out_recovery;
@@ -6635,7 +6824,7 @@ nfs4_state_shutdown_net(struct net *net)
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 75e0563..d6ef095 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1675,6 +1675,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
}
static __be32
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
+ if (status)
+ return status;
+ status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 8 + 8);
+ p = xdr_decode_hyper(p, &clone->cl_src_pos);
+ p = xdr_decode_hyper(p, &clone->cl_dst_pos);
+ p = xdr_decode_hyper(p, &clone->cl_count);
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1785,6 +1804,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
};
static inline bool
@@ -2140,6 +2160,27 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
+static inline __be32
+nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type)
+{
+ __be32 *p;
+
+ if (layout_type) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(layout_type);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0);
+ }
+
+ return 0;
+}
+
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
FATTR4_WORD0_RDATTR_ERROR)
#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
@@ -2205,6 +2246,39 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
return err;
}
+static __be32
+nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+{
+ __be32 *p;
+
+ if (bmval2) {
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
+ *p++ = cpu_to_be32(bmval2);
+ } else if (bmval1) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
+ } else {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(bmval0);
+ }
+
+ return 0;
+out_resource:
+ return nfserr_resource;
+}
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
@@ -2301,28 +2375,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
}
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- if (bmval2) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(bmval0);
- *p++ = cpu_to_be32(bmval1);
- *p++ = cpu_to_be32(bmval2);
- } else if (bmval1) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(bmval0);
- *p++ = cpu_to_be32(bmval1);
- } else {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- *p++ = cpu_to_be32(bmval0);
- }
+ status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2);
+ if (status)
+ goto out;
attrlen_offset = xdr->buf->len;
p = xdr_reserve_space(xdr, 4);
@@ -2675,6 +2730,9 @@ out_acl:
*p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+ struct kstat parent_stat;
+ u64 ino = stat.ino;
+
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
@@ -2683,25 +2741,25 @@ out_acl:
* and this is the root of a cross-mounted filesystem.
*/
if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root)
- get_parent_attributes(exp, &stat);
- p = xdr_encode_hyper(p, stat.ino);
+ dentry == exp->ex_path.mnt->mnt_root) {
+ err = get_parent_attributes(exp, &parent_stat);
+ if (err)
+ goto out_nfserr;
+ ino = parent_stat.ino;
+ }
+ p = xdr_encode_hyper(p, ino);
}
#ifdef CONFIG_NFSD_PNFS
- if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
- (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
- if (exp->ex_layout_type) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- *p++ = cpu_to_be32(exp->ex_layout_type);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
+ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+ status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ if (status)
+ goto out;
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
+ status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ if (status)
+ goto out;
}
if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
@@ -2711,21 +2769,20 @@ out_acl:
*p++ = cpu_to_be32(stat.blksize);
}
#endif /* CONFIG_NFSD_PNFS */
+ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0,
+ NFSD_SUPPATTR_EXCLCREAT_WORD1,
+ NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ if (status)
+ goto out;
+ }
+
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
status = nfsd4_encode_security_label(xdr, rqstp, context,
contextlen);
if (status)
goto out;
}
- if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
- *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
- *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
- }
attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
@@ -2801,14 +2858,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
@@ -3044,13 +3101,12 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
__be32 *p;
if (!nfserr) {
- p = xdr_reserve_space(xdr, 32);
+ p = xdr_reserve_space(xdr, 20);
if (!p)
return nfserr_resource;
- p = encode_cinfo(p, &create->cr_cinfo);
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(create->cr_bmval[0]);
- *p++ = cpu_to_be32(create->cr_bmval[1]);
+ encode_cinfo(p, &create->cr_cinfo);
+ nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
+ create->cr_bmval[1], create->cr_bmval[2]);
}
return nfserr;
}
@@ -3190,16 +3246,22 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
if (nfserr)
goto out;
- p = xdr_reserve_space(xdr, 40);
+ p = xdr_reserve_space(xdr, 24);
if (!p)
return nfserr_resource;
p = encode_cinfo(p, &open->op_cinfo);
*p++ = cpu_to_be32(open->op_rflags);
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(open->op_bmval[0]);
- *p++ = cpu_to_be32(open->op_bmval[1]);
- *p++ = cpu_to_be32(open->op_delegate_type);
+ nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
+ open->op_bmval[2]);
+ if (nfserr)
+ goto out;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(open->op_delegate_type);
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
break;
@@ -4250,6 +4312,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
};
/*
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 46ec934..54cde9a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -63,7 +63,6 @@ static unsigned int longest_chain;
static unsigned int longest_chain_cachesize;
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-static void cache_cleaner_func(struct work_struct *unused);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
@@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
};
/*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
- */
-static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-
-/*
* Put a cap on the size of the DRC based on the amount of available
* low memory in the machine.
*
@@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void)
unsigned int i;
unregister_shrinker(&nfsd_reply_cache_shrinker);
- cancel_delayed_work_sync(&cache_cleaner);
for (i = 0; i < drc_hashsize; i++) {
struct list_head *head = &drc_hashtbl[i].lru_head;
@@ -217,10 +208,8 @@ void nfsd_reply_cache_shutdown(void)
drc_hashtbl = NULL;
drc_hashsize = 0;
- if (drc_slab) {
- kmem_cache_destroy(drc_slab);
- drc_slab = NULL;
- }
+ kmem_cache_destroy(drc_slab);
+ drc_slab = NULL;
}
/*
@@ -232,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &b->lru_head);
- schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
static long
@@ -266,7 +254,6 @@ prune_cache_entries(void)
{
unsigned int i;
long freed = 0;
- bool cancel = true;
for (i = 0; i < drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &drc_hashtbl[i];
@@ -275,26 +262,11 @@ prune_cache_entries(void)
continue;
spin_lock(&b->cache_lock);
freed += prune_bucket(b);
- if (!list_empty(&b->lru_head))
- cancel = false;
spin_unlock(&b->cache_lock);
}
-
- /*
- * Conditionally rearm the job to run in RC_EXPIRE since we just
- * ran the pruner.
- */
- if (!cancel)
- mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
-static void
-cache_cleaner_func(struct work_struct *unused)
-{
- prune_cache_entries();
-}
-
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 350041a..c1681ce 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp)
fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
-#ifdef CONFIG_NFSD_V3
- fhp->fh_pre_saved = 0;
- fhp->fh_post_saved = 0;
-#endif
+ fh_clear_wcc(fhp);
}
fh_drop_write(fhp);
if (exp) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 1e90dad..0770bcb 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_NFSD_NFSFH_H
#define _LINUX_NFSD_NFSFH_H
+#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
#include <uapi/linux/nfsd/nfsfh.h>
@@ -26,16 +27,16 @@ static inline ino_t u32_to_ino_t(__u32 uino)
*/
typedef struct svc_fh {
struct knfsd_fh fh_handle; /* FH data */
+ int fh_maxsize; /* max size for fh_handle */
struct dentry * fh_dentry; /* validated dentry */
struct svc_export * fh_export; /* export pointer */
- int fh_maxsize; /* max size for fh_handle */
- unsigned char fh_locked; /* inode locked by us */
- unsigned char fh_want_write; /* remount protection taken */
+ bool fh_locked; /* inode locked by us */
+ bool fh_want_write; /* remount protection taken */
#ifdef CONFIG_NFSD_V3
- unsigned char fh_post_saved; /* post-op attrs saved */
- unsigned char fh_pre_saved; /* pre-op attrs saved */
+ bool fh_post_saved; /* post-op attrs saved */
+ bool fh_pre_saved; /* pre-op attrs saved */
/* Pre-op attributes saved during fh_lock */
__u64 fh_pre_size; /* size before operation */
@@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
+#ifdef CONFIG_CRC32
+/**
+ * knfsd_fh_hash - calculate the crc32 hash for the filehandle
+ * @fh - pointer to filehandle
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+}
+#else
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_NFSD_V3
/*
* The wcc data stored in current_fh should be cleared
@@ -213,8 +236,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
static inline void
fh_clear_wcc(struct svc_fh *fhp)
{
- fhp->fh_post_saved = 0;
- fhp->fh_pre_saved = 0;
+ fhp->fh_post_saved = false;
+ fhp->fh_pre_saved = false;
}
/*
@@ -231,7 +254,7 @@ fill_pre_wcc(struct svc_fh *fhp)
fhp->fh_pre_ctime = inode->i_ctime;
fhp->fh_pre_size = inode->i_size;
fhp->fh_pre_change = inode->i_version;
- fhp->fh_pre_saved = 1;
+ fhp->fh_pre_saved = true;
}
}
@@ -267,7 +290,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
inode = d_inode(dentry);
mutex_lock_nested(&inode->i_mutex, subclass);
fill_pre_wcc(fhp);
- fhp->fh_locked = 1;
+ fhp->fh_locked = true;
}
static inline void
@@ -285,7 +308,7 @@ fh_unlock(struct svc_fh *fhp)
if (fhp->fh_locked) {
fill_post_wcc(fhp);
mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
- fhp->fh_locked = 0;
+ fhp->fh_locked = false;
}
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9277cc9..45007ac 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -14,9 +14,13 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <net/net_namespace.h>
#include "nfsd.h"
#include "cache.h"
@@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net)
nfsd_shutdown_generic();
}
+static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inetaddr_notifier = {
+ .notifier_call = nfsd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int nfsd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct net_device *dev = ifa->idev->dev;
+ struct net *net = dev_net(dev);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inet6addr_notifier = {
+ .notifier_call = nfsd_inet6addr_event,
+};
+#endif
+
static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
* started, then nfsd_last_thread will be run before any of this
- * other initialization has been done.
+ * other initialization has been done except the rpcb information.
*/
+ svc_rpcb_cleanup(serv, net);
if (!nn->nfsd_net_up)
return;
- nfsd_shutdown_net(net);
-
- svc_rpcb_cleanup(serv, net);
+ nfsd_shutdown_net(net);
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
nfsd_export_flush(net);
@@ -391,6 +454,14 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
+static struct svc_serv_ops nfsd_thread_sv_ops = {
+ .svo_shutdown = nfsd_last_thread,
+ .svo_function = nfsd,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads,
+ .svo_module = THIS_MODULE,
+};
+
int nfsd_create_serv(struct net *net)
{
int error;
@@ -405,7 +476,7 @@ int nfsd_create_serv(struct net *net)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions();
nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- nfsd_last_thread, nfsd, THIS_MODULE);
+ &nfsd_thread_sv_ops);
if (nn->nfsd_serv == NULL)
return -ENOMEM;
@@ -417,6 +488,10 @@ int nfsd_create_serv(struct net *net)
}
set_max_drc();
+ register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
@@ -500,8 +575,8 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
/* apply the new numbers */
svc_get(nn->nfsd_serv);
for (i = 0; i < n; i++) {
- err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
- nthreads[i]);
+ err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+ &nn->nfsd_serv->sv_pools[i], nthreads[i]);
if (err)
break;
}
@@ -540,7 +615,8 @@ nfsd_svc(int nrservs, struct net *net)
error = nfsd_startup_net(nrservs, net);
if (error)
goto out_destroy;
- error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
+ error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+ NULL, nrservs);
if (error)
goto out_shutdown;
/* We are holding a reference to nn->nfsd_serv which
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4874ce5..c050c53 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -65,10 +65,10 @@ struct nfsd4_callback {
struct nfs4_client *cb_clp;
u32 cb_minorversion;
struct rpc_message cb_msg;
- struct nfsd4_callback_ops *cb_ops;
+ const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
+ int cb_seq_status;
int cb_status;
- bool cb_update_seq_nr;
bool cb_need_restart;
};
@@ -84,7 +84,7 @@ struct nfsd4_callback_ops {
* fields that are of general use to any stateid.
*/
struct nfs4_stid {
- atomic_t sc_count;
+ atomic_t sc_count;
#define NFS4_OPEN_STID 1
#define NFS4_LOCK_STID 2
#define NFS4_DELEG_STID 4
@@ -94,11 +94,12 @@ struct nfs4_stid {
#define NFS4_REVOKED_DELEG_STID 16
#define NFS4_CLOSED_DELEG_STID 32
#define NFS4_LAYOUT_STID 64
- unsigned char sc_type;
- stateid_t sc_stateid;
- struct nfs4_client *sc_client;
- struct nfs4_file *sc_file;
- void (*sc_free)(struct nfs4_stid *);
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ spinlock_t sc_lock;
+ struct nfs4_client *sc_client;
+ struct nfs4_file *sc_file;
+ void (*sc_free)(struct nfs4_stid *);
};
/*
@@ -364,15 +365,6 @@ struct nfs4_client_reclaim {
char cr_recdir[HEXDIR_LEN]; /* recover dir */
};
-static inline void
-update_stateid(stateid_t *stateid)
-{
- stateid->si_generation++;
- /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
- if (stateid->si_generation == 0)
- stateid->si_generation = 1;
-}
-
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
* The OPEN response, typically the largest, requires
* 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
@@ -534,15 +526,16 @@ struct nfs4_file {
* Better suggestions welcome.
*/
struct nfs4_ol_stateid {
- struct nfs4_stid st_stid; /* must be first field */
- struct list_head st_perfile;
- struct list_head st_perstateowner;
- struct list_head st_locks;
- struct nfs4_stateowner * st_stateowner;
- struct nfs4_clnt_odstate * st_clnt_odstate;
- unsigned char st_access_bmap;
- unsigned char st_deny_bmap;
- struct nfs4_ol_stateid * st_openstp;
+ struct nfs4_stid st_stid;
+ struct list_head st_perfile;
+ struct list_head st_perstateowner;
+ struct list_head st_locks;
+ struct nfs4_stateowner *st_stateowner;
+ struct nfs4_clnt_odstate *st_clnt_odstate;
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
+ struct nfs4_ol_stateid *st_openstp;
+ struct rw_semaphore st_rwsem;
};
static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
@@ -561,6 +554,7 @@ struct nfs4_layout_stateid {
struct nfsd4_callback ls_recall;
stateid_t ls_recall_sid;
bool ls_recalled;
+ struct mutex ls_mutex;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
@@ -584,8 +578,8 @@ struct nfsd4_compound_state;
struct nfsd_net;
extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, stateid_t *stateid,
- int flags, struct file **filp, bool *tmp_file);
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn);
@@ -593,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab);
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
@@ -604,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 82f8907..9096746 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,5 +1,3 @@
-#include "state.h"
-
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c668520..3287041 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -8,6 +8,49 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include "nfsfh.h"
+
+DECLARE_EVENT_CLASS(nfsd_io_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+ loff_t offset,
+ int len),
+ TP_ARGS(rqstp, fhp, offset, len),
+ TP_STRUCT__entry(
+ __field(__be32, xid)
+ __field_struct(struct knfsd_fh, fh)
+ __field(loff_t, offset)
+ __field(int, len)
+ ),
+ TP_fast_assign(
+ __entry->xid = rqstp->rq_xid,
+ fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+ TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
+ __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+ __entry->offset, __entry->len)
+)
+
+#define DEFINE_NFSD_IO_EVENT(name) \
+DEFINE_EVENT(nfsd_io_class, name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ loff_t offset, \
+ int len), \
+ TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_IO_EVENT(read_start);
+DEFINE_NFSD_IO_EVENT(read_opened);
+DEFINE_NFSD_IO_EVENT(read_io_done);
+DEFINE_NFSD_IO_EVENT(read_done);
+DEFINE_NFSD_IO_EVENT(write_start);
+DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_io_done);
+DEFINE_NFSD_IO_EVENT(write_done);
+
+#include "state.h"
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b5e077a..6739077 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -36,12 +36,14 @@
#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
+#include "../internal.h"
#include "acl.h"
#include "idmap.h"
#endif /* CONFIG_NFSD_V4 */
#include "nfsd.h"
#include "vfs.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
@@ -217,10 +219,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
- /*
- * check if we have crossed a mount point ...
- */
if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
@@ -498,6 +506,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif
+__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
+ u64 dst_pos, u64 count)
+{
+ return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
+ count));
+}
+
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, loff_t len,
int flags)
@@ -983,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct raparms *ra;
__be32 err;
+ trace_read_start(rqstp, fhp, offset, vlen);
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
return err;
ra = nfsd_init_raparms(file);
+
+ trace_read_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+ trace_read_io_done(rqstp, fhp, offset, vlen);
+
if (ra)
nfsd_put_raparams(file, ra);
fput(file);
+ trace_read_done(rqstp, fhp, offset, vlen);
+
return err;
}
@@ -1008,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
{
__be32 err = 0;
+ trace_write_start(rqstp, fhp, offset, vlen);
+
if (file) {
err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
} else {
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
if (cnt)
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
cnt, stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
fput(file);
}
out:
+ trace_write_done(rqstp, fhp, offset, vlen);
return err;
}
@@ -1249,12 +1278,6 @@ out_nfserr:
#ifdef CONFIG_NFSD_V3
-static inline int nfsd_create_is_exclusive(int createmode)
-{
- return createmode == NFS3_CREATE_EXCLUSIVE
- || createmode == NFS4_CREATE_EXCLUSIVE4_1;
-}
-
/*
* NFSv3 and NFSv4 version of nfsd_create
*/
@@ -1637,7 +1660,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
/* cannot use fh_lock as we need deadlock protective ordering
* so do it by hand */
trap = lock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 1;
+ ffhp->fh_locked = tfhp->fh_locked = true;
fill_pre_wcc(ffhp);
fill_pre_wcc(tfhp);
@@ -1687,7 +1710,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
unlock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 0;
+ ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
out:
@@ -1815,7 +1838,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1834,15 +1856,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1859,7 +1872,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5be875e..c11ba31 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -56,6 +56,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
struct xdr_netobj *);
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
+__be32 nfsd4_clone_file_range(struct file *, u64, struct file *,
+ u64, u64);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -112,14 +114,14 @@ static inline int fh_want_write(struct svc_fh *fh)
int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
if (!ret)
- fh->fh_want_write = 1;
+ fh->fh_want_write = true;
return ret;
}
static inline void fh_drop_write(struct svc_fh *fh)
{
if (fh->fh_want_write) {
- fh->fh_want_write = 0;
+ fh->fh_want_write = false;
mnt_drop_write(fh->fh_export->ex_path.mnt);
}
}
@@ -131,4 +133,10 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
return nfserrno(vfs_getattr(&p, stat));
}
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+ return createmode == NFS3_CREATE_EXCLUSIVE
+ || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9f99100..d955481 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -491,6 +491,15 @@ struct nfsd4_fallocate {
u64 falloc_length;
};
+struct nfsd4_clone {
+ /* request */
+ stateid_t cl_src_stateid;
+ stateid_t cl_dst_stateid;
+ u64 cl_src_pos;
+ u64 cl_dst_pos;
+ u64 cl_count;
+};
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -555,6 +564,7 @@ struct nfsd4_op {
/* NFSv4.2 */
struct nfsd4_fallocate allocate;
struct nfsd4_fallocate deallocate;
+ struct nfsd4_clone clone;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
@@ -632,7 +642,7 @@ static inline void
set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
BUG_ON(!fhp->fh_pre_saved);
- cinfo->atomic = fhp->fh_post_saved;
+ cinfo->atomic = (u32)fhp->fh_post_saved;
cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
cinfo->before_change = fhp->fh_pre_change;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b..2ccbf55 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
/**
* nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
*/
static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
- const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock)
{
unsigned long nfree;
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ spin_lock(lock);
nfree = le32_to_cpu(desc->pg_nfrees);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ spin_unlock(lock);
return nfree;
}
/**
* nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
* @n: delta to be added
*/
-static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
- unsigned long group,
- struct nilfs_palloc_group_desc *desc,
- u32 n)
+static u32
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock, u32 n)
{
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ u32 nfree;
+
+ spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ nfree = le32_to_cpu(desc->pg_nfrees);
+ spin_unlock(lock);
+ return nfree;
}
/**
@@ -240,6 +240,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
}
/**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+ struct nilfs_bh_assoc *prev,
+ spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (prev->bh && blkoff == prev->blkoff) {
+ brelse(prev->bh);
+ prev->bh = NULL;
+ }
+ spin_unlock(lock);
+ return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
* nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -278,6 +298,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
}
/**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+ unsigned long group)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_bitmap_blkoff(inode,
+ group),
+ &cache->prev_bitmap, &cache->lock);
+}
+
+/**
* nilfs_palloc_get_entry_block - get buffer head of an entry block
* @inode: inode of metadata file using this allocator
* @nr: serial number of the entry (e.g. inode number)
@@ -296,6 +332,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
}
/**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_entry_blkoff(inode, nr),
+ &cache->prev_entry, &cache->lock);
+}
+
+/**
* nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
/**
* nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
* @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
* @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
*/
-static int nilfs_palloc_find_available_slot(struct inode *inode,
- unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned char *bitmap,
- int bsize)
-{
- int curr, pos, end, i;
-
- if (target > 0) {
- end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, target);
- if (pos < end &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
- return pos;
- } else
- end = 0;
-
- for (i = 0, curr = end;
- i < bsize;
- i += BITS_PER_LONG, curr += BITS_PER_LONG) {
- /* wrap around */
- if (curr >= bsize)
- curr = 0;
- while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
- != ~0UL) {
- end = curr + BITS_PER_LONG;
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, curr);
- if ((pos < end) &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos,
- bitmap))
+ unsigned bsize,
+ spinlock_t *lock)
+{
+ int pos, end = bsize;
+
+ if (likely(target < bsize)) {
+ pos = target;
+ do {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
- }
+ } while (++pos < end);
+
+ end = target;
+ }
+
+ /* wrap around */
+ for (pos = 0; pos < end; pos++) {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+ return pos;
}
+
return -ENOSPC;
}
@@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, maxgroup, ngroups;
unsigned long group_offset, maxgroup_offset;
- unsigned long n, entries_per_group, groups_per_desc_block;
+ unsigned long n, entries_per_group;
unsigned long i, j;
+ spinlock_t *lock;
int pos, ret;
ngroups = nilfs_palloc_groups_count(inode);
maxgroup = ngroups - 1;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
entries_per_group = nilfs_palloc_entries_per_group(inode);
- groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
for (i = 0; i < ngroups; i += n) {
if (group >= ngroups) {
@@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
for (j = 0; j < n; j++, desc++, group++) {
- if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
- > 0) {
+ lock = nilfs_mdt_bgl_lock(inode, group);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
ret = nilfs_palloc_get_bitmap_block(
inode, group, 1, &bitmap_bh);
if (ret < 0)
@@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- inode, group, group_offset, bitmap,
- entries_per_group);
+ bitmap, group_offset,
+ entries_per_group, lock);
if (pos >= 0) {
/* found a free entry */
nilfs_palloc_group_desc_add_entries(
- inode, group, desc, -1);
+ desc, lock, -1);
req->pr_entry_nr =
entries_per_group * group + pos;
kunmap(desc_bh->b_page);
@@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
unsigned long group, group_offset;
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+ lock = nilfs_mdt_bgl_lock(inode, group);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned char *bitmap;
unsigned long group, group_offset;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -680,22 +726,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
}
/**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
- __u64 first, last;
-
- first = group * nilfs_palloc_entries_per_group(inode);
- last = first + nilfs_palloc_entries_per_group(inode) - 1;
- return (nr >= first) && (nr <= last);
-}
-
-/**
* nilfs_palloc_freev - deallocate a set of persistent objects
* @inode: inode of metadata file using this allocator
* @entry_nrs: array of entry numbers to be deallocated
@@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
- int i, j, n, ret;
+ __u64 group_min_nr, last_nrs[8];
+ const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+ const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned entry_start, end, pos;
+ spinlock_t *lock;
+ int i, j, k, ret;
+ u32 nfree;
for (i = 0; i < nitems; i = j) {
+ int change_group = false;
+ int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
if (ret < 0)
@@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
brelse(desc_bh);
return ret;
}
- desc_kaddr = kmap(desc_bh->b_page);
- desc = nilfs_palloc_block_get_group_desc(
- inode, group, desc_bh, desc_kaddr);
+
+ /* Get the first entry number of the group */
+ group_min_nr = (__u64)group * epg;
+
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- for (j = i, n = 0;
- (j < nitems) && nilfs_palloc_group_is_in(inode, group,
- entry_nrs[j]);
- j++) {
- nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
- if (!nilfs_clear_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap)) {
- printk(KERN_WARNING
- "%s: entry number %llu already freed\n",
- __func__,
- (unsigned long long)entry_nrs[j]);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ j = i;
+ entry_start = rounddown(group_offset, epb);
+ do {
+ if (!nilfs_clear_bit_atomic(lock, group_offset,
+ bitmap)) {
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)entry_nrs[j],
+ (unsigned long)inode->i_ino);
} else {
n++;
}
- }
- nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+ j++;
+ if (j >= nitems || entry_nrs[j] < group_min_nr ||
+ entry_nrs[j] >= group_min_nr + epg) {
+ change_group = true;
+ } else {
+ group_offset = entry_nrs[j] - group_min_nr;
+ if (group_offset >= entry_start &&
+ group_offset < entry_start + epb) {
+ /* This entry is in the same block */
+ continue;
+ }
+ }
+
+ /* Test if the entry block is empty or not */
+ end = entry_start + epb;
+ pos = nilfs_find_next_bit(bitmap, end, entry_start);
+ if (pos >= end) {
+ last_nrs[nempties++] = entry_nrs[j - 1];
+ if (nempties >= ARRAY_SIZE(last_nrs))
+ break;
+ }
+
+ if (change_group)
+ break;
+
+ /* Go on to the next entry block */
+ entry_start = rounddown(group_offset, epb);
+ } while (true);
kunmap(bitmap_bh->b_page);
- kunmap(desc_bh->b_page);
+ mark_buffer_dirty(bitmap_bh);
+ brelse(bitmap_bh);
+ for (k = 0; k < nempties; k++) {
+ ret = nilfs_palloc_delete_entry_block(inode,
+ last_nrs[k]);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ (unsigned long long)last_nrs[k],
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
+
+ desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+ nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+ kunmap_atomic(desc_kaddr);
mark_buffer_dirty(desc_bh);
- mark_buffer_dirty(bitmap_bh);
nilfs_mdt_mark_dirty(inode);
-
- brelse(bitmap_bh);
brelse(desc_bh);
+
+ if (nfree == nilfs_palloc_entries_per_group(inode)) {
+ ret = nilfs_palloc_delete_bitmap_block(inode, group);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ group,
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
}
return 0;
}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4bd6451..6e6f49a 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
#define nilfs_set_bit_atomic ext2_set_bit_atomic
#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
#define nilfs_find_next_zero_bit find_next_zero_bit_le
+#define nilfs_find_next_bit find_next_bit_le
/**
* struct nilfs_bh_assoc - block offset and buffer head association
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919fd5b..3a3821b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
int level, __u64 *keyp, __u64 *ptrp)
{
struct nilfs_btree_node *node, *right;
- __u64 newkey;
- __u64 newptr;
int nchildren, n, move, ncblk;
node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
mark_buffer_dirty(path[level].bp_sib_bh);
- newkey = nilfs_btree_node_get_key(right, 0);
- newptr = path[level].bp_newreq.bpr_ptr;
-
if (move) {
path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(right, path[level].bp_index,
@@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
__u64 key, __u64 ptr,
const __u64 *keys, const __u64 *ptrs, int n)
{
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
struct nilfs_bmap_stats stats;
int ret;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 0d5fada..7dc23f1 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
- __u64 start;
sector_t blocknr;
void *kaddr;
int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
- start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
kunmap_atomic(kaddr);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 54575e3..088ba00 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
file_update_time(vma->vm_file);
- ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+ ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
goto out;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4a73d6d..10b2252 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
goto failed;
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
@@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb,
inode->i_mapping->a_ops = &nilfs_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
} else {
inode->i_op = &nilfs_special_inode_operations;
@@ -522,7 +523,7 @@ static int __nilfs_read_inode(struct super_block *sb,
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
return 0;
failed_unmap:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index dee34d9..1125f40 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,6 +33,7 @@
#include "page.h"
#include "mdt.h"
+#include <trace/events/nilfs2.h>
#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
+
+ trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
return 0;
}
@@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
get_bh(bh);
submit_bh(mode, bh);
ret = 0;
+
+ trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
out:
get_bh(bh);
*out_bh = bh;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index fe529a8..03246ca 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
}
/* Default GFP flags using highmem */
-#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 37dd6b0..7ccdb96 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
struct nilfs_transaction_info ti;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
return err;
@@ -164,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
@@ -571,8 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = {
const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.permission = nilfs_permission,
};
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ff00a0b..9b4f205 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start;
sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
sector_t b, end;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 42468e5..f63620c 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -338,12 +338,11 @@ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
/*
* BIO operations
*/
-static void nilfs_end_bio_write(struct bio *bio, int err)
+static void nilfs_end_bio_write(struct bio *bio)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nilfs_segment_buffer *segbuf = bio->bi_private;
- if (!uptodate)
+ if (bio->bi_error)
atomic_inc(&segbuf->sb_err);
bio_put(bio);
@@ -415,7 +414,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
{
wi->bio = NULL;
wi->rest_blocks = segbuf->sb_sum.nblocks;
- wi->max_pages = bio_get_nr_vecs(wi->nilfs->ns_bdev);
+ wi->max_pages = BIO_MAX_PAGES;
wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
wi->start = wi->end = 0;
wi->blocknr = segbuf->sb_pseg_start;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9..3b65ada 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -77,6 +77,36 @@ enum {
NILFS_ST_DONE,
};
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are
+ * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of
+ * the variable must use them because transition of stage count must involve
+ * trace events (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't
+ * produce tracepoint events. It is provided just for making the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+ sci->sc_stage.scnt++;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+ sci->sc_stage.scnt = next_scnt;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+ return sci->sc_stage.scnt;
+}
+
/* State flags of collection */
#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
@@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
{
struct the_nilfs *nilfs;
int ret = nilfs_prepare_segment_lock(ti);
+ struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
return ret;
- if (ret > 0)
+ if (ret > 0) {
+ trace_ti = current->journal_info;
+
+ trace_nilfs2_transaction_transition(sb, trace_ti,
+ trace_ti->ti_count, trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
+ }
sb_start_intwrite(sb);
@@ -199,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
ret = -ENOSPC;
goto failed;
}
+
+ trace_ti = current->journal_info;
+ trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+ trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
failed:
@@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
ti->ti_flags |= NILFS_TI_COMMIT;
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
return 0;
}
if (nilfs->ns_writer) {
@@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
nilfs_segctor_do_flush(sci, 0);
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_SYNC)
@@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
return;
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
@@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
current->journal_info = ti;
for (;;) {
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
down_write(&nilfs->ns_segctor_sem);
if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
break;
@@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
}
static void nilfs_transaction_unlock(struct super_block *sb)
@@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
size_t ndone;
int err = 0;
- switch (sci->sc_stage.scnt) {
+ switch (nilfs_sc_cstage_get(sci)) {
case NILFS_ST_INIT:
/* Pre-processes */
sci->sc_stage.flags = 0;
@@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_nblk_inc = 0;
sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
if (mode == SC_LSEG_DSYNC) {
- sci->sc_stage.scnt = NILFS_ST_DSYNC;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
goto dsync_mode;
}
}
@@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_stage.dirty_file_ptr = NULL;
sci->sc_stage.gc_inode_ptr = NULL;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DAT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
goto dat_stage;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_GC:
if (nilfs_doing_gc()) {
head = &sci->sc_gc_inodes;
@@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.gc_inode_ptr = NULL;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_FILE:
head = &sci->sc_dirty_files;
ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.dirty_file_ptr = NULL;
if (mode == SC_FLUSH_FILE) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
/* Fall through */
case NILFS_ST_IFILE:
@@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
err = nilfs_segctor_create_checkpoint(sci);
if (unlikely(err))
@@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SUFILE:
err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
sci->sc_nfreesegs, &ndone);
@@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_DAT:
dat_stage:
err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SR:
if (mode == SC_LSEG_SR) {
/* Appending a super root */
@@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
/* End of a logical segment */
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DSYNC:
dsync_mode:
@@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DONE:
return 0;
@@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
goto failed;
/* The current segment is filled up */
- if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+ if (mode != SC_LSEG_SR ||
+ nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
break;
nilfs_clear_logs(&sci->sc_segbufs);
@@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int err;
- sci->sc_stage.scnt = NILFS_ST_INIT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
sci->sc_cno = nilfs->ns_cno;
err = nilfs_segctor_collect_dirty_files(sci, nilfs);
@@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
goto failed;
/* Avoid empty segment */
- if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
nilfs_segbuf_empty(sci->sc_curseg)) {
nilfs_segctor_abort_construction(sci, nilfs, 1);
goto out;
@@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
nilfs_segctor_fill_in_file_bmap(sci);
if (mode == SC_LSEG_SR &&
- sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+ nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
err = nilfs_segctor_fill_in_checkpoint(sci);
if (unlikely(err))
goto failed_to_write;
@@ -2007,7 +2069,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
goto failed_to_write;
- if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
/*
* At this point, we avoid double buffering
@@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (err)
goto failed_to_write;
}
- } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+ } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
out:
nilfs_segctor_drop_written_files(sci, nilfs);
@@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
{
int mode = 0;
- int err;
spin_lock(&sci->sc_state_lock);
mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
spin_unlock(&sci->sc_state_lock);
if (mode) {
- err = nilfs_segctor_do_construct(sci, mode);
+ nilfs_segctor_do_construct(sci, mode);
spin_lock(&sci->sc_state_lock);
sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index a48d6de..0408b9b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -67,7 +67,8 @@ struct nilfs_recovery_info {
/**
* struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
* @flags: State flags
* @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
* @gc_inode_ptr: Pointer on the list of gc-inodes
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c3..52821ff 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
#include "mdt.h"
#include "sufile.h"
+#include <trace/events/nilfs2.h>
+
/**
* struct nilfs_sufile_info - on-memory private data of sufile
* @mi: on-memory private data of metadata file
@@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
__u64 segnum, maxsegnum, last_alloc;
void *kaddr;
- unsigned long nsegments, ncleansegs, nsus, cnt;
+ unsigned long nsegments, nsus, cnt;
int ret, j;
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
- ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
kunmap_atomic(kaddr);
@@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
break; /* never happens */
}
}
+ trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
&su_bh);
if (ret < 0)
@@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+ trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
@@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
NILFS_SUI(sufile)->ncleansegs++;
nilfs_mdt_mark_dirty(sufile);
+
+ trace_nilfs2_segment_usage_freed(sufile, segnum);
}
/**
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585b..7f5d3d9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
struct nilfs_super_block *nsbp;
sector_t blocknr, newblocknr;
unsigned long offset;
- int sb2i = -1; /* array index of the secondary superblock */
+ int sb2i; /* array index of the secondary superblock */
int ret = 0;
/* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
sb2i = 0;
blocknr = nilfs->ns_sbh[0]->b_blocknr;
+ } else {
+ sb2i = -1;
+ blocknr = 0;
}
if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
goto out; /* super block location is unchanged */
@@ -1313,13 +1316,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s->s_root) {
- char b[BDEVNAME_SIZE];
-
- s_new = true;
+ s_new = true;
/* New superblock instance created */
s->s_mode = mode;
- strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
sb_set_blocksize(s, block_size(sd.bdev));
err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -1405,21 +1406,18 @@ static void nilfs_destroy_cachep(void)
*/
rcu_barrier();
- if (nilfs_inode_cachep)
- kmem_cache_destroy(nilfs_inode_cachep);
- if (nilfs_transaction_cachep)
- kmem_cache_destroy(nilfs_transaction_cachep);
- if (nilfs_segbuf_cachep)
- kmem_cache_destroy(nilfs_segbuf_cachep);
- if (nilfs_btree_path_cache)
- kmem_cache_destroy(nilfs_btree_path_cache);
+ kmem_cache_destroy(nilfs_inode_cachep);
+ kmem_cache_destroy(nilfs_transaction_cachep);
+ kmem_cache_destroy(nilfs_segbuf_cachep);
+ kmem_cache_destroy(nilfs_btree_path_cache);
}
static int __init nilfs_init_cachep(void)
{
nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
sizeof(struct nilfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ nilfs_inode_init_once);
if (!nilfs_inode_cachep)
goto fail;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4..6faaf71 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct inode *inode;
+ bool free = false;
inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
/* nothing else could have found us thanks to the dnotify_groups
mark_mutex */
- if (dn_mark->dn == NULL)
- fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+ if (dn_mark->dn == NULL) {
+ fsnotify_detach_mark(fsn_mark);
+ free = true;
+ }
mutex_unlock(&dnotify_group->mark_mutex);
+ if (free)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
}
@@ -362,9 +367,10 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
-
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&dnotify_group->mark_mutex);
+ if (destroy)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf27550..8e8e6bc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark_locked(fsn_mark, group);
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark_locked(fsn_mark, group);
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb..fd98e51 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,15 +76,23 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+ !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(mark->inode);
if (inode) {
+ /*
+ * IN_ALL_EVENTS represents all of the mask bits
+ * that we expose to userspace. There is at
+ * least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+ u32 mask = mark->mask & IN_ALL_EVENTS;
seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mark->mask, mark->ignored_mask);
+ mask, mark->ignored_mask);
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b..db39de2 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-#include "../mount.h"
/*
* Clear all of the marks on an inode when it is being evicted from core
@@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
mnt = NULL;
/*
+ * Optimization: srcu_read_lock() has a memory barrier which can
+ * be expensive. It protects walking the *_fsnotify_marks lists.
+ * However, if we do not walk the lists, we do not have to do
+ * SRCU because we have no references to any objects and do not
+ * need SRCU to keep them "alive".
+ */
+ if (hlist_empty(&to_tell->i_fsnotify_marks) &&
+ (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+ return 0;
+ /*
* if this is a modify event we may need to clear the ignored masks
* otherwise return if neither the inode nor the vfsmount care about
* this type of event.
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be..b44c68a 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
#include <linux/srcu.h>
#include <linux/types.h>
+#include "../mount.h"
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
/* inode specific destruction of a mark */
extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Destroy all marks in the given list */
-extern void fsnotify_destroy_marks(struct list_head *to_free);
/* Find mark belonging to given group in the list of marks */
extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
struct fsnotify_group *group);
-/* run the list of all marks associated with inode and flag them to be freed */
-extern void fsnotify_clear_marks_by_inode(struct inode *inode);
-/* run the list of all marks associated with vfsmount and flag them to be freed */
-extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/* Destroy all marks in the given list protected by 'lock' */
+extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* run the list of all marks associated with inode and destroy them */
+static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+ fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+}
+/* run the list of all marks associated with vfsmount and destroy them */
+static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+ fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
+ &mnt->mnt_root->d_lock);
+}
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513..741077d 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
}
/*
- * Given an inode, destroy all of the marks associated with that inode.
- */
-void fsnotify_clear_marks_by_inode(struct inode *inode)
-{
- struct fsnotify_mark *mark;
- struct hlist_node *n;
- LIST_HEAD(free_list);
-
- spin_lock(&inode->i_lock);
- hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
- list_add(&mark->free_list, &free_list);
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- }
- spin_unlock(&inode->i_lock);
-
- fsnotify_destroy_marks(&free_list);
-}
-
-/*
* Given a group clear all of the inode marks associated with that group.
*/
void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
@@ -163,17 +143,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
/**
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
+ * @sb: superblock being unmounted.
*
* Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
*/
-void fsnotify_unmount_inodes(struct list_head *list)
+void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *next_i, *need_iput = NULL;
- spin_lock(&inode_sb_list_lock);
- list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
struct inode *need_iput_tmp;
/*
@@ -209,7 +189,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
spin_unlock(&inode->i_lock);
/* In case the dropping of a reference would nuke next_i. */
- while (&next_i->i_sb_list != list) {
+ while (&next_i->i_sb_list != &sb->s_inodes) {
spin_lock(&next_i->i_lock);
if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
atomic_read(&next_i->i_count)) {
@@ -219,17 +199,16 @@ void fsnotify_unmount_inodes(struct list_head *list)
break;
}
spin_unlock(&next_i->i_lock);
- next_i = list_entry(next_i->i_sb_list.next,
- struct inode, i_sb_list);
+ next_i = list_next_entry(next_i, i_sb_list);
}
/*
- * We can safely drop inode_sb_list_lock here because either
+ * We can safely drop s_inode_list_lock here because either
* we actually hold references on both inode and next_i or
* end of list. Also no new inodes will be added since the
* umount has begun.
*/
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
if (need_iput_tmp)
iput(need_iput_tmp);
@@ -241,7 +220,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
iput(inode);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5b1e2a4..b8d08d0 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
int ret;
unsigned flags = 0;
- /* don't allow invalid bits: we don't want flags set */
+ /*
+ * We share a lot of code with fs/dnotify. We also share
+ * the bit layout between inotify's IN_* and the fsnotify
+ * FS_*. This check ensures that only the inotify IN_*
+ * bits get passed in and set in watches/events.
+ */
+ if (unlikely(mask & ~ALL_INOTIFY_BITS))
+ return -EINVAL;
+ /*
+ * Require at least one valid bit set in the mask.
+ * Without _something_ set, we would have no events to
+ * watch for.
+ */
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf..cfcbf11 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -92,9 +92,6 @@
#include "fsnotify.h"
struct srcu_struct fsnotify_mark_srcu;
-static DEFINE_SPINLOCK(destroy_lock);
-static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -122,26 +119,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
}
/*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
+ * Remove mark from inode / vfsmount list, group list, drop inode reference
+ * if we got one.
+ *
+ * Must be called with group->mark_mutex held.
*/
-void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group)
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
struct inode *inode = NULL;
+ struct fsnotify_group *group = mark->group;
BUG_ON(!mutex_is_locked(&group->mark_mutex));
spin_lock(&mark->lock);
/* something else already called this function on this mark */
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
spin_unlock(&mark->lock);
return;
}
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
inode = mark->inode;
@@ -150,6 +148,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
fsnotify_destroy_vfsmount_mark(mark);
else
BUG();
+ /*
+ * Note that we didn't update flags telling whether inode cares about
+ * what's happening with children. We update these flags from
+ * __fsnotify_parent() lazily when next event happens on one of our
+ * children.
+ */
list_del_init(&mark->g_list);
@@ -157,18 +161,38 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
iput(inode);
- /* release lock temporarily */
- mutex_unlock(&group->mark_mutex);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
- /*
- * We don't necessarily have a ref on mark from caller so the above destroy
- * may have actually freed it, unless this group provides a 'freeing_mark'
- * function which must be holding a reference.
- */
+ atomic_dec(&group->num_marks);
+}
+
+static void
+fsnotify_mark_free_rcu(struct rcu_head *rcu)
+{
+ struct fsnotify_mark *mark;
+
+ mark = container_of(rcu, struct fsnotify_mark, g_rcu);
+ fsnotify_put_mark(mark);
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a call_srcu
+ * callback. Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_group *group = mark->group;
+
+ spin_lock(&mark->lock);
+ /* something else already called this function on this mark */
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+ spin_unlock(&mark->lock);
+ return;
+ }
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ spin_unlock(&mark->lock);
+
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
/*
* Some groups like to know that marks are being freed. This is a
@@ -177,50 +201,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
-
- /*
- * __fsnotify_update_child_dentry_flags(inode);
- *
- * I really want to call that, but we can't, we have no idea if the inode
- * still exists the second we drop the mark->lock.
- *
- * The next time an event arrive to this inode from one of it's children
- * __fsnotify_parent will see that the inode doesn't care about it's
- * children and will update all of these flags then. So really this
- * is just a lazy update (and could be a perf win...)
- */
-
- atomic_dec(&group->num_marks);
-
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
+ fsnotify_free_mark(mark);
}
-/*
- * Destroy all marks in the given list. The marks must be already detached from
- * the original inode / vfsmount.
- */
-void fsnotify_destroy_marks(struct list_head *to_free)
+void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
{
- struct fsnotify_mark *mark, *lmark;
- struct fsnotify_group *group;
-
- list_for_each_entry_safe(mark, lmark, to_free, free_list) {
- spin_lock(&mark->lock);
- fsnotify_get_group(mark->group);
- group = mark->group;
- spin_unlock(&mark->lock);
+ struct fsnotify_mark *mark;
- fsnotify_destroy_mark(mark, group);
+ while (1) {
+ /*
+ * We have to be careful since we can race with e.g.
+ * fsnotify_clear_marks_by_group() and once we drop 'lock',
+ * mark can get removed from the obj_list and destroyed. But
+ * we are holding mark reference so mark cannot be freed and
+ * calling fsnotify_destroy_mark() more than once is fine.
+ */
+ spin_lock(lock);
+ if (hlist_empty(head)) {
+ spin_unlock(lock);
+ break;
+ }
+ mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
+ /*
+ * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
+ * since inode / mount is going away anyway. So just remove
+ * mark from the list.
+ */
+ hlist_del_init_rcu(&mark->obj_list);
+ fsnotify_get_mark(mark);
+ spin_unlock(lock);
+ fsnotify_destroy_mark(mark, mark->group);
fsnotify_put_mark(mark);
- fsnotify_put_group(group);
}
}
@@ -332,7 +351,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
* inode->i_lock
*/
spin_lock(&mark->lock);
- mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
fsnotify_get_group(group);
mark->group = group;
@@ -369,11 +388,7 @@ err:
spin_unlock(&mark->lock);
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
-
+ call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
return ret;
}
@@ -438,8 +453,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
- fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
+ fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
}
@@ -475,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
atomic_set(&mark->refcnt, 1);
mark->free_mark = free_mark;
}
-
-static int fsnotify_mark_destroy(void *ignored)
-{
- struct fsnotify_mark *mark, *next;
- struct list_head private_destroy_list;
-
- for (;;) {
- spin_lock(&destroy_lock);
- /* exchange the list head */
- list_replace_init(&destroy_list, &private_destroy_list);
- spin_unlock(&destroy_lock);
-
- synchronize_srcu(&fsnotify_mark_srcu);
-
- list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
- list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
- }
-
- wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
- }
-
- return 0;
-}
-
-static int __init fsnotify_mark_init(void)
-{
- struct task_struct *thread;
-
- thread = kthread_run(fsnotify_mark_destroy, NULL,
- "fsnotify_mark");
- if (IS_ERR(thread))
- panic("unable to start fsnotify mark destruction thread.");
-
- return 0;
-}
-device_initcall(fsnotify_mark_init);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148..a8fcab6 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-#include "../mount.h"
-
-void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
-{
- struct fsnotify_mark *mark;
- struct hlist_node *n;
- struct mount *m = real_mount(mnt);
- LIST_HEAD(free_list);
-
- spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
- list_add(&mark->free_list, &free_list);
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- }
- spin_unlock(&mnt->mnt_root->d_lock);
-
- fsnotify_destroy_marks(&free_list);
-}
void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 99521e7..8f20d60 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -4,6 +4,7 @@
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
+#include <linux/seq_file.h>
static struct vfsmount *nsfs_mnt;
@@ -136,9 +137,19 @@ out_invalid:
return ERR_PTR(-EINVAL);
}
+static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+
+ seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+ return 0;
+}
+
static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
+ .show_path = nsfs_show_path,
};
static struct dentry *nsfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 262561f..9d383e5 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
}
}
err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 9e1e112..2f77f8d 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -543,7 +543,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
return -EROFS;
}
if (!ntfs_stamp_usnjrnl(vol)) {
- ntfs_error(sb, "Failed to stamp transation log "
+ ntfs_error(sb, "Failed to stamp transaction log "
"($UsnJrnl)%s", es);
NVolSetErrors(vol);
return -EROFS;
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
return true;
#ifdef NTFS_RW
iput_usnjrnl_err_out:
- if (vol->usnjrnl_j_ino)
- iput(vol->usnjrnl_j_ino);
- if (vol->usnjrnl_max_ino)
- iput(vol->usnjrnl_max_ino);
- if (vol->usnjrnl_ino)
- iput(vol->usnjrnl_ino);
+ iput(vol->usnjrnl_j_ino);
+ iput(vol->usnjrnl_max_ino);
+ iput(vol->usnjrnl_ino);
iput_quota_err_out:
- if (vol->quota_q_ino)
- iput(vol->quota_q_ino);
- if (vol->quota_ino)
- iput(vol->quota_ino);
+ iput(vol->quota_q_ino);
+ iput(vol->quota_ino);
iput(vol->extend_ino);
#endif /* NTFS_RW */
iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
iput(vol->root_ino);
iput_logfile_err_out:
#ifdef NTFS_RW
- if (vol->logfile_ino)
- iput(vol->logfile_ino);
+ iput(vol->logfile_ino);
iput_vol_err_out:
#endif /* NTFS_RW */
iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
iput(vol->mftbmp_ino);
iput_mirr_err_out:
#ifdef NTFS_RW
- if (vol->mftmirr_ino)
- iput(vol->mftmirr_ino);
+ iput(vol->mftmirr_ino);
#endif /* NTFS_RW */
return false;
}
@@ -3146,8 +3139,8 @@ static int __init init_ntfs_fs(void)
ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- ntfs_big_inode_init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bc..0cdf497 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ struct buffer_head *bh = NULL;
+ int status = 0;
+
+ status = ocfs2_inode_lock(inode, &bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
+ return status;
}
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = -EAGAIN;
+ int ret;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
-
- ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0)
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
return ERR_PTR(ret);
+ }
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
-
return acl;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5997c00..a3ded88 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
@@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_value_update_clusters,
@@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_tree_update_clusters,
@@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &dx_root->dr_list;
}
-static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
.eo_update_clusters = ocfs2_dx_root_update_clusters,
@@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
return CONTIG_NONE;
}
-static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_refcount_tree_update_clusters,
@@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
- struct ocfs2_extent_tree_operations *ops)
+ const struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
- "Extent block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- eb->h_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid h_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto bail;
}
if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid "
- "h_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ goto bail;
}
-
- return 0;
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty "
- "extent list (next_free_rec == 0)",
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent "
- "list where extent # %d has no physical "
- "block start",
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has empty extent list at "
- "depth %u\n",
+ "Owner %llu has empty extent list at depth %u\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad blkno in extent list "
- "at depth %u (index %d)\n",
+ "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad count in extent list "
- "at block %llu (next free=%u, count=%u)\n",
+ "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Inode %llu has non-full interior leaf node %llu"
- "(next free = %u)",
+ "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -3131,6 +3120,30 @@ out:
return ret;
}
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ handle_t *handle;
+ int ret;
+ int credits = path->p_tree_depth * 2 + 1;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+ return ret;
+}
+
/*
* Left rotation of btree records.
*
@@ -3200,7 +3213,7 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu",
+ "Owner %llu has empty extent block at %llu\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
@@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has a bad extent list",
+ "Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
@@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of "
- "%d. It should have "
- "matched the l_count of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
@@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
@@ -4970,10 +4979,9 @@ leftright:
split_index = ocfs2_search_extent_list(el, cpos);
if (split_index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u "
- "which can no longer be found.\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(sb,
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)
- ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EROFS;
goto out;
@@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: split at cpos %u lost record.",
+ "Owner %llu: split at cpos %u lost record\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle,
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: error after split at cpos %u"
- "trunc len %u, existing record is (%u,%u)",
+ "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
@@ -6171,11 +6174,10 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
bail:
- if (tl_inode)
- iput(tl_inode);
+ iput(tl_inode);
brelse(tl_bh);
- if (status < 0 && (*tl_copy)) {
+ if (status < 0) {
kfree(*tl_copy);
*tl_copy = NULL;
mlog_errno(status);
@@ -7108,15 +7110,23 @@ start:
* to check it up here before changing the tree.
*/
if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
- ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ mlog(ML_ERROR, "Inode %lu has an empty "
"extent record, depth %u\n", inode->i_ino,
le16_to_cpu(root_el->l_tree_depth));
- status = -EROFS;
- goto bail;
+ status = ocfs2_remove_rightmost_empty_extent(osb,
+ &et, path, &dealloc);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_reinit_path(path, 1);
+ goto start;
+ } else {
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
}
- trunc_cpos = le32_to_cpu(rec->e_cpos);
- trunc_len = 0;
- blkno = 0;
} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
/*
* Truncate entire record.
@@ -7204,8 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
!ocfs2_supports_inline_data(osb)) {
ocfs2_error(inode->i_sb,
- "Inline data flags for inode %llu don't agree! "
- "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le16_to_cpu(di->i_dyn_features),
OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fb09b97..f3dc1b0 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -54,7 +54,7 @@
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
- struct ocfs2_extent_tree_operations *et_ops;
+ const struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
struct ocfs2_caching_info *et_ci;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0f5fd9d..7f60472 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
- ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return -EROFS;
}
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %Lu",
+ "Inode %llu has with inline data has bad size: %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
alloc_locked = 1;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
/* fill hole, allocate blocks can't be larger than the size
* of the hole */
clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extend_allocation(inode, cpos,
clusters_to_alloc, 0);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog_errno(ret);
goto bail;
}
@@ -576,11 +583,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
ret = -EIO;
goto bail;
}
+ set_buffer_new(bh_result);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
/*
@@ -627,10 +637,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
- ocfs2_iocb_clear_rw_locked(iocb);
+ /* Let rw unlock to be done later to protect append direct io write */
+ if (offset + bytes <= i_size_read(inode)) {
+ ocfs2_iocb_clear_rw_locked(iocb);
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+ }
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +845,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
/* zeroing out the previously allocated cluster tail
* that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
zero_len_tail, cluster_align_tail);
- else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ }
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
@@ -847,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
if (is_overwrite < 0) {
mlog_errno(is_overwrite);
+ ret = is_overwrite;
ocfs2_inode_unlock(inode, 1);
goto clean_orphan;
}
@@ -857,7 +876,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
offset, ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
- if (unlikely(written < 0)) {
+ /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
+ if ((written < 0) && (written != -EIOCBQUEUED)) {
loff_t i_size = i_size_read(inode);
if (offset + count > i_size) {
@@ -876,12 +896,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
goto clean_orphan;
}
}
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
ret = jbd2_journal_force_commit(journal);
if (ret < 0)
@@ -936,10 +958,12 @@ clean_orphan:
if (tmp_ret < 0) {
ret = tmp_ret;
mlog_errno(ret);
+ brelse(di_bh);
goto out;
}
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
tmp_ret = jbd2_journal_force_commit(journal);
if (tmp_ret < 0) {
@@ -2185,10 +2209,7 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
+
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2345,7 +2366,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2375,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
goto out_write_size;
@@ -2409,6 +2438,7 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
* lock, or it will cause a deadlock since journal commit threads holds
* this lock and will ask for the page lock when flushing the data.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb14..fe50ded 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (status) {
+ /* Clear the rest of the buffers on error */
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45..a3cc6d2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
-
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -219,7 +219,8 @@ struct o2hb_region {
unsigned hr_unclean_stop:1,
hr_aborted_start:1,
hr_item_pinned:1,
- hr_item_dropped:1;
+ hr_item_dropped:1,
+ hr_node_deleted:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -372,14 +373,13 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
wait_for_completion(&wc->wc_io_complete);
}
-static void o2hb_bio_end_io(struct bio *bio,
- int error)
+static void o2hb_bio_end_io(struct bio *bio)
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
- if (error) {
- mlog(ML_ERROR, "IO Error %d\n", error);
- wc->wc_error = error;
+ if (bio->bi_error) {
+ mlog(ML_ERROR, "IO Error %d\n", bio->bi_error);
+ wc->wc_error = bio->bi_error;
}
o2hb_bio_wait_dec(wc, 1);
@@ -1061,37 +1061,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1102,7 +1071,7 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1110,7 +1079,13 @@ static int o2hb_thread(void *data)
set_user_nice(current, MIN_NICE);
/* Pin node */
- o2nm_depend_this_node();
+ ret = o2nm_depend_this_node();
+ if (ret) {
+ mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+ reg->hr_node_deleted = 1;
+ wake_up(&o2hb_steady_queue);
+ return 0;
+ }
while (!kthread_should_stop() &&
!reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1119,18 +1094,18 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ after_hb = ktime_get_real();
+
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec, ret);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
@@ -1505,16 +1480,17 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
return 0;
}
-static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%u\n", reg->hr_block_bytes);
+ return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
}
-static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
int status;
unsigned long block_bytes;
unsigned int block_bits;
@@ -1533,16 +1509,17 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%llu\n", reg->hr_start_block);
+ return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
}
-static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long long tmp;
char *p = (char *)page;
@@ -1558,16 +1535,16 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", reg->hr_blocks);
+ return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
}
-static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_blocks_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long tmp;
char *p = (char *)page;
@@ -1586,13 +1563,12 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (reg->hr_bdev)
- ret = sprintf(page, "%s\n", reg->hr_dev_name);
+ if (to_o2hb_region(item)->hr_bdev)
+ ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
return ret;
}
@@ -1620,17 +1596,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
- if (reg->hr_tmp_block == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_tmp_block == NULL)
return -ENOMEM;
- }
reg->hr_slots = kcalloc(reg->hr_blocks,
sizeof(struct o2hb_disk_slot), GFP_KERNEL);
- if (reg->hr_slots == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_slots == NULL)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_blocks; i++) {
slot = &reg->hr_slots[i];
@@ -1646,17 +1618,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
GFP_KERNEL);
- if (!reg->hr_slot_data) {
- mlog_errno(-ENOMEM);
+ if (!reg->hr_slot_data)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_num_pages; i++) {
page = alloc_page(GFP_KERNEL);
- if (!page) {
- mlog_errno(-ENOMEM);
+ if (!page)
return -ENOMEM;
- }
reg->hr_slot_data[i] = page;
@@ -1688,10 +1656,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_heartbeat_block *hb_block;
ret = o2hb_read_slots(reg, reg->hr_blocks);
- if (ret) {
- mlog_errno(ret);
+ if (ret)
goto out;
- }
/* We only want to get an idea of the values initially in each
* slot, so we do no verification - o2hb_check_slot will
@@ -1712,10 +1678,11 @@ out:
}
/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
-static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
struct task_struct *hb_task;
long fd;
int sectsize;
@@ -1813,8 +1780,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
}
++live_threshold;
atomic_set(&reg->hr_steady_iterations, live_threshold);
- /* unsteady_iterations is double the steady_iterations */
- atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+ /* unsteady_iterations is triple the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1829,7 +1796,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
spin_unlock(&o2hb_live_lock);
ret = wait_event_interruptible(o2hb_steady_queue,
- atomic_read(&reg->hr_steady_iterations) == 0);
+ atomic_read(&reg->hr_steady_iterations) == 0 ||
+ reg->hr_node_deleted);
if (ret) {
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
@@ -1840,6 +1808,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
goto out3;
}
+ if (reg->hr_node_deleted) {
+ ret = -EINVAL;
+ goto out3;
+ }
+
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
@@ -1870,9 +1843,9 @@ out:
return ret;
}
-static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
pid_t pid = 0;
spin_lock(&o2hb_live_lock);
@@ -1886,92 +1859,23 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
return sprintf(page, "%u\n", pid);
}
-struct o2hb_region_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_region *, char *);
- ssize_t (*store)(struct o2hb_region *, const char *, size_t);
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "block_bytes",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_block_bytes_read,
- .store = o2hb_region_block_bytes_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_start_block = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "start_block",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_start_block_read,
- .store = o2hb_region_start_block_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_blocks = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "blocks",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_blocks_read,
- .store = o2hb_region_blocks_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_dev = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dev",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_dev_read,
- .store = o2hb_region_dev_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_pid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "pid",
- .ca_mode = S_IRUGO | S_IRUSR },
- .show = o2hb_region_pid_read,
-};
+CONFIGFS_ATTR(o2hb_region_, block_bytes);
+CONFIGFS_ATTR(o2hb_region_, start_block);
+CONFIGFS_ATTR(o2hb_region_, blocks);
+CONFIGFS_ATTR(o2hb_region_, dev);
+CONFIGFS_ATTR_RO(o2hb_region_, pid);
static struct configfs_attribute *o2hb_region_attrs[] = {
- &o2hb_region_attr_block_bytes.attr,
- &o2hb_region_attr_start_block.attr,
- &o2hb_region_attr_blocks.attr,
- &o2hb_region_attr_dev.attr,
- &o2hb_region_attr_pid.attr,
+ &o2hb_region_attr_block_bytes,
+ &o2hb_region_attr_start_block,
+ &o2hb_region_attr_blocks,
+ &o2hb_region_attr_dev,
+ &o2hb_region_attr_pid,
NULL,
};
-static ssize_t o2hb_region_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_region_attr->show)
- ret = o2hb_region_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_region_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_region_attr->store)
- ret = o2hb_region_attr->store(reg, page, count);
- return ret;
-}
-
static struct configfs_item_operations o2hb_region_item_ops = {
.release = o2hb_region_release,
- .show_attribute = o2hb_region_show,
- .store_attribute = o2hb_region_store,
};
static struct config_item_type o2hb_region_type = {
@@ -2166,49 +2070,14 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-struct o2hb_heartbeat_group_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
- ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
-};
-
-static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_heartbeat_group_attr->show)
- ret = o2hb_heartbeat_group_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_heartbeat_group_attr->store)
- ret = o2hb_heartbeat_group_attr->store(reg, page, count);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
- char *page)
+static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
- const char *page,
- size_t count)
+static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned long tmp;
char *p = (char *)page;
@@ -2223,17 +2092,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
-static
-ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
- char *page)
+static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%s\n",
o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
}
-static
-ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
- const char *page, size_t count)
+static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned int i;
int ret;
@@ -2258,33 +2125,15 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
}
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dead_threshold",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_threshold_show,
- .store = o2hb_heartbeat_group_threshold_store,
-};
-
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "mode",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_mode_show,
- .store = o2hb_heartbeat_group_mode_store,
-};
+CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold.attr,
- &o2hb_heartbeat_group_attr_mode.attr,
+ &o2hb_heartbeat_group_attr_threshold,
+ &o2hb_heartbeat_group_attr_mode,
NULL,
};
-static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
- .show_attribute = o2hb_heartbeat_group_show,
- .store_attribute = o2hb_heartbeat_group_store,
-};
-
static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
.make_item = o2hb_heartbeat_group_make_item,
.drop_item = o2hb_heartbeat_group_drop_item,
@@ -2292,7 +2141,6 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 441c84e..ebe5438 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -172,9 +172,9 @@ static void o2nm_node_release(struct config_item *item)
kfree(node);
}
-static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_num_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_num);
+ return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num);
}
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
@@ -188,15 +188,16 @@ enum {
O2NM_NODE_ATTR_NUM = 0,
O2NM_NODE_ATTR_PORT,
O2NM_NODE_ATTR_ADDRESS,
- O2NM_NODE_ATTR_LOCAL,
};
-static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
+ int ret = 0;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
@@ -215,26 +216,30 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
- p = NULL;
+ ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_NUM,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
- if (p == NULL)
- return -EEXIST;
+ if (ret)
+ return ret;
return count;
}
-static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+ return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port));
}
-static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_port_store(struct config_item *item,
const char *page, size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
unsigned long tmp;
char *p = (char *)page;
@@ -247,20 +252,23 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
if (tmp >= (u16)-1)
return -ERANGE;
+ if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+ return -EBUSY;
node->nd_ipv4_port = htons(tmp);
return count;
}
-static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page)
{
- return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
+ return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address);
}
-static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
int ret, i;
struct rb_node **p, *parent;
@@ -282,6 +290,9 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
write_lock(&cluster->cl_nodes_lock);
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
rb_link_node(&node->nd_ip_node, parent, p);
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
@@ -295,14 +306,15 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
return count;
}
-static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+static ssize_t o2nm_node_local_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_local);
+ return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local);
}
-static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
@@ -349,108 +361,21 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
return count;
}
-struct o2nm_node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_node *, char *);
- ssize_t (*store)(struct o2nm_node *, const char *, size_t);
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_num = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "num",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_num_read,
- .store = o2nm_node_num_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_port",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_port_read,
- .store = o2nm_node_ipv4_port_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_address",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_address_read,
- .store = o2nm_node_ipv4_address_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_local_read,
- .store = o2nm_node_local_write,
-};
+CONFIGFS_ATTR(o2nm_node_, num);
+CONFIGFS_ATTR(o2nm_node_, ipv4_port);
+CONFIGFS_ATTR(o2nm_node_, ipv4_address);
+CONFIGFS_ATTR(o2nm_node_, local);
static struct configfs_attribute *o2nm_node_attrs[] = {
- [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
- [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
- [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
- [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+ &o2nm_node_attr_num,
+ &o2nm_node_attr_ipv4_port,
+ &o2nm_node_attr_ipv4_address,
+ &o2nm_node_attr_local,
NULL,
};
-static int o2nm_attr_index(struct configfs_attribute *attr)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
- if (attr == o2nm_node_attrs[i])
- return i;
- }
- BUG();
- return 0;
-}
-
-static ssize_t o2nm_node_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_node_attr->show)
- ret = o2nm_node_attr->show(node, page);
- return ret;
-}
-
-static ssize_t o2nm_node_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret;
- int attr_index = o2nm_attr_index(attr);
-
- if (o2nm_node_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- if (test_bit(attr_index, &node->nd_set_attributes))
- return -EBUSY;
-
- ret = o2nm_node_attr->store(node, page, count);
- if (ret < count)
- goto out;
-
- set_bit(attr_index, &node->nd_set_attributes);
-out:
- return ret;
-}
-
static struct configfs_item_operations o2nm_node_item_ops = {
.release = o2nm_node_release,
- .show_attribute = o2nm_node_show,
- .store_attribute = o2nm_node_store,
};
static struct config_item_type o2nm_node_type = {
@@ -475,12 +400,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
}
#endif
-struct o2nm_cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_cluster *, char *);
- ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
-};
-
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
@@ -501,15 +420,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
return count;
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item,
+ char *page)
{
- return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+ return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms);
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item,
+ const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -536,15 +456,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_keepalive_delay_ms_show(
+ struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+ return sprintf(page, "%u\n",
+ to_o2nm_cluster(item)->cl_keepalive_delay_ms);
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_keepalive_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -571,22 +493,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_reconnect_delay_ms_show(
+ struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+ return sprintf(page, "%u\n",
+ to_o2nm_cluster(item)->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_reconnect_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
return o2nm_cluster_attr_write(page, count,
- &cluster->cl_reconnect_delay_ms);
+ &to_o2nm_cluster(item)->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_fence_method_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_fence_method_show(
+ struct config_item *item, char *page)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret = 0;
if (cluster)
@@ -595,8 +519,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read(
return ret;
}
-static ssize_t o2nm_cluster_attr_fence_method_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_fence_method_store(
+ struct config_item *item, const char *page, size_t count)
{
unsigned int i;
@@ -608,10 +532,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write(
continue;
if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
continue;
- if (cluster->cl_fence_method != i) {
+ if (to_o2nm_cluster(item)->cl_fence_method != i) {
printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
o2nm_fence_method_desc[i]);
- cluster->cl_fence_method = i;
+ to_o2nm_cluster(item)->cl_fence_method = i;
}
return count;
}
@@ -620,79 +544,18 @@ bail:
return -EINVAL;
}
-static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "idle_timeout_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_idle_timeout_ms_read,
- .store = o2nm_cluster_attr_idle_timeout_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "keepalive_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_keepalive_delay_ms_read,
- .store = o2nm_cluster_attr_keepalive_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "reconnect_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_reconnect_delay_ms_read,
- .store = o2nm_cluster_attr_reconnect_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "fence_method",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_fence_method_read,
- .store = o2nm_cluster_attr_fence_method_write,
-};
+CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms);
+CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, fence_method);
static struct configfs_attribute *o2nm_cluster_attrs[] = {
- &o2nm_cluster_attr_idle_timeout_ms.attr,
- &o2nm_cluster_attr_keepalive_delay_ms.attr,
- &o2nm_cluster_attr_reconnect_delay_ms.attr,
- &o2nm_cluster_attr_fence_method.attr,
+ &o2nm_cluster_attr_idle_timeout_ms,
+ &o2nm_cluster_attr_keepalive_delay_ms,
+ &o2nm_cluster_attr_reconnect_delay_ms,
+ &o2nm_cluster_attr_fence_method,
NULL,
};
-static ssize_t o2nm_cluster_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_cluster_attr->show)
- ret = o2nm_cluster_attr->show(cluster, page);
- return ret;
-}
-
-static ssize_t o2nm_cluster_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret;
-
- if (o2nm_cluster_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- ret = o2nm_cluster_attr->store(cluster, page, count);
- if (ret < count)
- goto out;
-out:
- return ret;
-}
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
@@ -773,8 +636,6 @@ static void o2nm_cluster_release(struct config_item *item)
static struct configfs_item_operations o2nm_cluster_item_ops = {
.release = o2nm_cluster_release,
- .show_attribute = o2nm_cluster_show,
- .store_attribute = o2nm_cluster_store,
};
static struct config_item_type o2nm_cluster_type = {
@@ -896,7 +757,7 @@ int o2nm_depend_item(struct config_item *item)
void o2nm_undepend_item(struct config_item *item)
{
- configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+ configfs_undepend_item(item);
}
int o2nm_depend_this_node(void)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 02878a8..ffecf89 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Invalid dirblock #%llu: "
- "signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- trailer->db_signature);
+ rc = ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu has an invalid "
- "db_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid db_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu on dinode "
- "#%llu has an invalid parent_dinode "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
- "Dir Index Root # %llu has bad signature %.*s",
- (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
- 7, dx_root->dr_signature);
- return -EINVAL;
+ ret = ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s\n",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
- 7, dx_leaf->dl_signature);
- return -EROFS;
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+ 7, dx_leaf->dl_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "btree tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in btree tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in btree", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8..68c607e 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -376,17 +376,6 @@ struct dlm_lock
lksb_kernel_allocated:1;
};
-
-#define DLM_LKSB_UNUSED1 0x01
-#define DLM_LKSB_PUT_LVB 0x02
-#define DLM_LKSB_GET_LVB 0x04
-#define DLM_LKSB_UNUSED2 0x08
-#define DLM_LKSB_UNUSED3 0x10
-#define DLM_LKSB_UNUSED4 0x20
-#define DLM_LKSB_UNUSED5 0x40
-#define DLM_LKSB_UNUSED6 0x80
-
-
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
DLM_CONVERTING_LIST = 1,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7df88a6..2ee7fe7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
if (status == -ENOPROTOOPT) {
status = 0;
*response = JOIN_OK_NO_MAP;
- } else if (packet.code == JOIN_DISALLOW ||
- packet.code == JOIN_OK_NO_MAP) {
- *response = packet.code;
- } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
- mlog(ML_NOTICE,
- "This node requested DLM locking protocol %u.%u and "
- "filesystem locking protocol %u.%u. At least one of "
- "the protocol versions on node %d is not compatible, "
- "disconnecting\n",
- dlm->dlm_locking_proto.pv_major,
- dlm->dlm_locking_proto.pv_minor,
- dlm->fs_locking_proto.pv_major,
- dlm->fs_locking_proto.pv_minor,
- node);
- status = -EPROTO;
- *response = packet.code;
- } else if (packet.code == JOIN_OK) {
- *response = packet.code;
- /* Use the same locking protocol as the remote node */
- dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
- dlm->fs_locking_proto.pv_minor = packet.fs_minor;
- mlog(0,
- "Node %d responds JOIN_OK with DLM locking protocol "
- "%u.%u and fs locking protocol %u.%u\n",
- node,
- dlm->dlm_locking_proto.pv_major,
- dlm->dlm_locking_proto.pv_minor,
- dlm->fs_locking_proto.pv_major,
- dlm->fs_locking_proto.pv_minor);
} else {
- status = -EINVAL;
- mlog(ML_ERROR, "invalid response %d from node %u\n",
- packet.code, node);
+ *response = packet.code;
+ switch (packet.code) {
+ case JOIN_DISALLOW:
+ case JOIN_OK_NO_MAP:
+ break;
+ case JOIN_PROTOCOL_MISMATCH:
+ mlog(ML_NOTICE,
+ "This node requested DLM locking protocol %u.%u and "
+ "filesystem locking protocol %u.%u. At least one of "
+ "the protocol versions on node %d is not compatible, "
+ "disconnecting\n",
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor,
+ node);
+ status = -EPROTO;
+ break;
+ case JOIN_OK:
+ /* Use the same locking protocol as the remote node */
+ dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+ dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+ mlog(0,
+ "Node %d responds JOIN_OK with DLM locking protocol "
+ "%u.%u and fs locking protocol %u.%u\n",
+ node,
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor);
+ break;
+ default:
+ status = -EINVAL;
+ mlog(ML_ERROR, "invalid response %d from node %u\n",
+ packet.code, node);
+ /* Reset response to JOIN_DISALLOW */
+ *response = JOIN_DISALLOW;
+ break;
+ }
}
mlog(0, "status %d, node %d response is %d\n", status, node,
@@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+ o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+ dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
- o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
- dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
@@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
- if (status)
- goto bail;
bail:
if (status)
@@ -1860,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
int status;
unsigned int backoff;
unsigned int total_backoff = 0;
+ char wq_name[O2NM_MAX_NAME_LEN];
BUG_ON(!dlm);
@@ -1889,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+ snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
+ dlm->dlm_worker = create_singlethread_workqueue(wq_name);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41..9477d6e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
@@ -1439,6 +1439,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
int found, ret;
int set_maybe;
int dispatch_assert = 0;
+ int dispatched = 0;
if (!dlm_grab(dlm))
return DLM_MASTER_RESP_NO;
@@ -1657,16 +1658,20 @@ send_response:
if (ret < 0) {
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
+ spin_unlock(&res->spinlock);
dlm_lockres_put(res);
- } else
+ } else {
+ dispatched = 1;
__dlm_lockres_grab_inflight_worker(dlm, res);
- spin_unlock(&res->spinlock);
+ spin_unlock(&res->spinlock);
+ }
} else {
if (res)
dlm_lockres_put(res);
}
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return response;
}
@@ -2090,7 +2095,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
/* queue up work for dlm_assert_master_worker */
- dlm_grab(dlm); /* get an extra ref for the work item */
dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
item->u.am.lockres = res; /* already have a ref */
/* can optionally ignore node numbers higher than this node */
@@ -2384,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
if (test_bit(node, res->refmap)) {
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
@@ -2515,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ /* get an extra reference on the mle.
+ * otherwise the assert_master from the new
+ * master will destroy this.
+ */
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2540,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (oldmle) {
+ if (ret != -EEXIST && oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
@@ -2550,6 +2559,7 @@ fail:
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
@@ -2567,17 +2577,6 @@ fail:
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
- /* get an extra reference on the mle.
- * otherwise the assert_master from the new
- * master will destroy this.
- * also, make sure that all callers of dlm_get_mle
- * take both dlm->spinlock and dlm->master_lock */
- spin_lock(&dlm->spinlock);
- spin_lock(&dlm->master_lock);
- dlm_get_mle_inuse(mle);
- spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
-
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
@@ -2839,6 +2838,8 @@ again:
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ else
+ res->migration_pending = 0;
spin_unlock(&res->spinlock);
/*
@@ -3044,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = 0;
if (!dlm_grab(dlm))
- return -EINVAL;
+ return 0;
name = migrate->name;
namelen = migrate->namelen;
@@ -3135,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
mlog(0, "tried to migrate %.*s, but some "
"process beat me to it\n",
namelen, name);
- ret = -EEXIST;
+ spin_unlock(&tmp->spinlock);
+ return -EEXIST;
} else {
/* bad. 2 NODES are trying to migrate! */
mlog(ML_ERROR, "migration error mle: "
@@ -3306,6 +3308,15 @@ top:
mle->new_master != dead_node)
continue;
+ if (mle->new_master == dead_node && mle->inuse) {
+ mlog(ML_NOTICE, "%s: target %u died during "
+ "migration from %u, the MLE is "
+ "still keep used, ignore it!\n",
+ dlm->name, dead_node,
+ mle->master);
+ continue;
+ }
+
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ce12e0b..c5bdf02 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
mlog(0, "starting dlm recovery thread...\n");
dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
- "dlm_reco_thread");
+ "dlm_reco-%s", dlm->name);
if (IS_ERR(dlm->dlm_reco_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
dlm->dlm_reco_thread_task = NULL;
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
char *buf = NULL;
struct dlm_work_item *item = NULL;
struct dlm_lock_resource *res = NULL;
+ unsigned int hash;
if (!dlm_grab(dlm))
return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
/* lookup the lock to see if we have a secondary queue for this
* already... just add the locks in and this will have its owner
* and RECOVERY flag changed when it completes. */
- res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
}
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
} else {
+ spin_unlock(&dlm->spinlock);
/* need to allocate, just like if it was
* mastered here normally */
res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -1694,6 +1701,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
unsigned int hash;
int master = DLM_LOCK_RES_OWNER_UNKNOWN;
u32 flags = DLM_ASSERT_MASTER_REQUERY;
+ int dispatched = 0;
if (!dlm_grab(dlm)) {
/* since the domain has gone away on this
@@ -1719,9 +1727,11 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_put(dlm);
/* sender will take care of this and retry */
return ret;
- } else
+ } else {
+ dispatched = 1;
__dlm_lockres_grab_inflight_worker(dlm, res);
- spin_unlock(&res->spinlock);
+ spin_unlock(&res->spinlock);
+ }
} else {
/* put.. incase we are not the master */
spin_unlock(&res->spinlock);
@@ -1730,7 +1740,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
}
spin_unlock(&dlm->spinlock);
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return master;
}
@@ -2446,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
- if (test_bit(idx, dlm->recovery_map))
- mlog(0, "domain %s, node %u already added "
- "to recovery map!\n", dlm->name, idx);
- else
- set_bit(idx, dlm->recovery_map);
+ set_bit(idx, dlm->recovery_map);
}
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f..c5f6c24 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
@@ -483,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
{
mlog(0, "Starting dlm_thread...\n");
- dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+ dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
+ dlm->name);
if (IS_ERR(dlm->dlm_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
dlm->dlm_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9db..1082b2c 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
if (!dlm_grab(dlm))
- return DLM_REJECTED;
+ return DLM_FORWARD;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27d..03768bb 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 23157e4..f92612e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2432,12 +2432,6 @@ bail:
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS who will then
* immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
*/
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
@@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
- if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
- ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
@@ -2998,7 +2990,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
}
/* launch downconvert thread */
- osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+ osb->uuid_str);
if (IS_ERR(osb->dc_task)) {
status = PTR_ERR(osb->dc_task);
osb->dc_task = NULL;
@@ -3035,8 +3028,6 @@ local:
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
-
- status = 0;
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b..e4719e0 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0)\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 719f7f4..d631279 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -105,8 +105,11 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name, mode);
- if (file->f_mode & FMODE_WRITE)
- dquot_initialize(inode);
+ if (file->f_mode & FMODE_WRITE) {
+ status = dquot_initialize(inode);
+ if (status)
+ goto leave;
+ }
spin_lock(&oi->ip_lock);
@@ -1127,6 +1130,7 @@ out:
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0, size_change;
+ int inode_locked = 0;
struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1155,8 +1159,11 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
if (status)
return status;
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
+ if (is_quota_modification(inode, attr)) {
+ status = dquot_initialize(inode);
+ if (status)
+ return status;
+ }
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
status = ocfs2_rw_lock(inode, 1);
@@ -1172,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock_rw;
}
+ inode_locked = 1;
if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
@@ -1209,8 +1217,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
- if (!transfer_to[USRQUOTA]) {
- status = -ESRCH;
+ if (IS_ERR(transfer_to[USRQUOTA])) {
+ status = PTR_ERR(transfer_to[USRQUOTA]);
goto bail_unlock;
}
}
@@ -1218,8 +1226,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
&& OCFS2_HAS_RO_COMPAT_FEATURE(sb,
OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
- if (!transfer_to[GRPQUOTA]) {
- status = -ESRCH;
+ if (IS_ERR(transfer_to[GRPQUOTA])) {
+ status = PTR_ERR(transfer_to[GRPQUOTA]);
goto bail_unlock;
}
}
@@ -1252,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_inode_unlock(inode, 1);
+ if (status) {
+ ocfs2_inode_unlock(inode, 1);
+ inode_locked = 0;
+ }
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@@ -1268,6 +1279,8 @@ bail:
if (status < 0)
mlog_errno(status);
}
+ if (inode_locked)
+ ocfs2_inode_unlock(inode, 1);
return status;
}
@@ -1289,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
}
generic_fillattr(inode, stat);
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ stat->blocks += (stat->size + 511)>>9;
/* We set the blksize from the cluster size for performance */
stat->blksize = osb->s_clustersize;
@@ -2256,8 +2277,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
ssize_t written = 0;
ssize_t ret;
size_t count = iov_iter_count(from), orig_count;
- loff_t old_size;
- u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2265,6 +2284,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
int dropped_dio = 0;
+ int append_write = ((iocb->ki_pos + count) >=
+ i_size_read(inode) ? 1 : 0);
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,8 +2305,9 @@ relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
+ * For append write, we must take rw EX.
*/
- rw_level = (!direct_io || full_coherency);
+ rw_level = (!direct_io || full_coherency || append_write);
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
@@ -2358,13 +2380,6 @@ relock:
ocfs2_iocb_set_unaligned_aio(iocb);
}
- /*
- * To later detect whether a journal commit for sync writes is
- * necessary, we sample i_size, and cluster count here.
- */
- old_size = i_size_read(inode);
- old_clusters = OCFS2_I(inode)->ip_clusters;
-
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
@@ -2372,6 +2387,20 @@ relock:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ /*
+ * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+ * function pointer which is called when o_direct io completes so that
+ * it can unlock our rw lock.
+ * Unfortunately there are error cases which call end_io and others
+ * that don't. so we don't have to unlock the rw_lock if either an
+ * async dio is going to do it in the future or an end_io after an
+ * error has already done it.
+ */
+ if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
+ rw_level = -1;
+ unaligned_dio = 0;
+ }
+
if (unlikely(written <= 0))
goto no_sync;
@@ -2396,21 +2425,7 @@ relock:
}
no_sync:
- /*
- * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
- * function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock.
- * Unfortunately there are error cases which call end_io and others
- * that don't. so we don't have to unlock the rw_lock if either an
- * async dio is going to do it in the future or an end_io after an
- * error has already done it.
- */
- if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
- rw_level = -1;
- unaligned_dio = 0;
- }
-
- if (unaligned_dio) {
+ if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416..97a563b 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -361,6 +361,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
break;
case S_IFLNK:
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
@@ -971,6 +972,7 @@ static void ocfs2_delete_inode(struct inode *inode)
int wipe, status;
sigset_t oldset;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_delete_inode(inode->i_ino,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1027,14 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail_unlock_nfs_sync;
}
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ /* Skip inode deletion and wait for dio orphan entry recovered
+ * first */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unlock_inode;
+ }
+
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1201,19 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state &= ~I_WILL_FREE;
- return res;
+ return 1;
}
/*
@@ -1350,32 +1362,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- di->i_signature);
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
- (unsigned long long)bh->b_blocknr);
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
goto bail;
}
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: fs_generation is %u\n",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(di->i_fs_generation));
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
goto bail;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5e86b24..aac8b86 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
- wait_queue_head_t append_dio_wq;
-
struct dquot *i_dquot[MAXQUOTAS];
};
@@ -114,6 +112,8 @@ struct ocfs2_inode_info
#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3cb097c..16b0bb4 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -606,9 +606,7 @@ bail:
if (gb_inode)
mutex_unlock(&gb_inode->i_mutex);
- if (gb_inode)
- iput(gb_inode);
-
+ iput(gb_inode);
brelse(bh);
return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7c099f7..3772a2d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
mlog_errno(PTR_ERR(handle));
if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
+ ocfs2_abort(osb->sb, "Detected aborted journal\n");
handle = ERR_PTR(-EROFS);
}
} else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
mlog(ML_ERROR, "b_blocknr=%llu\n",
(unsigned long long)bh->b_blocknr);
- BUG();
+
+ lock_buffer(bh);
+ /*
+ * A previous attempt to write this buffer head failed.
+ * Nothing we can do but to retry the write and hope for
+ * the best.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ if (!buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+ unlock_buffer(bh);
}
/* Set the current transaction information on the ci so
@@ -1026,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
// up_write(&journal->j_trans_barrier);
done:
- if (inode)
- iput(inode);
+ iput(inode);
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1074,7 +1089,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
/* Launch the commit thread */
if (!local) {
osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
- "ocfs2cmt");
+ "ocfs2cmt-%s", osb->uuid_str);
if (IS_ERR(osb->commit_task)) {
status = PTR_ERR(osb->commit_task);
osb->commit_task = NULL;
@@ -1491,7 +1506,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
goto out;
osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
- "ocfs2rec");
+ "ocfs2rec-%s", osb->uuid_str);
if (IS_ERR(osb->recovery_thread_task)) {
mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
osb->recovery_thread_task = NULL;
@@ -1671,9 +1686,7 @@ done:
if (got_lock)
ocfs2_inode_unlock(inode, 1);
- if (inode)
- iput(inode);
-
+ iput(inode);
brelse(bh);
return status;
@@ -1780,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
ocfs2_inode_unlock(inode, 1);
bail:
- if (inode)
- iput(inode);
+ iput(inode);
return status;
}
@@ -2005,6 +2017,7 @@ struct ocfs2_orphan_filldir_priv {
struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
};
static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
@@ -2020,12 +2033,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
if (name_len == 2 && !strncmp("..", name, 2))
return 0;
+ /* do not include dio entry in case of orphan scan */
+ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+ (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+ return 0;
+
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
return 0;
+ if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN))
+ OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
/* Skip inodes which are already added to recover list, since dio may
* happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
@@ -2044,14 +2067,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
int slot,
- struct inode **head)
+ struct inode **head,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int status;
struct inode *orphan_dir_inode = NULL;
struct ocfs2_orphan_filldir_priv priv = {
.ctx.actor = ocfs2_orphan_filldir,
.osb = osb,
- .head = *head
+ .head = *head,
+ .orphan_reco_type = orphan_reco_type
};
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -2154,7 +2179,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
- ret = ocfs2_queue_orphans(osb, slot, &inode);
+ ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
ocfs2_clear_recovering_orphan_dir(osb, slot);
/* Error here should be noted, but we want to continue with as
@@ -2170,53 +2195,59 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto next;
- }
- /*
- * We need to take and drop the inode lock to
- * force read inode from disk.
- */
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
- goto unlock_rw;
- }
+ if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+ mutex_lock(&inode->i_mutex);
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock_mutex;
+ }
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock_rw;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
- di = (struct ocfs2_dinode *)di_bh->b_data;
+ if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto unlock_inode;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode,
+ di_bh, 0, 0);
+ if (ret)
+ mlog_errno(ret);
+ }
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+ mutex_unlock(&inode->i_mutex);
- if (inode->i_nlink == 0) {
+ /* clear dio flag in ocfs2_inode_info */
+ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+ } else {
spin_lock(&oi->ip_lock);
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
spin_unlock(&oi->ip_lock);
- } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
- (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size_read(inode));
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto unlock_inode;
- }
-
- ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
- if (ret)
- mlog_errno(ret);
+ }
- wake_up(&OCFS2_I(inode)->append_dio_wq);
- } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-unlock_inode:
- ocfs2_inode_unlock(inode, 1);
-unlock_rw:
- ocfs2_rw_unlock(inode, 1);
-next:
iput(inode);
- brelse(di_bh);
- di_bh = NULL;
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbc..e9c99e3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
brelse(alloc_bh);
- if (inode)
- iput(inode);
+ iput(inode);
trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
@@ -473,8 +472,7 @@ out_mutex:
iput(main_bm_inode);
out:
- if (local_alloc_inode)
- iput(local_alloc_inode);
+ iput(local_alloc_inode);
kfree(alloc_copy);
}
@@ -665,8 +663,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u used bits, but a count shows %u",
+ ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
@@ -1328,9 +1325,7 @@ bail:
brelse(main_bm_bh);
- if (main_bm_inode)
- iput(main_bm_inode);
-
+ iput(main_bm_inode);
kfree(alloc_copy);
if (ac)
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 6b6d092..d56f007 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -66,8 +66,11 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
* level.
*/
- flock_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ locks_lock_file_wait(file,
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
ocfs2_file_unlock(file);
}
@@ -81,7 +84,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
goto out;
}
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
if (ret)
ocfs2_file_unlock(file);
@@ -98,7 +101,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
mutex_lock(&fp->fp_mutex);
ocfs2_file_unlock(file);
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
mutex_unlock(&fp->fp_mutex);
return ret;
@@ -119,7 +122,7 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
- return flock_lock_file_wait(file, fl);
+ return locks_lock_file_wait(file, fl);
if (fl->fl_type == F_UNLCK)
return ocfs2_do_funlock(file, cmd, fl);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d..124471d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6e6abb9..ab42c38 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
-#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
-#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
@@ -200,11 +198,12 @@ bail:
static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
{
struct inode *inode;
+ int status;
inode = new_inode(dir->i_sb);
if (!inode) {
mlog(ML_ERROR, "new_inode failed!\n");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/* populate as many fields early on as possible - many of
@@ -213,7 +212,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
if (S_ISDIR(mode))
set_nlink(inode, 2);
inode_init_owner(inode, dir, mode);
- dquot_initialize(inode);
+ status = dquot_initialize(inode);
+ if (status)
+ return ERR_PTR(status);
+
return inode;
}
@@ -264,7 +266,11 @@ static int ocfs2_mknod(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long)dev, mode);
- dquot_initialize(dir);
+ status = dquot_initialize(dir);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
/* get our super block */
osb = OCFS2_SB(dir->i_sb);
@@ -311,8 +317,9 @@ static int ocfs2_mknod(struct inode *dir,
}
inode = ocfs2_get_init_inode(dir, mode);
- if (!inode) {
- status = -ENOMEM;
+ if (IS_ERR(inode)) {
+ status = PTR_ERR(inode);
+ inode = NULL;
mlog_errno(status);
goto leave;
}
@@ -360,7 +367,7 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (status) {
mlog_errno(status);
goto leave;
@@ -648,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
return status;
}
- return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
+ if (status < 0) {
+ u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
+ int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
+ inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
+ if (tmp)
+ mlog_errno(tmp);
+ }
+
+ return status;
}
static int ocfs2_mkdir(struct inode *dir,
@@ -708,7 +724,11 @@ static int ocfs2_link(struct dentry *old_dentry,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- dquot_initialize(dir);
+ err = dquot_initialize(dir);
+ if (err) {
+ mlog_errno(err);
+ return err;
+ }
err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
&parent_fe_bh, dir, 0);
@@ -896,7 +916,11 @@ static int ocfs2_unlink(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- dquot_initialize(dir);
+ status = dquot_initialize(dir);
+ if (status) {
+ mlog_errno(status);
+ return status;
+ }
BUG_ON(d_inode(dentry->d_parent) != dir);
@@ -1018,11 +1042,6 @@ leave:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (child_locked)
- ocfs2_inode_unlock(inode, 1);
-
- ocfs2_inode_unlock(dir, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
@@ -1030,6 +1049,11 @@ leave:
iput(orphan_dir);
}
+ if (child_locked)
+ ocfs2_inode_unlock(inode, 1);
+
+ ocfs2_inode_unlock(dir, 1);
+
brelse(fe_bh);
brelse(parent_node_bh);
@@ -1230,8 +1254,16 @@ static int ocfs2_rename(struct inode *old_dir,
old_dentry->d_name.len, old_dentry->d_name.name,
new_dentry->d_name.len, new_dentry->d_name.name);
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
+ status = dquot_initialize(old_dir);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dquot_initialize(new_dir);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
osb = OCFS2_SB(old_dir->i_sb);
@@ -1284,6 +1316,11 @@ static int ocfs2_rename(struct inode *old_dir,
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -1544,12 +1581,25 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_find_entry(old_dentry->d_name.name,
old_dentry->d_name.len, old_dir,
&old_entry_lookup);
- if (status)
+ if (status) {
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
+ }
status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
}
@@ -1608,21 +1658,9 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
status = 0;
bail:
- if (rename_lock)
- ocfs2_rename_unlock(osb);
-
if (handle)
ocfs2_commit_trans(osb, handle);
- if (parents_locked)
- ocfs2_double_unlock(old_dir, new_dir);
-
- if (old_child_locked)
- ocfs2_inode_unlock(old_inode, 1);
-
- if (new_child_locked)
- ocfs2_inode_unlock(new_inode, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
@@ -1630,11 +1668,22 @@ bail:
iput(orphan_dir);
}
+ if (new_child_locked)
+ ocfs2_inode_unlock(new_inode, 1);
+
+ if (old_child_locked)
+ ocfs2_inode_unlock(old_inode, 1);
+
+ if (parents_locked)
+ ocfs2_double_unlock(old_dir, new_dir);
+
+ if (rename_lock)
+ ocfs2_rename_unlock(osb);
+
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
- if (new_inode)
- iput(new_inode);
+ iput(new_inode);
ocfs2_free_dir_lookup_result(&target_lookup_res);
ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -1786,7 +1835,11 @@ static int ocfs2_symlink(struct inode *dir,
trace_ocfs2_symlink_begin(dir, dentry, symname,
dentry->d_name.len, dentry->d_name.name);
- dquot_initialize(dir);
+ status = dquot_initialize(dir);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
sb = dir->i_sb;
osb = OCFS2_SB(sb);
@@ -1831,8 +1884,9 @@ static int ocfs2_symlink(struct inode *dir,
}
inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
- if (!inode) {
- status = -ENOMEM;
+ if (IS_ERR(inode)) {
+ status = PTR_ERR(inode);
+ inode = NULL;
mlog_errno(status);
goto bail;
}
@@ -1903,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0;
newsize = l - 1;
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
@@ -2317,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
name, strlen(name));
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* find it's spot in the orphan directory */
status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
@@ -2332,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(orphan_dir_inode),
- orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* do the i_nlink dance! :) */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
@@ -2485,8 +2540,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
}
inode = ocfs2_get_init_inode(dir, mode);
- if (!inode) {
- status = -ENOMEM;
+ if (IS_ERR(inode)) {
+ status = PTR_ERR(inode);
+ inode = NULL;
mlog_errno(status);
goto leave;
}
@@ -2570,27 +2626,6 @@ leave:
return status;
}
-static int ocfs2_dio_orphan_recovered(struct inode *inode)
-{
- int ret;
- struct buffer_head *di_bh = NULL;
- struct ocfs2_dinode *di = NULL;
-
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- return 0;
- }
-
- di = (struct ocfs2_dinode *) di_bh->b_data;
- ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- return ret;
-}
-
-#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
struct inode *inode)
{
@@ -2602,7 +2637,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
handle_t *handle = NULL;
struct ocfs2_dinode *di = NULL;
-restart:
status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
mlog_errno(status);
@@ -2612,15 +2646,21 @@ restart:
di = (struct ocfs2_dinode *) di_bh->b_data;
/*
* Another append dio crashed?
- * If so, wait for recovery first.
+ * If so, manually recover it first.
*/
if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
- ocfs2_dio_orphan_recovered(inode),
- msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
- goto restart;
+ status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
}
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e173329..1155918 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -26,6 +26,9 @@
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 690ddc6..7a01262 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
};
#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index db64ce2..540ab5b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -168,7 +168,7 @@
/* Refcount tree support */
#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
-/* Discontigous block groups */
+/* Discontiguous block groups */
#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
/*
@@ -939,7 +939,7 @@ struct ocfs2_group_desc
/*
* Block groups may be discontiguous when
* OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
- * The extents of a discontigous block group are
+ * The extents of a discontiguous block group are
* stored in bg_list. It is a flat list.
* l_tree_depth must always be zero. A
* discontiguous group is signified by a non-zero
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index b6d5133..d153e6e 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -82,7 +82,7 @@ struct ocfs2_quota_chunk {
extern struct kmem_cache *ocfs2_dquot_cachep;
extern struct kmem_cache *ocfs2_qf_chunk_cachep;
-extern struct qtree_fmt_operations ocfs2_global_ops;
+extern const struct qtree_fmt_operations ocfs2_global_ops;
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb, int slot_num);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d6722..fde9ef1 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
dquot->dq_id);
}
-struct qtree_fmt_operations ocfs2_global_ops = {
+const struct qtree_fmt_operations ocfs2_global_ops = {
.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
.disk2mem_dqblk = ocfs2_global_disk2memdqb,
.is_id = ocfs2_global_is_id,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 3d0b63d..8a54fd8 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)v_block,
(unsigned long long)i_size_read(inode));
@@ -499,8 +498,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
dquot = dqget(sb,
make_kqid(&init_user_ns, type,
le64_to_cpu(dqblk->dqb_id)));
- if (!dquot) {
- status = -EIO;
+ if (IS_ERR(dquot)) {
+ status = PTR_ERR(dquot);
mlog(ML_ERROR, "Failed to get quota structure "
"for id %u, type %d. Cannot finish quota "
"file recovery.\n",
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b69dd14..2521198 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
- "Refcount block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- rb->rf_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid rf_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid "
- "rf_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2929,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
- unsigned int from, to, readahead_pages;
+ unsigned int from, to;
loff_t offset, end, map_end;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
- readahead_pages =
- (ocfs2_cow_contig_clusters(sb) <<
- OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
@@ -3106,11 +3094,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
@@ -3376,10 +3362,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- return -EROFS;
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
@@ -4419,8 +4403,9 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
}
mutex_lock(&inode->i_mutex);
- dquot_initialize(dir);
- error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
+ error = dquot_initialize(dir);
+ if (!error)
+ error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
mutex_unlock(&inode->i_mutex);
if (!error)
fsnotify_create(dir, new_dentry);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index d5da6f62..79b8021 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -54,11 +54,12 @@
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
u16 cl_cpg,
+ u16 old_bg_clusters,
int set)
{
int i;
u16 backups = 0;
- u32 cluster;
+ u32 cluster, lgd_cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
@@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
else if (gd_blkno > lgd_blkno)
break;
+ /* check if already done backup super */
+ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
+ lgd_cluster += old_bg_clusters;
+ if (lgd_cluster >= cluster)
+ continue;
+
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
@@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+ u16 old_bg_clusters;
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
@@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
@@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 1);
+ cl_cpg, old_bg_clusters, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
@@ -163,7 +172,7 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 0);
+ cl_cpg, old_bg_clusters, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e78a203..1e09592 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
if (si == NULL)
return;
- if (si->si_inode)
- iput(si->si_inode);
+ iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
@@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ /*
+ * if write block failed, invalidate slot to avoid overwrite
+ * slot during dismount in case another node rightly has mounted
+ */
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, osb->slot_num);
+ osb->slot_num = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ }
bail:
return status;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 2768eb1..ced70c8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -655,14 +655,7 @@ static int ocfs2_control_init(void)
static void ocfs2_control_exit(void)
{
- int rc;
-
- rc = misc_deregister(&ocfs2_control_device);
- if (rc)
- printk(KERN_ERR
- "ocfs2: Unable to deregister ocfs2_control device "
- "(errno %d)\n",
- -rc);
+ misc_deregister(&ocfs2_control_device);
}
static void fsdlm_lock_ast_wrapper(void *astarg)
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029..fc6d25f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
brelse(ac->ac_bh);
ac->ac_bh = NULL;
ac->ac_resv = NULL;
- if (ac->ac_find_loc_priv) {
- kfree(ac->ac_find_loc_priv);
- ac->ac_find_loc_priv = NULL;
- }
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
#define do_error(fmt, ...) \
- do{ \
- if (resize) \
- mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
- else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
- } while (0)
+do { \
+ if (resize) \
+ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
+ else \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
@@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- do_error("Group descriptor #%llu has bad signature %.*s",
+ do_error("Group descriptor #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
- do_error("Group descriptor #%llu has an invalid bg_blkno "
- "of %llu",
+ do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
- do_error("Group descriptor #%llu has an invalid "
- "fs_generation of #%u",
+ do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- do_error("Group descriptor #%llu has bit count %u but "
- "claims that %u are free",
+ do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- do_error("Group descriptor #%llu has bit count %u but "
- "max bitmap bits of %u",
+ do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (di->i_blkno != gd->bg_parent_dinode) {
- do_error("Group descriptor #%llu has bad parent "
- "pointer (%llu, expected %llu)",
+ do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- do_error("Group descriptor #%llu has bit count of %u",
+ do_error("Group descriptor #%llu has bit count of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
((le16_to_cpu(gd->bg_chain) ==
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
- do_error("Group descriptor #%llu has bad chain %u",
+ do_error("Group descriptor #%llu has bad chain %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
- "b_blocknr (%llu)",
- (unsigned long long)group_blkno,
- (unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "group block (%llu) != b_blocknr (%llu)\n",
+ (unsigned long long)group_blkno,
+ (unsigned long long) bg_bh->b_blocknr);
goto bail;
}
@@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
- (unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "Invalid chain allocator %llu\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
goto bail;
}
@@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
- "Chain allocator dinode %llu has %u used "
- "bits but only %u total.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->id1.bitmap1.i_used),
- le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used bits but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le32_to_cpu(fe->id1.bitmap1.i_used),
+ le32_to_cpu(fe->id1.bitmap1.i_total));
goto bail;
}
@@ -1939,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
if (!status) {
- hint = ocfs2_group_from_res(res);
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+ hint = res->sr_bg_blkno;
+ else
+ hint = ocfs2_group_from_res(res);
goto set_hint;
}
if (status < 0 && status != -ENOSPC) {
@@ -2429,12 +2413,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c566..faa1365 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -192,6 +192,7 @@ enum {
Opt_resv_level,
Opt_dir_resv_level,
Opt_journal_async_commit,
+ Opt_err_cont,
Opt_err,
};
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_err_cont, "errors=continue"},
{Opt_err, NULL}
};
@@ -1278,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb,
int status, user_stack = 0;
char *p;
u32 tmp;
+ int token, option;
+ substring_t args[MAX_OPT_ARGS];
trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
@@ -1296,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb,
}
while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
- substring_t args[MAX_OPT_ARGS];
-
if (!*p)
continue;
@@ -1330,10 +1331,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+ break;
+ case Opt_err_cont:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
break;
case Opt_data_ordered:
mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1356,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->atime_quantum = option;
break;
case Opt_slot:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1365,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->slot = (s16)option;
break;
case Opt_commit:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1377,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1530,6 +1537,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -1550,8 +1559,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",localflocks,");
if (osb->osb_cluster_stack[0])
- seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
- osb->osb_cluster_stack);
+ seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+ OCFS2_STACK_LABEL_LEN);
if (opts & OCFS2_MOUNT_USRQUOTA)
seq_printf(s, ",usrquota");
if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1713,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
- if (inode)
- iput(inode);
+ iput(inode);
if (status)
mlog_errno(status);
@@ -1746,8 +1754,6 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- init_waitqueue_head(&oi->append_dio_wq);
-
ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
&ocfs2_inode_caching_ops);
@@ -1760,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
@@ -2541,31 +2547,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state (default)
+ * Return EIO so that only the process errs
+ * Fix the error as if fsck.ocfs2 -y
+ * panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
- panic("OCFS2: (device %s): panic forced after error\n",
- sb->s_id);
+ int rv = 0;
ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
-
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
+ panic("OCFS2: (device %s): panic forced after error\n",
+ sb->s_id);
+ } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ } else { /* default option */
+ rv = -EROFS;
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return rv;
+
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
+
+ return rv;
}
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...)
{
struct va_format vaf;
@@ -2577,12 +2595,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
- ocfs2_handle_error(sb);
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2617,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74c..b477d0b 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...) \
+ __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
__printf(3, 4)
void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...) \
+ __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
/*
* Void signal blockers, because in-kernel sigprocmask() only fails
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 66edce7..6c2a3e3 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = {
const struct inode_operations ocfs2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 889f379..f0e241f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has bad "
- "signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- xb->xb_signature);
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an "
- "invalid xb_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an invalid "
- "xb_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
}
return 0;
@@ -550,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index)
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
-
- return handler ? handler->prefix : NULL;
+ return handler ? xattr_prefix(handler) : NULL;
}
static u32 ocfs2_xattr_name_hash(struct inode *inode,
@@ -890,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_list_entry(char *buffer, size_t size,
- size_t *result, const char *prefix,
+static int ocfs2_xattr_list_entry(struct super_block *sb,
+ char *buffer, size_t size,
+ size_t *result, int type,
const char *name, int name_len)
{
char *p = buffer + *result;
- int prefix_len = strlen(prefix);
- int total_len = prefix_len + name_len + 1;
+ const char *prefix;
+ int prefix_len;
+ int total_len;
+ switch(type) {
+ case OCFS2_XATTR_INDEX_USER:
+ if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
+ case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
+ if (!(sb->s_flags & MS_POSIXACL))
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+ break;
+ }
+
+ prefix = ocfs2_xattr_prefix(type);
+ if (!prefix)
+ return 0;
+ prefix_len = strlen(prefix);
+ total_len = prefix_len + name_len + 1;
*result += total_len;
/* we are just looking for how big our buffer needs to be */
@@ -920,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
{
size_t result = 0;
int i, type, ret;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
-
- if (prefix) {
- name = (const char *)header +
- le16_to_cpu(entry->xe_name_offset);
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
- ret = ocfs2_xattr_list_entry(buffer, buffer_size,
- &result, prefix, name,
- entry->xe_name_len);
- if (ret)
- return ret;
- }
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ buffer, buffer_size,
+ &result, type, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
}
return result;
@@ -3694,11 +3709,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3713,11 +3727,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -4041,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
int ret = 0, type;
struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
int i, block_off, new_offset;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
- if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
- bucket_xh(bucket),
- i,
- &block_off,
- &new_offset);
- if (ret)
- break;
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(bucket),
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
- name = (const char *)bucket_block(bucket, block_off) +
- new_offset;
- ret = ocfs2_xattr_list_entry(xl->buffer,
- xl->buffer_size,
- &xl->result,
- prefix, name,
- entry->xe_name_len);
- if (ret)
- break;
- }
+ name = (const char *)bucket_block(bucket, block_off) +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ type, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
}
return ret;
@@ -7234,39 +7245,22 @@ int ocfs2_init_security_and_acl(struct inode *dir,
leave:
return ret;
}
+
/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
+static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
-static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -7319,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle,
const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
.set = ocfs2_xattr_security_set,
};
@@ -7327,43 +7320,24 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
+static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
-static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
.set = ocfs2_xattr_trusted_set,
};
@@ -7371,45 +7345,24 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
-static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
@@ -7419,7 +7372,6 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
.set = ocfs2_xattr_user_set,
};
diff --git a/fs/open.c b/fs/open.c
index e33dab2..b25b154 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -377,7 +377,7 @@ retry:
* with the "noexec" flag.
*/
res = -EACCES;
- if (path.mnt->mnt_flags & MNT_NOEXEC)
+ if (path_noexec(&path))
goto out_path_release;
}
@@ -887,7 +887,7 @@ EXPORT_SYMBOL(dentry_open);
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
- int acc_mode;
+ int acc_mode = ACC_MODE(flags);
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
@@ -909,7 +909,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if (flags & __O_TMPFILE) {
if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
- acc_mode = MAY_OPEN | ACC_MODE(flags);
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
} else if (flags & O_PATH) {
@@ -919,8 +918,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
*/
flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
acc_mode = 0;
- } else {
- acc_mode = MAY_OPEN | ACC_MODE(flags);
}
op->open_flag = flags;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500..b61b883 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 758012b..eff6319 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -206,8 +206,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
- struct kstat *stat, struct iattr *attr,
- const char *link)
+ struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
@@ -251,8 +250,6 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
- if (!err && attr)
- err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
@@ -297,8 +294,7 @@ out_cleanup:
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat,
- struct iattr *attr)
+ struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
@@ -356,26 +352,19 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
- unlock_rename(workdir, upperdir);
+ /* Raced with another copy-up? Nothing to do, then... */
err = 0;
- /* Raced with another copy-up? Do the setattr here */
- if (attr) {
- mutex_lock(&upperdentry->d_inode->i_mutex);
- err = notify_change(upperdentry, attr, NULL);
- mutex_unlock(&upperdentry->d_inode->i_mutex);
- }
- goto out_put_cred;
+ goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
- stat, attr, link);
+ stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
-out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
@@ -417,7 +406,7 @@ int ovl_copy_up(struct dentry *dentry)
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
- err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
dput(parent);
dput(next);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 213a726..bf996e5 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -12,8 +12,7 @@
#include <linux/xattr.h>
#include "overlayfs.h"
-static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
- bool no_data)
+static int ovl_copy_up_truncate(struct dentry *dentry)
{
int err;
struct dentry *parent;
@@ -30,10 +29,8 @@ static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
if (err)
goto out_dput_parent;
- if (no_data)
- stat.size = 0;
-
- err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+ stat.size = 0;
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
out_dput_parent:
dput(parent);
@@ -62,13 +59,13 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
goto out;
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (!err) {
+ upperdentry = ovl_dentry_upper(dentry);
+
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
- } else {
- err = ovl_copy_up_last(dentry, attr, false);
}
ovl_drop_write(dentry);
out:
@@ -170,57 +167,23 @@ out_dput:
return err;
}
-
-struct ovl_link_data {
- struct dentry *realdentry;
- void *cookie;
-};
-
-static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
+static const char *ovl_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
- struct ovl_link_data *data = NULL;
- const char *ret;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
- if (WARN_ON(!realinode->i_op->follow_link))
+ if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
- if (realinode->i_op->put_link) {
- data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
- if (!data)
- return ERR_PTR(-ENOMEM);
- data->realdentry = realdentry;
- }
-
- ret = realinode->i_op->follow_link(realdentry, cookie);
- if (IS_ERR_OR_NULL(ret)) {
- kfree(data);
- return ret;
- }
-
- if (data)
- data->cookie = *cookie;
-
- *cookie = data;
-
- return ret;
-}
-
-static void ovl_put_link(struct inode *unused, void *c)
-{
- struct inode *realinode;
- struct ovl_link_data *data = c;
-
- if (!data)
- return;
-
- realinode = data->realdentry->d_inode;
- realinode->i_op->put_link(realinode, data->cookie);
- kfree(data);
+ return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
@@ -389,7 +352,7 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
- err = ovl_copy_up_last(dentry, NULL, true);
+ err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
@@ -417,8 +380,7 @@ static const struct inode_operations ovl_file_inode_operations = {
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
- .follow_link = ovl_follow_link,
- .put_link = ovl_put_link,
+ .get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index f3e6efe..99b4168 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -197,7 +197,6 @@ void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat,
- struct iattr *attr);
+ struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b08bf4d..a4cbdf9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -608,10 +608,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ufs = sb->s_fs_info;
- seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+ seq_show_option(m, "lowerdir", ufs->config.lowerdir);
if (ufs->config.upperdir) {
- seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
- seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ seq_show_option(m, "upperdir", ufs->config.upperdir);
+ seq_show_option(m, "workdir", ufs->config.workdir);
}
if (ufs->config.default_permissions)
seq_puts(m, ",default_permissions");
diff --git a/fs/pipe.c b/fs/pipe.c
index 8865f79..42cf8dd 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -366,18 +366,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- int error = ops->confirm(pipe, buf);
- if (error)
+ ret = ops->confirm(pipe, buf);
+ if (ret)
goto out;
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
- error = -EFAULT;
+ ret = -EFAULT;
goto out;
}
do_wakeup = 1;
- buf->len += chars;
- ret = chars;
+ buf->len += ret;
if (!iov_iter_count(from))
goto out;
}
@@ -693,17 +692,20 @@ int create_pipe_files(struct file **res, int flags)
d_instantiate(path.dentry, inode);
- err = -ENFILE;
f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
- if (IS_ERR(f))
+ if (IS_ERR(f)) {
+ err = PTR_ERR(f);
goto err_dentry;
+ }
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
f->private_data = inode->i_pipe;
res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
- if (IS_ERR(res[0]))
+ if (IS_ERR(res[0])) {
+ err = PTR_ERR(res[0]);
goto err_file;
+ }
path_get(&path);
res[0]->private_data = inode->i_pipe;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4fb17de..711dd51 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -762,8 +762,9 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
EXPORT_SYMBOL (posix_acl_to_xattr);
static int
-posix_acl_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
+posix_acl_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *value, size_t size)
{
struct posix_acl *acl;
int error;
@@ -773,7 +774,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
if (d_is_symlink(dentry))
return -EOPNOTSUPP;
- acl = get_acl(d_backing_inode(dentry), type);
+ acl = get_acl(d_backing_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -786,8 +787,9 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
}
static int
-posix_acl_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = d_backing_inode(dentry);
struct posix_acl *acl = NULL;
@@ -798,7 +800,7 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
return value ? -EACCES : 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -815,37 +817,20 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
}
}
- ret = inode->i_op->set_acl(inode, acl, type);
+ ret = inode->i_op->set_acl(inode, acl, handler->flags);
out:
posix_acl_release(acl);
return ret;
}
-static size_t
-posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+posix_acl_xattr_list(struct dentry *dentry)
{
- const char *xname;
- size_t size;
-
- if (!IS_POSIXACL(d_backing_inode(dentry)))
- return -EOPNOTSUPP;
- if (d_is_symlink(dentry))
- return -EOPNOTSUPP;
-
- if (type == ACL_TYPE_ACCESS)
- xname = POSIX_ACL_XATTR_ACCESS;
- else
- xname = POSIX_ACL_XATTR_DEFAULT;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return IS_POSIXACL(d_backing_inode(dentry));
}
const struct xattr_handler posix_acl_access_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
@@ -854,7 +839,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
const struct xattr_handler posix_acl_default_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf..d73291f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,18 +91,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
char *buf;
+ size_t size;
char tcomm[sizeof(p->comm)];
+ int ret;
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- buf = m->buf + m->count;
- /* Ignore error for now */
- buf += string_escape_str(tcomm, buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ size = seq_get_buf(m, &buf);
+ ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ seq_commit(m, ret < size ? ret : -1);
- m->count = buf - m->buf;
seq_putc(m, '\n');
}
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
const struct cred *cred;
- kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+ cap_bset, cap_ambient;
rcu_read_lock();
cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
cap_permitted = cred->cap_permitted;
cap_effective = cred->cap_effective;
cap_bset = cred->cap_bset;
+ cap_ambient = cred->cap_ambient;
rcu_read_unlock();
render_cap_t(m, "CapInh:\t", &cap_inheritable);
render_cap_t(m, "CapPrm:\t", &cap_permitted);
render_cap_t(m, "CapEff:\t", &cap_effective);
render_cap_t(m, "CapBnd:\t", &cap_bset);
+ render_cap_t(m, "CapAmb:\t", &cap_ambient);
}
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
@@ -372,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
- unsigned long vsize, eip, esp, wchan = ~0UL;
+ unsigned long vsize, eip, esp, wchan = 0;
int priority, nice;
int tty_pgrp = -1, tty_nr = 0;
sigset_t sigign, sigcatch;
@@ -504,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', wchan);
+
+ /*
+ * We used to output the absolute kernel address, but that's an
+ * information leak - so instead we show a 0/1 flag here, to signal
+ * to user-space whether there's a wchan field in /proc/PID/wchan.
+ *
+ * This works with older implementations of procps as well.
+ */
+ if (wchan)
+ seq_puts(m, " 1");
+ else
+ seq_puts(m, " 0");
+
seq_put_decimal_ull(m, ' ', 0);
seq_put_decimal_ull(m, ' ', 0);
seq_put_decimal_ll(m, ' ', task->exit_signal);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1a..2cf5d7e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (lookup_symbol_name(wchan, symname) < 0) {
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- return 0;
- seq_printf(m, "%lu", wchan);
- } else {
+ if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
- }
+ else
+ seq_putc(m, '0');
return 0;
}
@@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
+/*
+ * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+ * kernels. The effective policy is defined by oom_score_adj, which has a
+ * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+ * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+ * Processes that become oom disabled via oom_adj will still be oom disabled
+ * with this implementation.
+ *
+ * oom_adj cannot be removed since existing userspace binaries use it.
+ */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -1230,10 +1237,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page, *tmp;
- ssize_t length;
uid_t loginuid;
kuid_t kloginuid;
+ int rv;
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1248,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
}
rcu_read_unlock();
- if (count >= PAGE_SIZE)
- count = PAGE_SIZE - 1;
-
if (*ppos != 0) {
/* No partial writes. */
return -EINVAL;
}
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
- return -ENOMEM;
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free_page;
-
- page[count] = '\0';
- loginuid = simple_strtoul(page, &tmp, 10);
- if (tmp == page) {
- length = -EINVAL;
- goto out_free_page;
- }
+ rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+ if (rv < 0)
+ return rv;
/* is userspace tring to explicitly UNSET the loginuid? */
if (loginuid == AUDIT_UID_UNSET) {
kloginuid = INVALID_UID;
} else {
kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
- if (!uid_valid(kloginuid)) {
- length = -EINVAL;
- goto out_free_page;
- }
+ if (!uid_valid(kloginuid))
+ return -EINVAL;
}
- length = audit_set_loginuid(kloginuid);
- if (likely(length == 0))
- length = count;
-
-out_free_page:
- free_page((unsigned long) page);
- return length;
+ rv = audit_set_loginuid(kloginuid);
+ if (rv < 0)
+ return rv;
+ return count;
}
static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1323,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
+ char buffer[PROC_NUMBUF];
int make_it_fail;
+ int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -1345,9 +1334,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
+ rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+ if (rv < 0)
+ return rv;
if (make_it_fail < 0 || make_it_fail > 1)
return -EINVAL;
@@ -1575,12 +1564,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
return -ENOENT;
}
-static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_pid_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct path path;
int error = -EACCES;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
/* Are we allowed to snoop on the tasks file descriptors? */
if (!proc_fd_access_allowed(inode))
goto out;
@@ -1641,7 +1634,7 @@ out:
const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
+ .get_link = proc_pid_get_link,
.setattr = proc_setattr,
};
@@ -1836,8 +1829,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1864,11 +1855,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1913,7 +1899,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = {
.d_delete = pid_delete_dentry,
};
-static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path)
{
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
@@ -1957,6 +1943,31 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_get_link(dentry, inode, done);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for get_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .get_link = proc_map_files_get_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1970,9 +1981,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
return -ENOENT;
ei = PROC_I(inode);
- ei->op.proc_get_link = proc_map_files_get_link;
+ ei->op.proc_get_link = map_files_get_link;
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;
@@ -1996,10 +2007,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;
- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -2053,10 +2060,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2245,7 +2248,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2363,7 +2365,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page;
+ void *page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2378,14 +2380,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (*ppos != 0)
goto out;
- length = -ENOMEM;
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
+ page = memdup_user(buf, count);
+ if (IS_ERR(page)) {
+ length = PTR_ERR(page);
goto out;
-
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free;
+ }
/* Guard against adverse ptrace interaction */
length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
@@ -2394,10 +2393,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
length = security_setprocattr(task,
(char*)file->f_path.dentry->d_name.name,
- (void*)page, count);
+ page, count);
mutex_unlock(&task->signal->cred_guard_mutex);
out_free:
- free_page((unsigned long) page);
+ kfree(page);
out:
put_task_struct(task);
out_no_task:
@@ -2481,35 +2480,24 @@ static ssize_t proc_coredump_filter_write(struct file *file,
{
struct task_struct *task;
struct mm_struct *mm;
- char buffer[PROC_NUMBUF], *end;
unsigned int val;
int ret;
int i;
unsigned long mask;
- ret = -EFAULT;
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- goto out_no_task;
-
- ret = -EINVAL;
- val = (unsigned int)simple_strtoul(buffer, &end, 0);
- if (*end == '\n')
- end++;
- if (end - buffer == 0)
- goto out_no_task;
+ ret = kstrtouint_from_user(buf, count, 0, &val);
+ if (ret < 0)
+ return ret;
ret = -ESRCH;
task = get_proc_task(file_inode(file));
if (!task)
goto out_no_task;
- ret = end - buffer;
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
+ ret = 0;
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
@@ -2522,7 +2510,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
out_no_mm:
put_task_struct(task);
out_no_task:
- return ret;
+ if (ret < 0)
+ return ret;
+ return count;
}
static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2734,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e5fcd0..56afa5e 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -258,6 +258,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
name, len, instantiate, p,
(void *)(unsigned long)fd))
goto out_fd_loop;
+ cond_resched();
rcu_read_lock();
}
rcu_read_unlock();
@@ -291,11 +292,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
*/
int proc_fd_permission(struct inode *inode, int mask)
{
- int rv = generic_permission(inode, mask);
+ struct task_struct *p;
+ int rv;
+
+ rv = generic_permission(inode, mask);
if (rv == 0)
- return 0;
- if (task_tgid(current) == proc_pid(inode))
+ return rv;
+
+ rcu_read_lock();
+ p = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (p && same_thread_group(p, current))
rv = 0;
+ rcu_read_unlock();
+
return rv;
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c..ff3ffc7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@
#include "internal.h"
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
int rv;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
rv = __xlate_proc_name(name, ret, residual);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return rv;
}
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
{
struct inode *inode;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
if (de) {
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
d_add(dentry, inode);
return NULL;
}
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
}
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
if (!dir_emit_dots(file, ctx))
return 0;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_first(de);
i = ctx->pos - 2;
for (;;) {
if (!de) {
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 0;
}
if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
do {
struct proc_dir_entry *next;
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
if (!dir_emit(ctx, de->name, de->namelen,
de->low_ino, de->mode >> 12)) {
pde_put(de);
return 0;
}
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
ctx->pos++;
next = pde_subdir_next(de);
pde_put(de);
de = next;
} while (de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 1;
}
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
if (ret)
return ret;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_free_inum(dp->low_ino);
return -EEXIST;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return 0;
}
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return;
}
len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
rb_erase(&de->subdir_node, &parent->subdir);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
len = strlen(fn);
root = pde_subdir_find(parent, fn, len);
if (!root) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
de = next;
continue;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_entry_rundown(de);
next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
break;
pde_put(de);
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
de = next;
}
pde_put(root);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bd95b9f..42305dd 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_PANIC),
init_once);
}
@@ -393,24 +394,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
-static const char *proc_follow_link(struct dentry *dentry, void **cookie)
+static void proc_put_link(void *p)
{
- struct proc_dir_entry *pde = PDE(d_inode(dentry));
- if (unlikely(!use_pde(pde)))
- return ERR_PTR(-EINVAL);
- *cookie = pde;
- return pde->data;
+ unuse_pde(p);
}
-static void proc_put_link(struct inode *unused, void *p)
+static const char *proc_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unuse_pde(p);
+ struct proc_dir_entry *pde = PDE(inode);
+ if (unlikely(!use_pde(pde)))
+ return ERR_PTR(-EINVAL);
+ set_delayed_call(done, proc_put_link, pde);
+ return pde->data;
}
const struct inode_operations proc_link_inode_operations = {
.readlink = generic_readlink,
- .follow_link = proc_follow_link,
- .put_link = proc_put_link,
+ .get_link = proc_get_link,
};
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d3ebf2e..df4661a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
- struct vmalloc_info vmi;
long cached;
long available;
unsigned long pagecache;
@@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
if (cached < 0)
cached = 0;
- get_vmalloc_info(&vmi);
-
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
@@ -60,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
/*
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
- *
- * Free memory cannot be taken below the low watermark, before the
- * system starts swapping.
*/
- available = i.freeram - wmark_low;
+ available = i.freeram - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -191,8 +185,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
- vmi.used >> 10,
- vmi.largest_chunk >> 10
+ 0ul, // used to be vmalloc 'used'
+ 0ul // used to be vmalloc 'largest_chunk'
#ifdef CONFIG_MEMORY_FAILURE
, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index f6e8354..1dece87 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,14 +30,18 @@ static const struct proc_ns_operations *ns_entries[] = {
&mntns_operations,
};
-static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_ns_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
task = get_proc_task(inode);
if (!task)
return error;
@@ -74,7 +78,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
static const struct inode_operations proc_ns_link_inode_operations = {
.readlink = proc_ns_readlink,
- .follow_link = proc_ns_follow_link,
+ .get_link = proc_ns_get_link,
.setattr = proc_setattr,
};
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8..b2855ee 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <linux/kernel-page-flags.h>
#include <asm/uaccess.h>
#include "internal.h"
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
/* /proc/kpagecount - an array exposing page counts
*
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -97,9 +103,9 @@ u64 stable_page_flags(struct page *page)
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
- * simple test in page_mapped() is not enough.
+ * simple test in page_mapcount() is not enough.
*/
- if (!PageSlab(page) && page_mapped(page))
+ if (!PageSlab(page) && page_mapcount(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (page_is_idle(page))
+ u |= 1 << KPF_IDLE;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
.read = kpageflags_read,
};
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 ino;
+
+ pfn = src / KPMSIZE;
+ count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ if (ppage)
+ ino = page_cgroup_ino(ppage);
+ else
+ ino = 0;
+
+ if (put_user(ino, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+ .llseek = mem_lseek,
+ .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
static int __init proc_page_init(void)
{
proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+ proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
return 0;
}
fs_initcall(proc_page_init);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fdda62e..fe5b6e6 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -948,7 +948,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir,
found:
subdir->header.nreg++;
failed:
- if (unlikely(IS_ERR(subdir))) {
+ if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
pr_cont("/%*.*s %ld\n",
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68feb0f..361ab4e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
}
sb->s_flags |= MS_ACTIVE;
+ /* User space would break if executables appear on proc */
+ sb->s_iflags |= SB_I_NOEXEC;
}
return dget(sb->s_root);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 113b8d0..67e8db4 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -18,26 +18,28 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
if (!tgid)
return ERR_PTR(-ENOENT);
/* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d", tgid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_self_inode_operations = {
.readlink = proc_self_readlink,
- .follow_link = proc_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_self_get_link,
};
static unsigned self_inum;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091..65a1b6c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,8 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -21,9 +23,13 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap, ptes, pmds;
+ unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ anon = get_mm_counter(mm, MM_ANONPAGES);
+ file = get_mm_counter(mm, MM_FILEPAGES);
+ shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
@@ -34,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
- hiwater_rss = total_rss = get_mm_rss(mm);
+ hiwater_rss = total_rss = anon + file + shmem;
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
- data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -51,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
+ "RssAnon:\t%8lu kB\n"
+ "RssFile:\t%8lu kB\n"
+ "RssShmem:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
@@ -64,11 +72,15 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
- data << (PAGE_SHIFT-10),
+ anon << (PAGE_SHIFT-10),
+ file << (PAGE_SHIFT-10),
+ shmem << (PAGE_SHIFT-10),
+ mm->data_vm << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
ptes >> 10,
pmds >> 10,
swap << (PAGE_SHIFT-10));
+ hugetlb_report_usage(m, mm);
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -80,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES);
+ *shared = get_mm_counter(mm, MM_FILEPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
- *data = mm->total_vm - mm->shared_vm;
+ *data = mm->data_vm + mm->stack_vm;
*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -445,41 +458,73 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long shared_hugetlb;
+ unsigned long private_hugetlb;
u64 pss;
+ u64 swap_pss;
+ bool check_shmem_swap;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? HPAGE_PMD_NR : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || PageReferenced(page))
+ if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page mapped with PTE it would elevate
+ * page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
+ }
+
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
}
}
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+
+ mss->swap += shmem_partial_swap_usage(
+ walk->vma->vm_file->f_mapping, addr, end);
+
+ return 0;
+}
+#endif
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -492,15 +537,40 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+ && pte_none(*pte))) {
+ page = find_get_entry(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
+ if (!page)
+ return;
+
+ if (radix_tree_exceptional_entry(page))
+ mss->swap += PAGE_SIZE;
+ else
+ page_cache_release(page);
+
+ return;
}
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -516,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -533,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
@@ -597,6 +666,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
[ilog2(VM_MERGEABLE)] = "mg",
+ [ilog2(VM_UFFD_MISSING)]= "um",
+ [ilog2(VM_UFFD_WP)] = "uw",
};
size_t i;
@@ -610,17 +681,74 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
seq_putc(m, '\n');
}
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page = NULL;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ }
+ if (page) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2)
+ mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
+ return 0;
+}
+#endif /* HUGETLB_PAGE */
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+ .hugetlb_entry = smaps_hugetlb_range,
+#endif
.mm = vma->vm_mm,
.private = &mss,
};
memset(&mss, 0, sizeof mss);
+
+#ifdef CONFIG_SHMEM
+ if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+ /*
+ * For shared or readonly shmem mappings we know that all
+ * swapped out pages belong to the shmem object, and we can
+ * obtain the swap value much more efficiently. For private
+ * writable mappings, we might have COW pages that are
+ * not affected by the parent swapped out pages of the shmem
+ * object, so we have to distinguish them during the page walk.
+ * Unless we know that the shmem object (or the part mapped by
+ * our VMA) has no swapped out pages at all.
+ */
+ unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+ if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE)) {
+ mss.swap = shmem_swapped;
+ } else {
+ mss.check_shmem_swap = true;
+ smaps_walk.pte_hole = smaps_pte_hole;
+ }
+ }
+#endif
+
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
@@ -637,7 +765,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %7lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -651,7 +782,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
+ mss.shared_hugetlb >> 10,
+ mss.private_hugetlb >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -710,23 +844,6 @@ const struct file_operations proc_tid_smaps_operations = {
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the
- * new API and those page-shift bits change their meaning.
- * The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- * of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
@@ -753,36 +870,34 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
+ ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
ptent = pte_wrprotect(ptent);
- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ ptent = pte_clear_soft_dirty(ptent);
+ ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
}
-
- set_pte_at(vma->vm_mm, addr, pte, ptent);
}
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+#endif
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = *pmdp;
+ pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
pmd = pmd_wrprotect(pmd);
- pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
-
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
+ pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
-
#else
-
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -798,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
@@ -808,6 +923,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
out:
spin_unlock(ptl);
@@ -835,6 +951,7 @@ out:
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -887,13 +1004,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
- " See the linux/Documentation/vm/pagemap.txt for "
- "details.\n");
- }
-
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -961,36 +1071,26 @@ typedef struct {
struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
- bool v2;
+ bool show_pfn;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
#define PM_END_OF_BUFFER 1
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
- return (pagemap_entry_t) { .pme = val };
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1011,7 +1111,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ pagemap_entry_t pme = make_pme(0, 0);
/* End of address space hole, which we mark as non-present. */
unsigned long hole_end;
@@ -1031,7 +1131,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
/* Addresses in the VMA. */
if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
@@ -1042,67 +1142,42 @@ out:
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- u64 frame, flags;
+ u64 frame = 0, flags = 0;
struct page *page = NULL;
- int flags2 = 0;
if (pte_present(pte)) {
- frame = pte_pfn(pte);
- flags = PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
- flags = PM_SWAP;
+ flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
- } else {
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY))
- flags2 |= __PM_SOFT_DIRTY;
-
- *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
-}
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
- /*
- * Currently pmd for thp is always present because thp can not be
- * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd))
- *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
+ return make_pme(frame, flags);
}
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
-}
-#endif
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
@@ -1111,41 +1186,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
- pmd_flags2 = __PM_SOFT_DIRTY;
- else
- pmd_flags2 = 0;
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+
+ /*
+ * Currently pmd for thp is always present because thp
+ * can not be swapped-out, migrated, or HWPOISONed
+ * (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd)) {
+ struct page *page = pmd_page(pmd);
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- unsigned long offset;
- pagemap_entry_t pme;
+ pagemap_entry_t pme = make_pme(frame, flags);
- offset = (addr & ~PAGEMAP_WALK_MASK) >>
- PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
spin_unlock(ptl);
return err;
}
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_unstable(pmdp))
return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -1158,40 +1250,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset, int flags2)
-{
- if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
- PM_STATUS2(pm->v2, flags2) |
- PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
- PM_STATUS2(pm->v2, flags2));
-}
-
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
int err = 0;
- int flags2;
- pagemap_entry_t pme;
+ pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ pagemap_entry_t pme = make_pme(frame, flags);
+
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
cond_resched();
@@ -1209,7 +1305,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 56 page exclusively mapped
+ * Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1227,42 +1325,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
- int copied = 0;
+ int ret = 0, copied = 0;
- if (!task)
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
goto out;
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_task;
+ goto out_mm;
ret = 0;
if (!count)
- goto out_task;
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
- pm.v2 = soft_dirty_cleared;
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
- goto out_task;
-
- mm = mm_access(task, PTRACE_MODE_READ);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
+ goto out_mm;
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pmd_entry = pagemap_pmd_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1273,10 +1366,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
- end_vaddr = TASK_SIZE_OF(task);
+ end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ if (svpfn > mm->task_size >> PAGE_SHIFT)
start_vaddr = end_vaddr;
/*
@@ -1303,7 +1396,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
len = min(count, PM_ENTRY_BYTES * pm.pos);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
- goto out_mm;
+ goto out_free;
}
copied += len;
buf += len;
@@ -1313,24 +1406,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!ret || ret == PM_END_OF_BUFFER)
ret = copied;
-out_mm:
- mmput(mm);
out_free:
kfree(pm.buffer);
-out_task:
- put_task_struct(task);
+out_mm:
+ mmput(mm);
out:
return ret;
}
static int pagemap_open(struct inode *inode, struct file *file)
{
- /* do not disclose physical addresses: attack vector */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
- "to stop being page-shift some time soon. See the "
- "linux/Documentation/vm/pagemap.txt for details.\n");
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
return 0;
}
@@ -1338,6 +1438,7 @@ const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
+ .release = pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
@@ -1418,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 947b0f4..9eacd59 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -19,26 +19,29 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_thread_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
if (!pid)
return ERR_PTR(-ENOENT);
- name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
+ dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d/task/%d", tgid, pid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_thread_self_inode_operations = {
.readlink = proc_thread_self_readlink,
- .follow_link = proc_thread_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_thread_self_get_link,
};
static unsigned thread_self_inum;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8ebd9a3..2256e7e 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -95,9 +95,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
{
struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
- int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
+ int err;
if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt_path.dentry);
@@ -131,16 +131,17 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
struct mount *r = real_mount(mnt);
struct super_block *sb = mnt->mnt_sb;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
- int err = 0;
+ int err;
seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path)
+ if (sb->s_op->show_path) {
err = sb->s_op->show_path(m, mnt->mnt_root);
- else
+ if (err)
+ goto out;
+ } else {
seq_dentry(m, mnt->mnt_root, " \t\n\\");
- if (err)
- goto out;
+ }
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -168,12 +169,13 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, " - ");
show_type(m, sb);
seq_putc(m, ' ');
- if (sb->s_op->show_devname)
+ if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt->mnt_root);
- else
+ if (err)
+ goto out;
+ } else {
mangle(m, r->mnt_devname ? r->mnt_devname : "none");
- if (err)
- goto out;
+ }
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
err = show_sb_opts(m, sb);
if (err)
@@ -191,7 +193,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
struct mount *r = real_mount(mnt);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
- int err = 0;
+ int err;
/* device */
if (sb->s_op->show_devname) {
@@ -220,8 +222,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
/* optional statistics */
if (sb->s_op->show_stats) {
seq_putc(m, ' ');
- if (!err)
- err = sb->s_op->show_stats(m, mnt_path.dentry);
+ err = sb->s_op->show_stats(m, mnt_path.dentry);
}
seq_putc(m, '\n');
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 916b8e2..360ae43 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,5 +1,5 @@
config PSTORE
- bool "Persistent store support"
+ tristate "Persistent store support"
default n
select ZLIB_DEFLATE
select ZLIB_INFLATE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index e647d8e..b8803cc 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -2,12 +2,12 @@
# Makefile for the linux pstorefs routines.
#
-obj-y += pstore.o
+obj-$(CONFIG_PSTORE) += pstore.o
pstore-objs += inode.o platform.o
-obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
+pstore-$(CONFIG_PSTORE_FTRACE) += ftrace.o
-obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
+pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o
ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 76a4eeb..d488770 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -104,22 +104,23 @@ static const struct file_operations pstore_knob_fops = {
.write = pstore_ftrace_knob_write,
};
+static struct dentry *pstore_ftrace_dir;
+
void pstore_register_ftrace(void)
{
- struct dentry *dir;
struct dentry *file;
if (!psinfo->write_buf)
return;
- dir = debugfs_create_dir("pstore", NULL);
- if (!dir) {
+ pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
+ if (!pstore_ftrace_dir) {
pr_err("%s: unable to create pstore directory\n", __func__);
return;
}
- file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
- &pstore_knob_fops);
+ file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir,
+ NULL, &pstore_knob_fops);
if (!file) {
pr_err("%s: unable to create record_ftrace file\n", __func__);
goto err_file;
@@ -127,5 +128,17 @@ void pstore_register_ftrace(void)
return;
err_file:
- debugfs_remove(dir);
+ debugfs_remove(pstore_ftrace_dir);
+}
+
+void pstore_unregister_ftrace(void)
+{
+ mutex_lock(&pstore_ftrace_lock);
+ if (pstore_ftrace_enabled) {
+ unregister_ftrace_function(&pstore_ftrace_ops);
+ pstore_ftrace_enabled = 0;
+ }
+ mutex_unlock(&pstore_ftrace_lock);
+
+ debugfs_remove_recursive(pstore_ftrace_dir);
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 3adcc46..d8c439d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,7 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
}
static const struct file_operations pstore_file_operations = {
+ .owner = THIS_MODULE,
.open = pstore_file_open,
.read = pstore_file_read,
.llseek = pstore_file_llseek,
@@ -287,7 +288,7 @@ static const struct super_operations pstore_ops = {
static struct super_block *pstore_sb;
-int pstore_is_mounted(void)
+bool pstore_is_mounted(void)
{
return pstore_sb != NULL;
}
@@ -456,6 +457,7 @@ static void pstore_kill_sb(struct super_block *sb)
}
static struct file_system_type pstore_fs_type = {
+ .owner = THIS_MODULE,
.name = "pstore",
.mount = pstore_mount,
.kill_sb = pstore_kill_sb,
@@ -479,5 +481,12 @@ out:
}
module_init(init_pstore_fs)
+static void __exit exit_pstore_fs(void)
+{
+ unregister_filesystem(&pstore_fs_type);
+ sysfs_remove_mount_point(fs_kobj, "pstore");
+}
+module_exit(exit_pstore_fs)
+
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index c36ba2c..e38a22b 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -41,14 +41,18 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
+extern void pstore_unregister_ftrace(void);
#else
static inline void pstore_register_ftrace(void) {}
+static inline void pstore_unregister_ftrace(void) {}
#endif
#ifdef CONFIG_PSTORE_PMSG
extern void pstore_register_pmsg(void);
+extern void pstore_unregister_pmsg(void);
#else
static inline void pstore_register_pmsg(void) {}
+static inline void pstore_unregister_pmsg(void) {}
#endif
extern struct pstore_info *psinfo;
@@ -59,6 +63,6 @@ extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
int count, char *data, bool compressed,
size_t size, struct timespec time,
struct pstore_info *psi);
-extern int pstore_is_mounted(void);
+extern bool pstore_is_mounted(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 791743d..588461b 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -237,6 +237,14 @@ static void allocate_buf_for_compression(void)
}
+static void free_buf_for_compression(void)
+{
+ kfree(stream.workspace);
+ stream.workspace = NULL;
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+}
+
/*
* Called when compression fails, since the printk buffer
* would be fetched for compression calling it again when
@@ -353,6 +361,19 @@ static struct kmsg_dumper pstore_dumper = {
.dump = pstore_dump,
};
+/*
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
+static void pstore_register_kmsg(void)
+{
+ kmsg_dump_register(&pstore_dumper);
+}
+
+static void pstore_unregister_kmsg(void)
+{
+ kmsg_dump_unregister(&pstore_dumper);
+}
+
#ifdef CONFIG_PSTORE_CONSOLE
static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
@@ -390,8 +411,14 @@ static void pstore_register_console(void)
{
register_console(&pstore_console);
}
+
+static void pstore_unregister_console(void)
+{
+ unregister_console(&pstore_console);
+}
#else
static void pstore_register_console(void) {}
+static void pstore_unregister_console(void) {}
#endif
static int pstore_write_compat(enum pstore_type_id type,
@@ -410,8 +437,6 @@ static int pstore_write_compat(enum pstore_type_id type,
* read function right away to populate the file system. If not
* then the pstore mount code will call us later to fill out
* the file system.
- *
- * Register with kmsg_dump to save last part of console log on panic.
*/
int pstore_register(struct pstore_info *psi)
{
@@ -442,7 +467,7 @@ int pstore_register(struct pstore_info *psi)
if (pstore_is_mounted())
pstore_get_records(0);
- kmsg_dump_register(&pstore_dumper);
+ pstore_register_kmsg();
if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
pstore_register_console();
@@ -462,12 +487,28 @@ int pstore_register(struct pstore_info *psi)
*/
backend = psi->name;
+ module_put(owner);
+
pr_info("Registered %s as persistent store backend\n", psi->name);
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
+void pstore_unregister(struct pstore_info *psi)
+{
+ pstore_unregister_pmsg();
+ pstore_unregister_ftrace();
+ pstore_unregister_console();
+ pstore_unregister_kmsg();
+
+ free_buf_for_compression();
+
+ psinfo = NULL;
+ backend = NULL;
+}
+EXPORT_SYMBOL_GPL(pstore_unregister);
+
/*
* Read all the records from the persistent store. Create
* files in our filesystem. Don't warn about -EEXIST errors
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index feb5dd2..7de20cd 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -37,6 +37,8 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
buffer = vmalloc(buffer_size);
+ if (!buffer)
+ return -ENOMEM;
mutex_lock(&pmsg_lock);
for (i = 0; i < count; ) {
@@ -112,3 +114,10 @@ err_class:
err:
return;
}
+
+void pstore_unregister_pmsg(void)
+{
+ device_destroy(pmsg_class, MKDEV(pmsg_major, 0));
+ class_destroy(pmsg_class);
+ unregister_chrdev(pmsg_major, PMSG_NAME);
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 6c26c4d..319c3a6 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -578,30 +578,27 @@ fail_out:
return err;
}
-static int __exit ramoops_remove(struct platform_device *pdev)
+static int ramoops_remove(struct platform_device *pdev)
{
-#if 0
- /* TODO(kees): We cannot unload ramoops since pstore doesn't support
- * unregistering yet.
- */
struct ramoops_context *cxt = &oops_cxt;
- iounmap(cxt->virt_addr);
- release_mem_region(cxt->phys_addr, cxt->size);
+ pstore_unregister(&cxt->pstore);
cxt->max_dump_cnt = 0;
- /* TODO(kees): When pstore supports unregistering, call it here. */
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
+ persistent_ram_free(cxt->mprz);
+ persistent_ram_free(cxt->fprz);
+ persistent_ram_free(cxt->cprz);
+ ramoops_free_przs(cxt);
+
return 0;
-#endif
- return -EBUSY;
}
static struct platform_driver ramoops_driver = {
.probe = ramoops_probe,
- .remove = __exit_p(ramoops_remove),
+ .remove = ramoops_remove,
.driver = {
.name = "ramoops",
},
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index c4bcb77..3a67cfb 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &qnx4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx4_aops;
qnx4_i(inode)->mmu_private = inode->i_size;
} else {
@@ -364,7 +365,7 @@ static int init_inodecache(void)
qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
sizeof(struct qnx4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (qnx4_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 32d2e1a..47bb1de 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mapping->a_ops = &qnx6_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx6_aops;
} else
init_special_inode(inode, inode->i_mode, 0);
@@ -624,7 +625,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 20d1f74..fbd70af 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -247,7 +247,7 @@ struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);
static qsize_t inode_get_rsv_space(struct inode *inode);
-static void __dquot_initialize(struct inode *inode, int type);
+static int __dquot_initialize(struct inode *inode, int type);
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
@@ -832,16 +832,17 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
struct dquot *dqget(struct super_block *sb, struct kqid qid)
{
unsigned int hashent = hashfn(sb, qid);
- struct dquot *dquot = NULL, *empty = NULL;
+ struct dquot *dquot, *empty = NULL;
if (!sb_has_quota_active(sb, qid.type))
- return NULL;
+ return ERR_PTR(-ESRCH);
we_slept:
spin_lock(&dq_list_lock);
spin_lock(&dq_state_lock);
if (!sb_has_quota_active(sb, qid.type)) {
spin_unlock(&dq_state_lock);
spin_unlock(&dq_list_lock);
+ dquot = ERR_PTR(-ESRCH);
goto out;
}
spin_unlock(&dq_state_lock);
@@ -876,11 +877,15 @@ we_slept:
* already finished or it will be canceled due to dq_count > 1 test */
wait_on_dquot(dquot);
/* Read the dquot / allocate space in quota file */
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) &&
- sb->dq_op->acquire_dquot(dquot) < 0) {
- dqput(dquot);
- dquot = NULL;
- goto out;
+ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+ int err;
+
+ err = sb->dq_op->acquire_dquot(dquot);
+ if (err < 0) {
+ dqput(dquot);
+ dquot = ERR_PTR(err);
+ goto out;
+ }
}
#ifdef CONFIG_QUOTA_DEBUG
BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
@@ -923,7 +928,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
int reserved = 0;
#endif
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -934,7 +939,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -946,15 +951,15 @@ static void add_dquot_ref(struct super_block *sb, int type)
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock We cannot iput the inode now as we can be
+ * s_inode_list_lock. We cannot iput the inode now as we can be
* holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
old_inode = inode;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
#ifdef CONFIG_QUOTA_DEBUG
@@ -1023,7 +1028,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
struct inode *inode;
int reserved = 0;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
/*
* We have to scan also I_NEW inodes because they can already
@@ -1039,7 +1044,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
}
spin_unlock(&dq_data_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
if (reserved) {
printk(KERN_WARNING "VFS (%s): Writes happened after quota"
@@ -1390,15 +1395,16 @@ static int dquot_active(const struct inode *inode)
* It is better to call this function outside of any transaction as it
* might need a lot of space in journal for dquot structure allocation.
*/
-static void __dquot_initialize(struct inode *inode, int type)
+static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
struct dquot **dquots, *got[MAXQUOTAS];
struct super_block *sb = inode->i_sb;
qsize_t rsv;
+ int ret = 0;
if (!dquot_active(inode))
- return;
+ return 0;
dquots = i_dquot(inode);
@@ -1407,6 +1413,7 @@ static void __dquot_initialize(struct inode *inode, int type)
struct kqid qid;
kprojid_t projid;
int rc;
+ struct dquot *dquot;
got[cnt] = NULL;
if (type != -1 && cnt != type)
@@ -1438,16 +1445,25 @@ static void __dquot_initialize(struct inode *inode, int type)
qid = make_kqid_projid(projid);
break;
}
- got[cnt] = dqget(sb, qid);
+ dquot = dqget(sb, qid);
+ if (IS_ERR(dquot)) {
+ /* We raced with somebody turning quotas off... */
+ if (PTR_ERR(dquot) != -ESRCH) {
+ ret = PTR_ERR(dquot);
+ goto out_put;
+ }
+ dquot = NULL;
+ }
+ got[cnt] = dquot;
}
/* All required i_dquot has been initialized */
if (!init_needed)
- return;
+ return 0;
spin_lock(&dq_data_lock);
if (IS_NOQUOTA(inode))
- goto out_err;
+ goto out_lock;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
@@ -1469,15 +1485,18 @@ static void __dquot_initialize(struct inode *inode, int type)
dquot_resv_space(dquots[cnt], rsv);
}
}
-out_err:
+out_lock:
spin_unlock(&dq_data_lock);
+out_put:
/* Drop unused references */
dqput_all(got);
+
+ return ret;
}
-void dquot_initialize(struct inode *inode)
+int dquot_initialize(struct inode *inode)
{
- __dquot_initialize(inode, -1);
+ return __dquot_initialize(inode, -1);
}
EXPORT_SYMBOL(dquot_initialize);
@@ -1961,18 +1980,37 @@ EXPORT_SYMBOL(__dquot_transfer);
int dquot_transfer(struct inode *inode, struct iattr *iattr)
{
struct dquot *transfer_to[MAXQUOTAS] = {};
+ struct dquot *dquot;
struct super_block *sb = inode->i_sb;
int ret;
if (!dquot_active(inode))
return 0;
- if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid))
- transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(iattr->ia_uid));
- if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))
- transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(iattr->ia_gid));
-
+ if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
+ dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
+ if (IS_ERR(dquot)) {
+ if (PTR_ERR(dquot) != -ESRCH) {
+ ret = PTR_ERR(dquot);
+ goto out_put;
+ }
+ dquot = NULL;
+ }
+ transfer_to[USRQUOTA] = dquot;
+ }
+ if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
+ dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
+ if (IS_ERR(dquot)) {
+ if (PTR_ERR(dquot) != -ESRCH) {
+ ret = PTR_ERR(dquot);
+ goto out_put;
+ }
+ dquot = NULL;
+ }
+ transfer_to[GRPQUOTA] = dquot;
+ }
ret = __dquot_transfer(inode, transfer_to);
+out_put:
dqput_all(transfer_to);
return ret;
}
@@ -2518,8 +2556,8 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
struct dquot *dquot;
dquot = dqget(sb, qid);
- if (!dquot)
- return -ESRCH;
+ if (IS_ERR(dquot))
+ return PTR_ERR(dquot);
do_get_dqblk(dquot, di);
dqput(dquot);
@@ -2631,8 +2669,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
int rc;
dquot = dqget(sb, qid);
- if (!dquot) {
- rc = -ESRCH;
+ if (IS_ERR(dquot)) {
+ rc = PTR_ERR(dquot);
goto out;
}
rc = do_set_dqblk(dquot, di);
@@ -2886,4 +2924,4 @@ static int __init dquot_init(void)
return 0;
}
-module_init(dquot_init);
+fs_initcall(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index bb2869f..d07a2f9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -1,7 +1,5 @@
-
#include <linux/cred.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/quotaops.h>
#include <linux/sched.h>
@@ -105,5 +103,4 @@ static int __init quota_init(void)
"VFS: Failed to create quota netlink interface.\n");
return 0;
};
-
-module_init(quota_init);
+fs_initcall(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 86ded73..3746367 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -141,9 +141,9 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
if (tstate->flags & QCI_ROOT_SQUASH)
uinfo.dqi_flags |= DQF_ROOT_SQUASH;
uinfo.dqi_valid = IIF_ALL;
- if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo)))
+ if (copy_to_user(addr, &uinfo, sizeof(uinfo)))
return -EFAULT;
- return ret;
+ return 0;
}
static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 2aa012a..ed85d4f 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
static int v2r1_is_id(void *dp, struct dquot *dquot);
-static struct qtree_fmt_operations v2r0_qtree_ops = {
+static const struct qtree_fmt_operations v2r0_qtree_ops = {
.mem2disk_dqblk = v2r0_mem2diskdqb,
.disk2mem_dqblk = v2r0_disk2memdqb,
.is_id = v2r0_is_id,
};
-static struct qtree_fmt_operations v2r1_qtree_ops = {
+static const struct qtree_fmt_operations v2r1_qtree_ops = {
.mem2disk_dqblk = v2r1_mem2diskdqb,
.disk2mem_dqblk = v2r1_disk2memdqb,
.is_id = v2r1_is_id,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ba1323a..a586467 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,6 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
unsigned order;
void *data;
int ret;
+ gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
/* make various checks */
order = get_order(newsize);
@@ -84,7 +85,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* allocate enough contiguous pages to be able to satisfy the
* request */
- pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+ pages = alloc_pages(gfp, order);
if (!pages)
return -ENOMEM;
@@ -108,7 +109,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
struct page *page = pages + loop;
ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
- GFP_KERNEL);
+ gfp);
if (ret < 0)
goto add_error;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 889d558..38981b0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3f..06b07d5 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -171,6 +172,45 @@ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t si
EXPORT_SYMBOL(fixed_size_llseek);
/**
+ * no_seek_end_llseek - llseek implementation for fixed-sized devices
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ *
+ */
+loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR:
+ return generic_file_llseek_size(file, offset, whence,
+ ~0ULL, 0);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(no_seek_end_llseek);
+
+/**
+ * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @size: maximal offset allowed
+ *
+ */
+loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR:
+ return generic_file_llseek_size(file, offset, whence,
+ size, 0);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(no_seek_end_llseek_size);
+
+/**
* noop_llseek - No Operation Performed llseek implementation
* @file: file structure to seek on
* @offset: file offset to seek to
@@ -395,9 +435,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
}
if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- retval = locks_mandatory_area(
- read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
- inode, file, pos, count);
+ retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
+ read_write == READ ? F_RDLCK : F_WRLCK);
if (retval < 0)
return retval;
}
@@ -1327,3 +1366,299 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows return partial success. When it does so is up to
+ * the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ ssize_t ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+
+ if (len == 0)
+ return 0;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = -EOPNOTSUPP;
+ if (file_out->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+ pos_out, len, flags);
+ if (ret == -EOPNOTSUPP)
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret = -EBADF;
+
+ f_in = fdget(fd_in);
+ if (!f_in.file)
+ goto out2;
+
+ f_out = fdget(fd_out);
+ if (!f_out.file)
+ goto out1;
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+
+out:
+ fdput(f_out);
+out1:
+ fdput(f_in);
+out2:
+ return ret;
+}
+
+static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+{
+ struct inode *inode = file_inode(file);
+
+ if (unlikely(pos < 0))
+ return -EINVAL;
+
+ if (unlikely((loff_t) (pos + len) < 0))
+ return -EINVAL;
+
+ if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
+ loff_t end = len ? pos + len - 1 : OFFSET_MAX;
+ int retval;
+
+ retval = locks_mandatory_area(inode, file, pos, end,
+ write ? F_WRLCK : F_RDLCK);
+ if (retval < 0)
+ return retval;
+ }
+
+ return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+}
+
+int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ int ret;
+
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND) ||
+ !file_in->f_op->clone_file_range)
+ return -EBADF;
+
+ ret = clone_verify_area(file_in, pos_in, len, false);
+ if (ret)
+ return ret;
+
+ ret = clone_verify_area(file_out, pos_out, len, true);
+ if (ret)
+ return ret;
+
+ if (pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->clone_file_range(file_in, pos_in,
+ file_out, pos_out, len);
+ if (!ret) {
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
+ }
+
+ mnt_drop_write_file(file_out);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_clone_file_range);
+
+int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+{
+ struct file_dedupe_range_info *info;
+ struct inode *src = file_inode(file);
+ u64 off;
+ u64 len;
+ int i;
+ int ret;
+ bool is_admin = capable(CAP_SYS_ADMIN);
+ u16 count = same->dest_count;
+ struct file *dst_file;
+ loff_t dst_off;
+ ssize_t deduped;
+
+ if (!(file->f_mode & FMODE_READ))
+ return -EINVAL;
+
+ if (same->reserved1 || same->reserved2)
+ return -EINVAL;
+
+ off = same->src_offset;
+ len = same->src_length;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode))
+ goto out;
+
+ ret = -EINVAL;
+ if (!S_ISREG(src->i_mode))
+ goto out;
+
+ ret = clone_verify_area(file, off, len, false);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+ /* pre-format output fields to sane values */
+ for (i = 0; i < count; i++) {
+ same->info[i].bytes_deduped = 0ULL;
+ same->info[i].status = FILE_DEDUPE_RANGE_SAME;
+ }
+
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct inode *dst;
+ struct fd dst_fd = fdget(info->dest_fd);
+
+ dst_file = dst_fd.file;
+ if (!dst_file) {
+ info->status = -EBADF;
+ goto next_loop;
+ }
+ dst = file_inode(dst_file);
+
+ ret = mnt_want_write_file(dst_file);
+ if (ret) {
+ info->status = ret;
+ goto next_loop;
+ }
+
+ dst_off = info->dest_offset;
+ ret = clone_verify_area(dst_file, dst_off, len, true);
+ if (ret < 0) {
+ info->status = ret;
+ goto next_file;
+ }
+ ret = 0;
+
+ if (info->reserved) {
+ info->status = -EINVAL;
+ } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+ info->status = -EINVAL;
+ } else if (file->f_path.mnt != dst_file->f_path.mnt) {
+ info->status = -EXDEV;
+ } else if (S_ISDIR(dst->i_mode)) {
+ info->status = -EISDIR;
+ } else if (dst_file->f_op->dedupe_file_range == NULL) {
+ info->status = -EINVAL;
+ } else {
+ deduped = dst_file->f_op->dedupe_file_range(file, off,
+ len, dst_file,
+ info->dest_offset);
+ if (deduped == -EBADE)
+ info->status = FILE_DEDUPE_RANGE_DIFFERS;
+ else if (deduped < 0)
+ info->status = deduped;
+ else
+ info->bytes_deduped += deduped;
+ }
+
+next_file:
+ mnt_drop_write_file(dst_file);
+next_loop:
+ fdput(dst_fd);
+ }
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f6f2fba..ae9e5b3 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_fop = &reiserfs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
} else {
inode->i_blocks = 0;
@@ -3319,8 +3320,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
/* must be turned off for recursive notify_change calls */
ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
- if (is_quota_modification(inode, attr))
- dquot_initialize(inode);
+ if (is_quota_modification(inode, attr)) {
+ error = dquot_initialize(inode);
+ if (error)
+ return error;
+ }
reiserfs_write_lock(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
/*
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9d6486d..44c2bdc 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -618,12 +618,10 @@ static void release_buffer_page(struct buffer_head *bh)
static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (buffer_journaled(bh)) {
reiserfs_warning(NULL, "clm-2084",
- "pinned buffer %lu:%s sent to disk",
- bh->b_blocknr, bdevname(bh->b_bdev, b));
+ "pinned buffer %lu:%pg sent to disk",
+ bh->b_blocknr, bh->b_bdev);
}
if (uptodate)
set_buffer_uptodate(bh);
@@ -2387,11 +2385,10 @@ static int journal_read(struct super_block *sb)
int replay_count = 0;
int continue_replay = 1;
int ret;
- char b[BDEVNAME_SIZE];
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- reiserfs_info(sb, "checking transaction log (%s)\n",
- bdevname(journal->j_dev_bd, b));
+ reiserfs_info(sb, "checking transaction log (%pg)\n",
+ journal->j_dev_bd);
start = get_seconds();
/*
@@ -2651,8 +2648,8 @@ static int journal_init_dev(struct super_block *super,
set_blocksize(journal->j_dev_bd, super->s_blocksize);
reiserfs_info(super,
- "journal_init_dev: journal device: %s\n",
- bdevname(journal->j_dev_bd, b));
+ "journal_init_dev: journal device: %pg\n",
+ journal->j_dev_bd);
return 0;
}
@@ -2724,7 +2721,6 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
struct reiserfs_journal_header *jh;
struct reiserfs_journal *journal;
struct reiserfs_journal_list *jl;
- char b[BDEVNAME_SIZE];
int ret;
journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
@@ -2794,10 +2790,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
&& (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
sb_jp_journal_magic(rs))) {
reiserfs_warning(sb, "sh-460",
- "journal header magic %x (device %s) does "
+ "journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- bdevname(journal->j_dev_bd, b),
+ journal->j_dev_bd,
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2818,10 +2814,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_max_trans_age = commit_max_age;
}
- reiserfs_info(sb, "journal params: device %s, size %u, "
+ reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- bdevname(journal->j_dev_bd, b),
+ journal->j_dev_bd,
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index b55a074..2a12d46 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -613,8 +613,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
* we have to set uid and gid here
*/
inode_init_owner(inode, dir, mode);
- dquot_initialize(inode);
- return 0;
+ return dquot_initialize(inode);
}
static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
@@ -633,12 +632,18 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
- dquot_initialize(dir);
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
- new_inode_init(inode, dir, mode);
+ retval = new_inode_init(inode, dir, mode);
+ if (retval) {
+ drop_new_inode(inode);
+ return retval;
+ }
jbegin_count += reiserfs_cache_default_acl(dir);
retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
@@ -707,15 +712,18 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- dquot_initialize(dir);
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
- new_inode_init(inode, dir, mode);
+ retval = new_inode_init(inode, dir, mode);
+ if (retval) {
+ drop_new_inode(inode);
+ return retval;
+ }
jbegin_count += reiserfs_cache_default_acl(dir);
retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
@@ -787,7 +795,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
- dquot_initialize(dir);
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
/*
@@ -800,7 +810,11 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
if (!(inode = new_inode(dir->i_sb))) {
return -ENOMEM;
}
- new_inode_init(inode, dir, mode);
+ retval = new_inode_init(inode, dir, mode);
+ if (retval) {
+ drop_new_inode(inode);
+ return retval;
+ }
jbegin_count += reiserfs_cache_default_acl(dir);
retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
@@ -899,7 +913,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
- dquot_initialize(dir);
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
reiserfs_write_lock(dir->i_sb);
retval = journal_begin(&th, dir->i_sb, jbegin_count);
@@ -985,7 +1001,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
int jbegin_count;
unsigned long savelink;
- dquot_initialize(dir);
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
inode = d_inode(dentry);
@@ -1095,12 +1113,18 @@ static int reiserfs_symlink(struct inode *parent_dir,
2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
- dquot_initialize(parent_dir);
+ retval = dquot_initialize(parent_dir);
+ if (retval)
+ return retval;
if (!(inode = new_inode(parent_dir->i_sb))) {
return -ENOMEM;
}
- new_inode_init(inode, parent_dir, mode);
+ retval = new_inode_init(inode, parent_dir, mode);
+ if (retval) {
+ drop_new_inode(inode);
+ return retval;
+ }
retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
&security);
@@ -1146,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
reiserfs_update_inode_transaction(parent_dir);
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
@@ -1184,7 +1209,9 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
JOURNAL_PER_BALANCE_CNT * 3 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
- dquot_initialize(dir);
+ retval = dquot_initialize(dir);
+ if (retval)
+ return retval;
reiserfs_write_lock(dir->i_sb);
if (inode->i_nlink >= REISERFS_LINK_MAX) {
@@ -1308,8 +1335,12 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
JOURNAL_PER_BALANCE_CNT * 3 + 5 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
+ retval = dquot_initialize(old_dir);
+ if (retval)
+ return retval;
+ retval = dquot_initialize(new_dir);
+ if (retval)
+ return retval;
old_inode = d_inode(old_dentry);
new_dentry_inode = d_inode(new_dentry);
@@ -1634,8 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
*/
const struct inode_operations reiserfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index ae1dc84..4f3f928 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -139,11 +139,9 @@ static void sprintf_block_head(char *buf, struct buffer_head *bh)
static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
-
sprintf(buf,
- "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
- bdevname(bh->b_bdev, b), bh->b_size,
+ "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+ bh->b_bdev, bh->b_size,
(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
bh->b_state, bh->b_page,
buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
@@ -530,7 +528,6 @@ static int print_super_block(struct buffer_head *bh)
(struct reiserfs_super_block *)(bh->b_data);
int skipped, data_blocks;
char *version;
- char b[BDEVNAME_SIZE];
if (is_reiserfs_3_5(rs)) {
version = "3.5";
@@ -543,7 +540,7 @@ static int print_super_block(struct buffer_head *bh)
return 1;
}
- printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b),
+ printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
(unsigned long long)bh->b_blocknr);
printk("Reiserfs version %s\n", version);
printk("Block count %u\n", sb_block_count(rs));
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 621b9f3..fe99915 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -303,11 +303,10 @@ static int show_journal(struct seq_file *m, void *unused)
struct reiserfs_sb_info *r = REISERFS_SB(sb);
struct reiserfs_super_block *rs = r->s_rs;
struct journal_params *jp = &rs->s_v1.s_journal;
- char b[BDEVNAME_SIZE];
seq_printf(m, /* on-disk fields */
"jp_journal_1st_block: \t%i\n"
- "jp_journal_dev: \t%s[%x]\n"
+ "jp_journal_dev: \t%pg[%x]\n"
"jp_journal_size: \t%i\n"
"jp_journal_trans_max: \t%i\n"
"jp_journal_magic: \t%i\n"
@@ -348,7 +347,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+ SB_JOURNAL(sb)->j_dev_bd,
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0e4cf728..05db747 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
@@ -714,18 +715,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",acl");
if (REISERFS_SB(s)->s_jdev)
- seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+ seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
if (journal->j_max_commit_age != journal->j_default_max_commit_age)
seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
#ifdef CONFIG_QUOTA
if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+ seq_show_option(seq, "usrjquota",
+ REISERFS_SB(s)->s_qf_names[USRQUOTA]);
else if (opts & (1 << REISERFS_USRQUOTA))
seq_puts(seq, ",usrquota");
if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+ seq_show_option(seq, "grpjquota",
+ REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
else if (opts & (1 << REISERFS_GRPQUOTA))
seq_puts(seq, ",grpquota");
if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e87f9b5..e5ddb4e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers,
return NULL;
for_each_xattr_handler(handlers, xah) {
- if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+ const char *prefix = xattr_prefix(xah);
+ if (strncmp(prefix, name, strlen(prefix)) == 0)
break;
}
@@ -778,7 +779,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -797,7 +798,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -814,7 +815,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
struct listxattr_buf {
@@ -839,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
name);
- if (!handler) /* Unsupported xattr name */
+ if (!handler /* Unsupported xattr name */ ||
+ (handler->list && !handler->list(b->dentry)))
return 0;
+ size = namelen + 1;
if (b->buf) {
- size = handler->list(b->dentry, b->buf + b->pos,
- b->size, name, namelen,
- handler->flags);
if (size > b->size)
return -ERANGE;
- } else {
- size = handler->list(b->dentry, NULL, 0, name,
- namelen, handler->flags);
+ memcpy(b->buf + b->pos, name, namelen);
+ b->buf[b->pos + namelen] = 0;
}
-
b->pos += size;
}
return 0;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 4b34b9d..558a16b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
@@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 9a3b061..ab0217d 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,8 +9,8 @@
#include <linux/uaccess.h>
static int
-security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+security_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -22,8 +22,8 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-security_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+security_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -34,20 +34,9 @@ security_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t namelen, int handler_flags)
+static bool security_list(struct dentry *dentry)
{
- const size_t len = namelen + 1;
-
- if (IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_len) {
- memcpy(list, name, namelen);
- list[namelen] = '\0';
- }
-
- return len;
+ return !IS_PRIVATE(d_inode(dentry));
}
/* Initializes the security context for a new inode and returns the number
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index e4f1343..64b67aa 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,8 +8,8 @@
#include <linux/uaccess.h>
static int
-trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -21,8 +21,8 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-trusted_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -33,19 +33,9 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static bool trusted_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
}
const struct xattr_handler reiserfs_xattr_trusted_handler = {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index d0b08d3..12e6306 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,8 +7,8 @@
#include <linux/uaccess.h>
static int
-user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+user_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
@@ -19,8 +19,8 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-user_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+user_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
return -EINVAL;
@@ -30,18 +30,9 @@ user_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static bool user_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!reiserfs_xattrs_user(dentry->d_sb))
- return 0;
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return reiserfs_xattrs_user(dentry->d_sb);
}
const struct xattr_handler reiserfs_xattr_user_handler = {
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268733c..6b00ca3 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
break;
case ROMFH_SYM:
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &romfs_aops;
mode |= S_IRWXUGO;
break;
@@ -618,8 +619,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 0155473..79d0d49 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -778,8 +778,8 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
return mask;
}
-static int do_poll(unsigned int nfds, struct poll_list *list,
- struct poll_wqueues *wait, struct timespec *end_time)
+static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
+ struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
@@ -908,7 +908,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
}
poll_initwait(&table);
- fdcount = do_poll(nfds, head, &table, end_time);
+ fdcount = do_poll(head, &table, end_time);
poll_freewait(&table);
for (walk = head; walk; walk = walk->next) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39f..e85664b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,8 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/string_helpers.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -24,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
void *buf;
+ gfp_t gfp = GFP_KERNEL;
/*
- * __GFP_NORETRY to avoid oom-killings with high-order allocations -
- * it's better to fall back to vmalloc() than to kill things.
+ * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
+ * it's better to fall back to vmalloc() than to kill things. For small
+ * allocations, just use GFP_KERNEL which will oom kill, thus no need
+ * for vmalloc fallback.
*/
- buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ if (size > PAGE_SIZE)
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ buf = kmalloc(size, gfp);
if (!buf && size > PAGE_SIZE)
buf = vmalloc(size);
return buf;
@@ -371,36 +378,21 @@ EXPORT_SYMBOL(seq_release);
* @esc: set of characters that need escaping
*
* Puts string into buffer, replacing each occurrence of character from
- * @esc with usual octal escape. Returns 0 in case of success, -1 - in
- * case of overflow.
+ * @esc with usual octal escape.
+ * Use seq_has_overflowed() to check for errors.
*/
-int seq_escape(struct seq_file *m, const char *s, const char *esc)
+void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *end = m->buf + m->size;
- char *p;
- char c;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
- if (!strchr(esc, c)) {
- *p++ = c;
- continue;
- }
- if (p + 3 < end) {
- *p++ = '\\';
- *p++ = '0' + ((c & 0300) >> 6);
- *p++ = '0' + ((c & 070) >> 3);
- *p++ = '0' + (c & 07);
- continue;
- }
- seq_set_overflow(m);
- return -1;
- }
- m->count = p - m->buf;
- return 0;
+ ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+ seq_commit(m, ret < size ? ret : -1);
}
EXPORT_SYMBOL(seq_escape);
-int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
@@ -408,24 +400,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args)
len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
if (m->count + len < m->size) {
m->count += len;
- return 0;
+ return;
}
}
seq_set_overflow(m);
- return -1;
}
EXPORT_SYMBOL(seq_vprintf);
-int seq_printf(struct seq_file *m, const char *f, ...)
+void seq_printf(struct seq_file *m, const char *f, ...)
{
- int ret;
va_list args;
va_start(args, f);
- ret = seq_vprintf(m, f, args);
+ seq_vprintf(m, f, args);
va_end(args);
-
- return ret;
}
EXPORT_SYMBOL(seq_printf);
@@ -663,26 +651,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops,
}
EXPORT_SYMBOL(seq_open_private);
-int seq_putc(struct seq_file *m, char c)
+void seq_putc(struct seq_file *m, char c)
{
- if (m->count < m->size) {
- m->buf[m->count++] = c;
- return 0;
- }
- return -1;
+ if (m->count >= m->size)
+ return;
+
+ m->buf[m->count++] = c;
}
EXPORT_SYMBOL(seq_putc);
-int seq_puts(struct seq_file *m, const char *s)
+void seq_puts(struct seq_file *m, const char *s)
{
int len = strlen(s);
- if (m->count + len < m->size) {
- memcpy(m->buf + m->count, s, len);
- m->count += len;
- return 0;
+
+ if (m->count + len >= m->size) {
+ seq_set_overflow(m);
+ return;
}
- seq_set_overflow(m);
- return -1;
+ memcpy(m->buf + m->count, s, len);
+ m->count += len;
}
EXPORT_SYMBOL(seq_puts);
@@ -693,8 +680,8 @@ EXPORT_SYMBOL(seq_puts);
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
- unsigned long long num)
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+ unsigned long long num)
{
int len;
@@ -706,35 +693,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
if (num < 10) {
m->buf[m->count++] = num + '0';
- return 0;
+ return;
}
len = num_to_str(m->buf + m->count, m->size - m->count, num);
if (!len)
goto overflow;
m->count += len;
- return 0;
+ return;
+
overflow:
seq_set_overflow(m);
- return -1;
}
EXPORT_SYMBOL(seq_put_decimal_ull);
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
- long long num)
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
{
if (num < 0) {
if (m->count + 3 >= m->size) {
seq_set_overflow(m);
- return -1;
+ return;
}
if (delimiter)
m->buf[m->count++] = delimiter;
num = -num;
delimiter = '-';
}
- return seq_put_decimal_ull(m, delimiter, num);
-
+ seq_put_decimal_ull(m, delimiter, num);
}
EXPORT_SYMBOL(seq_put_decimal_ll);
@@ -773,6 +758,46 @@ void seq_pad(struct seq_file *m, char c)
}
EXPORT_SYMBOL(seq_pad);
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii)
+{
+ const u8 *ptr = buf;
+ int i, linelen, remaining = len;
+ char *buffer;
+ size_t size;
+ int ret;
+
+ if (rowsize != 16 && rowsize != 32)
+ rowsize = 16;
+
+ for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+ linelen = min(remaining, rowsize);
+ remaining -= rowsize;
+
+ switch (prefix_type) {
+ case DUMP_PREFIX_ADDRESS:
+ seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+ break;
+ case DUMP_PREFIX_OFFSET:
+ seq_printf(m, "%s%.8x: ", prefix_str, i);
+ break;
+ default:
+ seq_printf(m, "%s", prefix_str);
+ break;
+ }
+
+ size = seq_get_buf(m, &buffer);
+ ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+ buffer, size, ascii);
+ seq_commit(m, ret < size ? ret : -1);
+
+ seq_putc(m, '\n');
+ }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
diff --git a/fs/splice.c b/fs/splice.c
index 5fc1e50..82bc0d6 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
@@ -415,6 +415,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
*/
if (!page->mapping) {
unlock_page(page);
+retry_lookup:
page = find_or_create_page(mapping, index,
mapping_gfp_mask(mapping));
@@ -439,13 +440,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
error = mapping->a_ops->readpage(in, page);
if (unlikely(error)) {
/*
- * We really should re-lookup the page here,
- * but it complicates things a lot. Instead
- * lets just do what we already stored, and
- * we'll get it the next time we are called.
+ * Re-lookup the page
*/
if (error == AOP_TRUNCATED_PAGE)
- error = 0;
+ goto retry_lookup;
break;
}
@@ -809,6 +807,13 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
*/
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
+ /*
+ * Check for signal early to make process killable when there are
+ * always buffers available
+ */
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
while (!pipe->nrbufs) {
if (!pipe->writers)
return 0;
@@ -884,6 +889,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_from_pipe_begin(sd);
do {
+ cond_resched();
ret = splice_from_pipe_next(pipe, sd);
if (ret > 0)
ret = splice_from_pipe_feed(pipe, sd, actor);
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index a1ce5ce..0927b1e 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -41,6 +41,7 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/xattr.h>
+#include <linux/pagemap.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
inode->i_op = &squashfs_symlink_inode_ops;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &squashfs_symlink_aops;
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5056bab..5e79bfa 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -80,7 +80,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct squashfs_sb_info *msblk;
struct squashfs_super_block *sblk = NULL;
- char b[BDEVNAME_SIZE];
struct inode *root;
long long root_inode;
unsigned short flags;
@@ -124,8 +123,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = le32_to_cpu(sblk->s_magic);
if (sb->s_magic != SQUASHFS_MAGIC) {
if (!silent)
- ERROR("Can't find a SQUASHFS superblock on %s\n",
- bdevname(sb->s_bdev, b));
+ ERROR("Can't find a SQUASHFS superblock on %pg\n",
+ sb->s_bdev);
goto failed_mount;
}
@@ -178,7 +177,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
msblk->inodes = le32_to_cpu(sblk->inodes);
flags = le16_to_cpu(sblk->flags);
- TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+ TRACE("Found valid superblock on %pg\n", sb->s_bdev);
TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
? "un" : "");
TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
@@ -420,7 +419,8 @@ static int __init init_inodecache(void)
{
squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
sizeof(struct squashfs_inode_info), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
return squashfs_inode_cachep ? 0 : -ENOMEM;
}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 12806df..dbcc2f5 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -119,8 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = {
const struct inode_operations squashfs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e5e0ddf..1e9de96 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
struct squashfs_xattr_entry entry;
struct squashfs_xattr_val val;
const struct xattr_handler *handler;
- int name_size, prefix_size = 0;
+ int name_size;
err = squashfs_read_metadata(sb, &entry, &start, &offset,
sizeof(entry));
@@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
name_size = le16_to_cpu(entry.size);
handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
- if (handler)
- prefix_size = handler->list(d, buffer, rest, NULL,
- name_size, handler->flags);
- if (prefix_size) {
+ if (handler && (!handler->list || handler->list(d))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_size = strlen(prefix);
+
if (buffer) {
if (prefix_size + name_size + 1 > rest) {
err = -ERANGE;
goto failed;
}
+ memcpy(buffer, prefix, prefix_size);
buffer += prefix_size;
}
err = squashfs_read_metadata(sb, buffer, &start,
@@ -212,88 +213,45 @@ failed:
}
-/*
- * User namespace support
- */
-static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- if (list && XATTR_USER_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
- return XATTR_USER_PREFIX_LEN;
-}
-
-static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
- size_t size, int type)
+static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *d, const char *name,
+ void *buffer, size_t size)
{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
+ return squashfs_xattr_get(d_inode(d), handler->flags, name,
buffer, size);
}
+/*
+ * User namespace support
+ */
static const struct xattr_handler squashfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = squashfs_user_list,
- .get = squashfs_user_get
+ .flags = SQUASHFS_XATTR_USER,
+ .get = squashfs_xattr_handler_get
};
/*
* Trusted namespace support
*/
-static size_t squashfs_trusted_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static bool squashfs_trusted_xattr_handler_list(struct dentry *d)
{
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
- return XATTR_TRUSTED_PREFIX_LEN;
-}
-
-static int squashfs_trusted_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
- buffer, size);
+ return capable(CAP_SYS_ADMIN);
}
static const struct xattr_handler squashfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = squashfs_trusted_list,
- .get = squashfs_trusted_get
+ .flags = SQUASHFS_XATTR_TRUSTED,
+ .list = squashfs_trusted_xattr_handler_list,
+ .get = squashfs_xattr_handler_get
};
/*
* Security namespace support
*/
-static size_t squashfs_security_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
- return XATTR_SECURITY_PREFIX_LEN;
-}
-
-static int squashfs_security_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
- buffer, size);
-}
-
static const struct xattr_handler squashfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = squashfs_security_list,
- .get = squashfs_security_get
+ .flags = SQUASHFS_XATTR_SECURITY,
+ .get = squashfs_xattr_handler_get
};
static const struct xattr_handler *squashfs_xattr_handler(int type)
diff --git a/fs/stat.c b/fs/stat.c
index cccc1aa..bc045c7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
#ifndef INIT_STRUCT_STAT_PADDING
@@ -367,8 +367,6 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
INIT_STRUCT_STAT64_PADDING(tmp);
#ifdef CONFIG_MIPS
/* mips has weird padding, so we don't get 64 bits there */
- if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
- return -EOVERFLOW;
tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_rdev = new_encode_dev(stat->rdev);
#else
diff --git a/fs/super.c b/fs/super.c
index b613723..1182af8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink,
return total_objects;
}
+static void destroy_super_work(struct work_struct *work)
+{
+ struct super_block *s = container_of(work, struct super_block,
+ destroy_work);
+ int i;
+
+ for (i = 0; i < SB_FREEZE_LEVELS; i++)
+ percpu_free_rwsem(&s->s_writers.rw_sem[i]);
+ kfree(s);
+}
+
+static void destroy_super_rcu(struct rcu_head *head)
+{
+ struct super_block *s = container_of(head, struct super_block, rcu);
+ INIT_WORK(&s->destroy_work, destroy_super_work);
+ schedule_work(&s->destroy_work);
+}
+
/**
* destroy_super - frees a superblock
* @s: superblock to free
@@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink,
*/
static void destroy_super(struct super_block *s)
{
- int i;
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
- for (i = 0; i < SB_FREEZE_LEVELS; i++)
- percpu_counter_destroy(&s->s_writers.counter[i]);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
kfree(s->s_subtype);
kfree(s->s_options);
- kfree_rcu(s, rcu);
+ call_rcu(&s->rcu, destroy_super_rcu);
}
/**
@@ -178,19 +193,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- if (percpu_counter_init(&s->s_writers.counter[i], 0,
- GFP_KERNEL) < 0)
+ if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+ sb_writers_name[i],
+ &type->s_writers_key[i]))
goto fail;
- lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
- &type->s_writers_key[i], 0);
}
- init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
+ mutex_init(&s->s_sync_lock);
INIT_LIST_HEAD(&s->s_inodes);
+ spin_lock_init(&s->s_inode_list_lock);
if (list_lru_init_memcg(&s->s_dentry_lru))
goto fail;
@@ -399,7 +414,7 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~MS_ACTIVE;
- fsnotify_unmount_inodes(&sb->s_inodes);
+ fsnotify_unmount_inodes(sb);
evict_inodes(sb);
@@ -997,10 +1012,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
- char b[BDEVNAME_SIZE];
-
s->s_mode = mode;
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error) {
@@ -1146,72 +1159,46 @@ out:
*/
void __sb_end_write(struct super_block *sb, int level)
{
- percpu_counter_dec(&sb->s_writers.counter[level-1]);
- /*
- * Make sure s_writers are updated before we wake up waiters in
- * freeze_super().
- */
- smp_mb();
- if (waitqueue_active(&sb->s_writers.wait))
- wake_up(&sb->s_writers.wait);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+ percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);
-#ifdef CONFIG_LOCKDEP
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's it bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
- unsigned long ip)
-{
- int i;
-
- if (!trylock) {
- for (i = 0; i < level - 1; i++)
- if (lock_is_held(&sb->s_writers.lock_map[i])) {
- trylock = true;
- break;
- }
- }
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif
-
/*
* This is an internal function, please use sb_start_{write,pagefault,intwrite}
* instead.
*/
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
-retry:
- if (unlikely(sb->s_writers.frozen >= level)) {
- if (!wait)
- return 0;
- wait_event(sb->s_writers.wait_unfrozen,
- sb->s_writers.frozen < level);
- }
+ bool force_trylock = false;
+ int ret = 1;
#ifdef CONFIG_LOCKDEP
- acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
- percpu_counter_inc(&sb->s_writers.counter[level-1]);
/*
- * Make sure counter is updated before we check for frozen.
- * freeze_super() first sets frozen and then checks the counter.
+ * We want lockdep to tell us about possible deadlocks with freezing
+ * but it's it bit tricky to properly instrument it. Getting a freeze
+ * protection works as getting a read lock but there are subtle
+ * problems. XFS for example gets freeze protection on internal level
+ * twice in some cases, which is OK only because we already hold a
+ * freeze protection also on higher level. Due to these cases we have
+ * to use wait == F (trylock mode) which must not fail.
*/
- smp_mb();
- if (unlikely(sb->s_writers.frozen >= level)) {
- __sb_end_write(sb, level);
- goto retry;
+ if (wait) {
+ int i;
+
+ for (i = 0; i < level - 1; i++)
+ if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
+ force_trylock = true;
+ break;
+ }
}
- return 1;
+#endif
+ if (wait && !force_trylock)
+ percpu_down_read(sb->s_writers.rw_sem + level-1);
+ else
+ ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
+ WARN_ON(force_trylock && !ret);
+ return ret;
}
EXPORT_SYMBOL(__sb_start_write);
@@ -1221,37 +1208,33 @@ EXPORT_SYMBOL(__sb_start_write);
* @level: type of writers we wait for (normal vs page fault)
*
* This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
*/
static void sb_wait_write(struct super_block *sb, int level)
{
- s64 writers;
-
+ percpu_down_write(sb->s_writers.rw_sem + level-1);
/*
- * We just cycle-through lockdep here so that it does not complain
- * about returning with lock to userspace
+ * We are going to return to userspace and forget about this lock, the
+ * ownership goes to the caller of thaw_super() which does unlock.
+ *
+ * FIXME: we should do this before return from freeze_super() after we
+ * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super()
+ * should re-acquire these locks before s_op->unfreeze_fs(sb). However
+ * this leads to lockdep false-positives, so currently we do the early
+ * release right after acquire.
*/
- rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
- do {
- DEFINE_WAIT(wait);
+ percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
- /*
- * We use a barrier in prepare_to_wait() to separate setting
- * of frozen and checking of the counter
- */
- prepare_to_wait(&sb->s_writers.wait, &wait,
- TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+ int level;
- writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
- if (writers)
- schedule();
+ for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+ percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
- finish_wait(&sb->s_writers.wait, &wait);
- } while (writers);
+ for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+ percpu_up_write(sb->s_writers.rw_sem + level);
}
/**
@@ -1310,20 +1293,14 @@ int freeze_super(struct super_block *sb)
return 0;
}
- /* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);
-
sb_wait_write(sb, SB_FREEZE_WRITE);
+ down_write(&sb->s_umount);
/* Now we go and block page faults... */
- down_write(&sb->s_umount);
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
- smp_wmb();
-
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
@@ -1331,7 +1308,6 @@ int freeze_super(struct super_block *sb)
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
- smp_wmb();
sb_wait_write(sb, SB_FREEZE_FS);
if (sb->s_op->freeze_fs) {
@@ -1340,7 +1316,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
@@ -1372,8 +1348,10 @@ int thaw_super(struct super_block *sb)
return -EINVAL;
}
- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & MS_RDONLY) {
+ sb->s_writers.frozen = SB_UNFROZEN;
goto out;
+ }
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
@@ -1385,12 +1363,11 @@ int thaw_super(struct super_block *sb)
}
}
-out:
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
+out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
-
return 0;
}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/sync.c b/fs/sync.c
index fbc98ee..dd5d171 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
- filemap_fdatawait(bdev->bd_inode->i_mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
}
/*
@@ -343,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
- ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
if (ret < 0)
goto out_put;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6c95628..f35523d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -108,6 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
struct kobject *kobj = of->kn->parent->priv;
+ size_t len;
/*
* If buf != of->prealloc_buf, we don't know how
@@ -115,7 +116,8 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
*/
if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
return 0;
- return ops->show(kobj, of->kn->priv, buf);
+ len = ops->show(kobj, of->kn->priv, buf);
+ return min(count, len);
}
/* kernfs write callback for regular sysfs files */
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 39a0199..dc1358b 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -73,13 +73,26 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
}
if (grp->bin_attrs) {
- for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
+ umode_t mode = (*bin_attr)->attr.mode;
+
if (update)
kernfs_remove_by_name(parent,
(*bin_attr)->attr.name);
+ if (grp->is_bin_visible) {
+ mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ if (!mode)
+ continue;
+ }
+
+ WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ "Attribute %s: Invalid permissions 0%o\n",
+ (*bin_attr)->attr.name, mode);
+
+ mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent,
&(*bin_attr)->attr, true,
- (*bin_attr)->attr.mode, NULL);
+ mode, NULL);
if (error)
break;
}
@@ -352,3 +365,47 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
+
+/**
+ * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
+ * to a group or an attribute
+ * @kobj: The kobject containing the group.
+ * @target_kobj: The target kobject.
+ * @target_name: The name of the target group or attribute.
+ */
+int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
+ struct kobject *target_kobj,
+ const char *target_name)
+{
+ struct kernfs_node *target;
+ struct kernfs_node *entry;
+ struct kernfs_node *link;
+
+ /*
+ * We don't own @target_kobj and it may be removed at any time.
+ * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir()
+ * for details.
+ */
+ spin_lock(&sysfs_symlink_target_lock);
+ target = target_kobj->sd;
+ if (target)
+ kernfs_get(target);
+ spin_unlock(&sysfs_symlink_target_lock);
+ if (!target)
+ return -ENOENT;
+
+ entry = kernfs_find_and_get(target_kobj->sd, target_name);
+ if (!entry) {
+ kernfs_put(target);
+ return -ENOENT;
+ }
+
+ link = kernfs_create_link(kobj->sd, target_name, entry);
+ if (IS_ERR(link) && PTR_ERR(link) == -EEXIST)
+ sysfs_warn_dup(kobj->sd, target_name);
+
+ kernfs_put(entry);
+ kernfs_put(target);
+ return IS_ERR(link) ? PTR_ERR(link) : 0;
+}
+EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 1c6ac6f..f3db820 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
SYSFS_MAGIC, &new_sb, ns);
if (IS_ERR(root) || !new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+ else if (new_sb)
+ /* Userspace would break if executables appear on sysfs */
+ root->d_sb->s_iflags |= SB_I_NOEXEC;
+
return root;
}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 590ad92..d62c423 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -146,8 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi,
static const struct inode_operations sysv_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = sysv_getattr,
};
@@ -162,15 +161,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
inode->i_fop = &sysv_dir_operations;
inode->i_mapping->a_ops = &sysv_aops;
} else if (S_ISLNK(inode->i_mode)) {
- if (inode->i_blocks) {
- inode->i_op = &sysv_symlink_inode_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else {
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)SYSV_I(inode)->i_data;
- nd_terminate_link(inode->i_link, inode->i_size,
- sizeof(SYSV_I(inode)->i_data) - 1);
- }
+ inode->i_op = &sysv_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &sysv_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
}
@@ -353,7 +346,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index cbc8d5d..c66f242 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
+
+ if (IS_ERR(dentry)) {
mutex_unlock(&parent->d_inode->i_mutex);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ }
+
return dentry;
}
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index ba66d508..7ff7712 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -35,3 +35,18 @@ config UBIFS_FS_ZLIB
default y
help
Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
+
+config UBIFS_ATIME_SUPPORT
+ bool "Access time support" if UBIFS_FS
+ depends on UBIFS_FS
+ default n
+ help
+ Originally UBIFS did not support atime, because it looked like a bad idea due
+ increased flash wear. This option adds atime support and it is disabled by default
+ to preserve the old behavior. If you enable this option, UBIFS starts updating atime,
+ which means that file-system read operations will cause writes (inode atime
+ updates). This may affect file-system performance and increase flash device wear,
+ so be careful. How often atime is updated depends on the selected strategy:
+ strictatime is the "heavy", relatime is "lighter", etc.
+
+ If unsure, say 'N'
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4c46a98..595ca0d 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2573,7 +2573,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
{
int err, failing;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
failing = power_cut_emulated(c, lnum, 1);
@@ -2595,7 +2595,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 1))
return -EROFS;
@@ -2611,7 +2611,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
@@ -2627,7 +2627,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c27c66..e49bd28 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -449,13 +449,14 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
out:
+ kfree(file->private_data);
+ file->private_data = NULL;
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
- kfree(file->private_data);
- file->private_data = NULL;
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
return 0;
@@ -787,9 +788,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
if (S_ISBLK(mode) || S_ISCHR(mode)) {
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev)
@@ -1188,6 +1186,9 @@ const struct inode_operations ubifs_dir_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a3dfe2a..eff6280 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1354,6 +1354,47 @@ static inline int mctime_update_needed(const struct inode *inode,
return 0;
}
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+/**
+ * ubifs_update_time - update time of inode.
+ * @inode: inode to update
+ *
+ * This function updates time of the inode.
+ */
+int ubifs_update_time(struct inode *inode, struct timespec *time,
+ int flags)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+ int iflags = I_DIRTY_TIME;
+ int err, release;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&ui->ui_mutex);
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+
+ if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+ iflags |= I_DIRTY_SYNC;
+
+ release = ui->dirty;
+ __mark_inode_dirty(inode, iflags);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_budget(c, &req);
+ return 0;
+}
+#endif
+
/**
* update_ctime - update mtime and ctime of an inode.
* @inode: inode to update
@@ -1537,6 +1578,9 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (err)
return err;
vma->vm_ops = &ubifs_file_vm_ops;
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ file_accessed(file);
+#endif
return 0;
}
@@ -1557,17 +1601,23 @@ const struct inode_operations ubifs_file_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct inode_operations ubifs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.setxattr = ubifs_setxattr,
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 92a8491..c0a95e3 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -34,6 +34,12 @@
* node. We use "r5" hash borrowed from reiserfs.
*/
+/*
+ * Lot's of the key helpers require a struct ubifs_info *c as the first parameter.
+ * But we are not using it at all currently. That's designed for future extensions of
+ * different c->key_format. But right now, there is only one key type, UBIFS_SIMPLE_KEY_FMT.
+ */
+
#ifndef __UBIFS_KEY_H__
#define __UBIFS_KEY_H__
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index dc9f27e..9a51710 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1498,11 +1498,10 @@ static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
}
/* nnode is being committed, so copy it */
- n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ n = kmemdup(nnode, sizeof(struct ubifs_nnode), GFP_NOFS);
if (unlikely(!n))
return ERR_PTR(-ENOMEM);
- memcpy(n, nnode, sizeof(struct ubifs_nnode));
n->cnext = NULL;
__set_bit(DIRTY_CNODE, &n->flags);
__clear_bit(COW_CNODE, &n->flags);
@@ -1549,11 +1548,10 @@ static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
}
/* pnode is being committed, so copy it */
- p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ p = kmemdup(pnode, sizeof(struct ubifs_pnode), GFP_NOFS);
if (unlikely(!p))
return ERR_PTR(-ENOMEM);
- memcpy(p, pnode, sizeof(struct ubifs_pnode));
p->cnext = NULL;
__set_bit(DIRTY_CNODE, &p->flags);
__clear_bit(COW_CNODE, &p->flags);
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index ee7cb5e..8ece6ca 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -155,13 +155,8 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
*/
static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
{
- if (new_valid_dev(rdev)) {
- dev->new = cpu_to_le32(new_encode_dev(rdev));
- return sizeof(dev->new);
- } else {
- dev->huge = cpu_to_le64(huge_encode_dev(rdev));
- return sizeof(dev->huge);
- }
+ dev->new = cpu_to_le32(new_encode_dev(rdev));
+ return sizeof(dev->new);
}
/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 695fc71..586d593 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -789,7 +789,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
corrupted_rescan:
/* Re-scan the corrupted data with verbose messages */
ubifs_err(c, "corruption %d", ret);
- ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
@@ -1331,8 +1331,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c)
struct size_entry *e, *n;
rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
kfree(e);
}
@@ -1533,8 +1532,7 @@ int ubifs_recover_size(struct ubifs_info *c)
err = fix_size_in_place(c, e);
if (err)
return err;
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
}
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9547a278..a233ba9 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -128,7 +128,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
if (err)
goto out_ino;
- inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+ inode->i_flags |= S_NOCMTIME;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ inode->i_flags |= S_NOATIME;
+#endif
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
@@ -2037,7 +2040,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (c->max_inode_sz > MAX_LFS_FILESIZE)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
- sb->s_xattr = ubifs_xattr_handlers;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
@@ -2139,7 +2141,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
if (err)
goto out_deact;
/* We do not support atime */
- sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+ sb->s_flags |= MS_ACTIVE;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ sb->s_flags |= MS_NOATIME;
+#else
+ ubifs_msg(c, "full atime support is enabled.");
+#endif
}
/* 'fill_super()' opens ubi again so we must close it here */
@@ -2241,8 +2248,8 @@ static int __init ubifs_init(void)
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
- SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
- &inode_slab_ctor);
+ SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT, &inode_slab_ctor);
if (!ubifs_inode_slab)
return -ENOMEM;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 957f575..fa9a20c 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -198,11 +198,10 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
{
struct ubifs_znode *zn;
- zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+ zn = kmemdup(znode, c->max_znode_sz, GFP_NOFS);
if (unlikely(!zn))
return ERR_PTR(-ENOMEM);
- memcpy(zn, znode, c->max_znode_sz);
zn->cnext = NULL;
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index de75902..a5697de 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -858,9 +858,9 @@ struct ubifs_compressor {
* @mod_dent: non-zero if the operation removes or modifies an existing
* directory entry
* @new_ino: non-zero if the operation adds a new inode
- * @new_ino_d: now much data newly created inode contains
+ * @new_ino_d: how much data newly created inode contains
* @dirtied_ino: how many inodes the operation makes dirty
- * @dirtied_ino_d: now much data dirtied inode contains
+ * @dirtied_ino_d: how much data dirtied inode contains
* @idx_growth: how much the index will supposedly grow
* @data_growth: how much new data the operation will supposedly add
* @dd_growth: how much data that makes other data dirty the operation will
@@ -1470,7 +1470,6 @@ extern spinlock_t ubifs_infos_lock;
extern atomic_long_t ubifs_clean_zn_cnt;
extern struct kmem_cache *ubifs_inode_slab;
extern const struct super_operations ubifs_super_operations;
-extern const struct xattr_handler *ubifs_xattr_handlers[];
extern const struct address_space_operations ubifs_file_address_operations;
extern const struct file_operations ubifs_file_operations;
extern const struct inode_operations ubifs_file_inode_operations;
@@ -1746,6 +1745,9 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
+#endif
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 96f3448..e53292d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -200,6 +200,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
int err;
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_inode *ui = ubifs_inode(inode);
+ void *buf = NULL;
struct ubifs_budget_req req = { .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
@@ -208,14 +209,17 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
if (err)
return err;
- kfree(ui->data);
- ui->data = kmemdup(value, size, GFP_NOFS);
- if (!ui->data) {
+ buf = kmemdup(value, size, GFP_NOFS);
+ if (!buf) {
err = -ENOMEM;
goto out_free;
}
+ mutex_lock(&ui->ui_mutex);
+ kfree(ui->data);
+ ui->data = buf;
inode->i_size = ui->ui_size = size;
ui->data_len = size;
+ mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
@@ -263,7 +267,7 @@ static int check_namespace(const struct qstr *nm)
if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN)) {
- if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
+ if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0')
return -EINVAL;
type = TRUSTED_XATTR;
} else if (!strncmp(nm->name, XATTR_USER_PREFIX,
@@ -273,7 +277,7 @@ static int check_namespace(const struct qstr *nm)
type = USER_XATTR;
} else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN)) {
- if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
+ if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0')
return -EINVAL;
type = SECURITY_XATTR;
} else
@@ -409,6 +413,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
ubifs_assert(inode->i_size == ui->data_len);
ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+ mutex_lock(&ui->ui_mutex);
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
@@ -423,6 +428,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
err = ui->data_len;
out_iput:
+ mutex_unlock(&ui->ui_mutex);
iput(inode);
out_unlock:
kfree(xent);
@@ -582,46 +588,6 @@ out_free:
return err;
}
-static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int flags)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
-
- return total_len;
-}
-
-static int security_getxattr(struct dentry *d, const char *name, void *buffer,
- size_t size, int flags)
-{
- return ubifs_getxattr(d, name, buffer, size);
-}
-
-static int security_setxattr(struct dentry *d, const char *name,
- const void *value, size_t size, int flags,
- int handler_flags)
-{
- return ubifs_setxattr(d, name, value, size, flags);
-}
-
-static const struct xattr_handler ubifs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = security_listxattr,
- .get = security_getxattr,
- .set = security_setxattr,
-};
-
-const struct xattr_handler *ubifs_xattr_handlers[] = {
- &ubifs_xattr_security_handler,
- NULL,
-};
-
static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
@@ -652,11 +618,8 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
{
int err;
- mutex_lock(&inode->i_mutex);
err = security_inode_init_security(inode, dentry, qstr,
&init_xattrs, 0);
- mutex_unlock(&inode->i_mutex);
-
if (err) {
struct ubifs_info *c = dentry->i_sb->s_fs_info;
ubifs_err(c, "cannot initialize security for inode %lu, error %d",
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 6d6a96b..e0fd65f 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb,
*/
int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
- struct allocExtDesc *aed;
eloc.logicalBlockNum = start;
elen = EXT_RECORDED_ALLOCATED |
@@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb,
}
if (epos.offset + (2 * adsize) > sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- int loffset;
-
- brelse(oepos.bh);
- oepos = epos;
-
/* Steal a block from the extent being free'd */
- epos.block.logicalBlockNum = eloc.logicalBlockNum;
+ udf_setup_indirect_aext(table, eloc.logicalBlockNum,
+ &epos);
+
eloc.logicalBlockNum++;
elen -= sb->s_blocksize;
-
- epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, &epos.block, 0));
- if (!epos.bh) {
- brelse(oepos.bh);
- goto error_return;
- }
- aed = (struct allocExtDesc *)(epos.bh->b_data);
- aed->previousAllocExtLocation =
- cpu_to_le32(oepos.block.logicalBlockNum);
- if (epos.offset + adsize > sb->s_blocksize) {
- loffset = epos.offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = iinfo->i_ext.i_data + epos.offset
- - adsize;
- dptr = epos.bh->b_data +
- sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos.offset = sizeof(struct allocExtDesc) +
- adsize;
- } else {
- loffset = epos.offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- if (oepos.bh) {
- sptr = oepos.bh->b_data + epos.offset;
- aed = (struct allocExtDesc *)
- oepos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs,
- adsize);
- } else {
- sptr = iinfo->i_ext.i_data +
- epos.offset;
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- }
- epos.offset = sizeof(struct allocExtDesc);
- }
- if (sbi->s_udfrev >= 0x0200)
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 3, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
- else
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 2, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
-
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos.block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- lad->extLocation =
- cpu_to_lelb(epos.block);
- break;
- }
- if (oepos.bh) {
- udf_update_tag(oepos.bh->b_data, loffset);
- mark_buffer_dirty(oepos.bh);
- } else {
- mark_inode_dirty(table);
- }
}
/* It's possible that stealing the block emptied the extent */
- if (elen) {
- udf_write_aext(table, &epos, &eloc, elen, 1);
-
- if (!epos.bh) {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- } else {
- aed = (struct allocExtDesc *)epos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- udf_update_tag(epos.bh->b_data, epos.offset);
- mark_buffer_dirty(epos.bh);
- }
- }
+ if (elen)
+ __udf_add_aext(table, &epos, &eloc, elen, 1);
}
brelse(epos.bh);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d0b3ad..87dc16d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode,
udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
count++;
- } else
+ } else {
+ struct kernel_lb_addr tmploc;
+ uint32_t tmplen;
+
udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
+ /*
+ * We've rewritten the last extent but there may be empty
+ * indirect extent after it - enter it.
+ */
+ udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+ }
/* Managed to do everything necessary? */
if (!blocks)
@@ -1540,7 +1549,8 @@ reread:
break;
case ICBTAG_FILE_TYPE_SYMLINK:
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
break;
case ICBTAG_FILE_TYPE_MAIN:
@@ -1866,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
return inode;
}
-int udf_add_aext(struct inode *inode, struct extent_position *epos,
- struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos)
{
- int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh;
struct allocExtDesc *aed;
- uint8_t *ptr;
- struct udf_inode_info *iinfo = UDF_I(inode);
+ struct extent_position nepos;
+ struct kernel_lb_addr neloc;
+ int ver, adsize;
- if (!epos->bh)
- ptr = iinfo->i_ext.i_data + epos->offset -
- udf_file_entry_alloc_offset(inode) +
- iinfo->i_lenEAttr;
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
else
- ptr = epos->bh->b_data + epos->offset;
+ return -EIO;
+
+ neloc.logicalBlockNum = block;
+ neloc.partitionReferenceNum = epos->block.partitionReferenceNum;
+
+ bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0));
+ if (!bh)
+ return -EIO;
+ lock_buffer(bh);
+ memset(bh->b_data, 0x00, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ mark_buffer_dirty_inode(bh, inode);
+
+ aed = (struct allocExtDesc *)(bh->b_data);
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) {
+ aed->previousAllocExtLocation =
+ cpu_to_le32(epos->block.logicalBlockNum);
+ }
+ aed->lengthAllocDescs = cpu_to_le32(0);
+ if (UDF_SB(sb)->s_udfrev >= 0x0200)
+ ver = 3;
+ else
+ ver = 2;
+ udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block,
+ sizeof(struct tag));
+
+ nepos.block = neloc;
+ nepos.offset = sizeof(struct allocExtDesc);
+ nepos.bh = bh;
+
+ /*
+ * Do we have to copy current last extent to make space for indirect
+ * one?
+ */
+ if (epos->offset + adsize > sb->s_blocksize) {
+ struct kernel_lb_addr cp_loc;
+ uint32_t cp_len;
+ int cp_type;
+
+ epos->offset -= adsize;
+ cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0);
+ cp_len |= ((uint32_t)cp_type) << 30;
+
+ __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1);
+ udf_write_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ } else {
+ __udf_add_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ }
+
+ brelse(epos->bh);
+ *epos = nepos;
+
+ return 0;
+}
+
+/*
+ * Append extent at the given position - should be the first free one in inode
+ * / indirect extent. This function assumes there is enough space in the inode
+ * or indirect extent. Use udf_add_aext() if you didn't check for this before.
+ */
+int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct allocExtDesc *aed;
+ int adsize;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -1890,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
else
return -EIO;
- if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- struct buffer_head *nbh;
- int err, loffset;
- struct kernel_lb_addr obloc = epos->block;
-
- epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
- obloc.partitionReferenceNum,
- obloc.logicalBlockNum, &err);
- if (!epos->block.logicalBlockNum)
- return -ENOSPC;
- nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
- &epos->block,
- 0));
- if (!nbh)
- return -EIO;
- lock_buffer(nbh);
- memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
- set_buffer_uptodate(nbh);
- unlock_buffer(nbh);
- mark_buffer_dirty_inode(nbh, inode);
-
- aed = (struct allocExtDesc *)(nbh->b_data);
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
- aed->previousAllocExtLocation =
- cpu_to_le32(obloc.logicalBlockNum);
- if (epos->offset + adsize > inode->i_sb->s_blocksize) {
- loffset = epos->offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = ptr - adsize;
- dptr = nbh->b_data + sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos->offset = sizeof(struct allocExtDesc) + adsize;
- } else {
- loffset = epos->offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- sptr = ptr;
- epos->offset = sizeof(struct allocExtDesc);
-
- if (epos->bh) {
- aed = (struct allocExtDesc *)epos->bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- } else {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(inode);
- }
- }
- if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- else
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos->block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- lad->extLocation = cpu_to_lelb(epos->block);
- memset(lad->impUse, 0x00, sizeof(lad->impUse));
- break;
- }
- if (epos->bh) {
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos->bh->b_data, loffset);
- else
- udf_update_tag(epos->bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos->bh, inode);
- brelse(epos->bh);
- } else {
- mark_inode_dirty(inode);
- }
- epos->bh = nbh;
+ if (!epos->bh) {
+ WARN_ON(iinfo->i_lenAlloc !=
+ epos->offset - udf_file_entry_alloc_offset(inode));
+ } else {
+ aed = (struct allocExtDesc *)epos->bh->b_data;
+ WARN_ON(le32_to_cpu(aed->lengthAllocDescs) !=
+ epos->offset - sizeof(struct allocExtDesc));
+ WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize);
}
udf_write_aext(inode, epos, eloc, elen, inc);
@@ -1995,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
return 0;
}
+/*
+ * Append extent at given position - should be the first free one in inode
+ * / indirect extent. Takes care of allocating and linking indirect blocks.
+ */
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ int adsize;
+ struct super_block *sb = inode->i_sb;
+
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
+ else
+ return -EIO;
+
+ if (epos->offset + (2 * adsize) > sb->s_blocksize) {
+ int err;
+ int new_block;
+
+ new_block = udf_new_block(sb, NULL,
+ epos->block.partitionReferenceNum,
+ epos->block.logicalBlockNum, &err);
+ if (!new_block)
+ return -ENOSPC;
+
+ err = udf_setup_indirect_aext(inode, new_block, epos);
+ if (err)
+ return err;
+ }
+
+ return __udf_add_aext(inode, epos, eloc, elen, inc);
+}
+
void udf_write_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
@@ -2047,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos,
epos->offset += adsize;
}
+/*
+ * Only 1 indirect extent in a row really makes sense but allow upto 16 in case
+ * someone does some weird stuff.
+ */
+#define UDF_MAX_INDIR_EXTS 16
+
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int8_t etype;
+ unsigned int indirections = 0;
while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
(EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
int block;
+
+ if (++indirections > UDF_MAX_INDIR_EXTS) {
+ udf_err(inode->i_sb,
+ "too many indirect extents in inode %lu\n",
+ inode->i_ino);
+ return -1;
+ }
+
epos->block = *eloc;
epos->offset = sizeof(struct allocExtDesc);
brelse(epos->bh);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c97b5a8..42eafb9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -921,7 +921,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
}
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
struct kernel_lb_addr eloc;
@@ -1344,8 +1345,3 @@ const struct inode_operations udf_dir_inode_operations = {
.rename = udf_rename,
.tmpfile = udf_tmpfile,
};
-const struct inode_operations udf_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index b96f190..0fbb4c7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT),
init_once);
if (!udf_inode_cachep)
return -ENOMEM;
@@ -1586,6 +1587,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
}
/*
+ * Maximum number of Terminating Descriptor redirections. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_TD_NESTING 64
+
+/*
* Process a main/reserve volume descriptor sequence.
* @block First block of first extent of the sequence.
* @lastblock Lastblock of first extent of the sequence.
@@ -1609,6 +1617,7 @@ static noinline int udf_process_sequence(
uint16_t ident;
long next_s = 0, next_e = 0;
int ret;
+ unsigned int indirections = 0;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
@@ -1679,6 +1688,12 @@ static noinline int udf_process_sequence(
}
break;
case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
+ if (++indirections > UDF_MAX_TD_NESTING) {
+ udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING);
+ brelse(bh);
+ return -EIO;
+ }
+
vds[VDS_POS_TERMINATING_DESC].block = block;
if (next_e) {
block = next_s;
@@ -2070,6 +2085,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
struct udf_options uopt;
struct kernel_lb_addr rootdir, fileset;
struct udf_sb_info *sbi;
+ bool lvid_open = false;
uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
uopt.uid = INVALID_UID;
@@ -2216,8 +2232,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
le16_to_cpu(ts.year), ts.month, ts.day,
ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
}
- if (!(sb->s_flags & MS_RDONLY))
+ if (!(sb->s_flags & MS_RDONLY)) {
udf_open_lvid(sb);
+ lvid_open = true;
+ }
/* Assign the root inode */
/* assign inodes by physical block number */
@@ -2248,7 +2266,7 @@ parse_options_failure:
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
unload_nls(sbi->s_nls_map);
#endif
- if (!(sb->s_flags & MS_RDONLY))
+ if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
udf_sb_free_partitions(sb);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 862535b..8d61977 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page)
struct buffer_head *bh = NULL;
unsigned char *symlink;
int err;
- unsigned char *p = kmap(page);
+ unsigned char *p = page_address(page);
struct udf_inode_info *iinfo;
uint32_t pos;
@@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page)
up_read(&iinfo->i_data_sem);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
@@ -149,7 +148,6 @@ out_unlock_inode:
up_read(&iinfo->i_data_sem);
SetPageError(page);
out_unmap:
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 47bb3f5..fa0044b 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations;
extern const struct file_operations udf_dir_operations;
extern const struct inode_operations udf_file_inode_operations;
extern const struct file_operations udf_file_operations;
-extern const struct inode_operations udf_symlink_inode_operations;
extern const struct address_space_operations udf_aops;
extern const struct address_space_operations udf_adinicb_aops;
extern const struct address_space_operations udf_symlink_aops;
@@ -159,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
struct kernel_lb_addr *, uint32_t *, sector_t *);
+extern int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos);
+extern int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc);
extern int udf_add_aext(struct inode *, struct extent_position *,
struct kernel_lb_addr *, uint32_t, int);
extern void udf_write_aext(struct inode *, struct extent_position *,
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index ab478e6..e788a05 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
if (c < 0x80U)
utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
else if (c < 0x800U) {
+ if (utf_o->u_len > (UDF_NAME_LEN - 4))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xc0 | (c >> 6));
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0x80 | (c & 0x3f));
} else {
+ if (utf_o->u_len > (UDF_NAME_LEN - 5))
+ break;
utf_o->u_name[utf_o->u_len++] =
(uint8_t)(0xe0 | (c >> 12));
utf_o->u_name[utf_o->u_len++] =
@@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
{
unsigned c, i, max_val, utf_char;
- int utf_cnt, u_len;
+ int utf_cnt, u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
utf_char = 0U;
utf_cnt = 0U;
for (i = 0U; i < utf->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
+
c = (uint8_t)utf->u_name[i];
/* Complete a multi-byte UTF-8 character */
@@ -225,6 +234,7 @@ try_again:
if (max_val == 0xffU) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
goto error_out;
@@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
c = (c << 8) | ocu[i++];
len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - utf_o->u_len);
+ UDF_NAME_LEN - 2 - utf_o->u_len);
/* Valid character? */
if (len >= 0)
utf_o->u_len += len;
@@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
int len;
unsigned i, max_val;
uint16_t uni_char;
- int u_len;
+ int u_len, u_ch;
memset(ocu, 0, sizeof(dstring) * length);
ocu[0] = 8;
max_val = 0xffU;
+ u_ch = 1;
try_again:
u_len = 0U;
for (i = 0U; i < uni->u_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + 1 + u_ch >= length)
+ return 0;
len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
if (!len)
continue;
@@ -316,6 +330,7 @@ try_again:
if (uni_char > max_val) {
max_val = 0xffffU;
ocu[0] = (uint8_t)0x10U;
+ u_ch = 2;
goto try_again;
}
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index 4d0e02b..ec4a6b4 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -5,5 +5,5 @@
obj-$(CONFIG_UFS_FS) += ufs.o
ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
- namei.o super.o symlink.o truncate.o util.o
+ namei.o super.o util.o
ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7106ed..dc5fae6 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -417,12 +417,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
if (oldcount == 0) {
result = ufs_alloc_fragments (inode, cgno, goal, count, err);
if (result) {
+ ufs_clear_frags(inode, result + oldcount,
+ newcount - oldcount, locked_page != NULL);
+ write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
*err = 0;
UFS_I(inode)->i_lastfrag =
max(UFS_I(inode)->i_lastfrag, fragment + count);
- ufs_clear_frags(inode, result + oldcount,
- newcount - oldcount, locked_page != NULL);
}
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -473,7 +475,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
uspi->s_sbbase + result, locked_page);
+ write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index f913a69..d897e16 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,9 +41,7 @@
#include "swab.h"
#include "util.h"
-static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
-
-static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
+static int ufs_block_to_path(struct inode *inode, sector_t i_block, unsigned offsets[4])
{
struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
int ptrs = uspi->s_apb;
@@ -75,227 +73,232 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
return n;
}
+typedef struct {
+ void *p;
+ union {
+ __fs32 key32;
+ __fs64 key64;
+ };
+ struct buffer_head *bh;
+} Indirect;
+
+static inline int grow_chain32(struct ufs_inode_info *ufsi,
+ struct buffer_head *bh, __fs32 *v,
+ Indirect *from, Indirect *to)
+{
+ Indirect *p;
+ unsigned seq;
+ to->bh = bh;
+ do {
+ seq = read_seqbegin(&ufsi->meta_lock);
+ to->key32 = *(__fs32 *)(to->p = v);
+ for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++)
+ ;
+ } while (read_seqretry(&ufsi->meta_lock, seq));
+ return (p > to);
+}
+
+static inline int grow_chain64(struct ufs_inode_info *ufsi,
+ struct buffer_head *bh, __fs64 *v,
+ Indirect *from, Indirect *to)
+{
+ Indirect *p;
+ unsigned seq;
+ to->bh = bh;
+ do {
+ seq = read_seqbegin(&ufsi->meta_lock);
+ to->key64 = *(__fs64 *)(to->p = v);
+ for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++)
+ ;
+ } while (read_seqretry(&ufsi->meta_lock, seq));
+ return (p > to);
+}
+
/*
* Returns the location of the fragment from
* the beginning of the filesystem.
*/
-static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
+static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift;
int shift = uspi->s_apbshift-uspi->s_fpbshift;
- sector_t offsets[4], *p;
- int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets);
- u64 ret = 0L;
- __fs32 block;
- __fs64 u2_block = 0L;
+ Indirect chain[4], *q = chain;
+ unsigned *p;
unsigned flags = UFS_SB(sb)->s_flags;
- u64 temp = 0L;
+ u64 res = 0;
- UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth);
UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",
uspi->s_fpbshift, uspi->s_apbmask,
(unsigned long long)mask);
if (depth == 0)
- return 0;
+ goto no_block;
+again:
p = offsets;
- if (needs_lock)
- lock_ufs(sb);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
goto ufs2;
- block = ufsi->i_u1.i_data[*p++];
- if (!block)
- goto out;
+ if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q))
+ goto changed;
+ if (!q->key32)
+ goto no_block;
while (--depth) {
+ __fs32 *ptr;
struct buffer_head *bh;
- sector_t n = *p++;
+ unsigned n = *p++;
- bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift));
+ bh = sb_bread(sb, uspi->s_sbbase +
+ fs32_to_cpu(sb, q->key32) + (n>>shift));
if (!bh)
- goto out;
- block = ((__fs32 *) bh->b_data)[n & mask];
- brelse (bh);
- if (!block)
- goto out;
- }
- ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask));
- goto out;
-ufs2:
- u2_block = ufsi->i_u1.u2_i_data[*p++];
- if (!u2_block)
- goto out;
+ goto no_block;
+ ptr = (__fs32 *)bh->b_data + (n & mask);
+ if (!grow_chain32(ufsi, bh, ptr, chain, ++q))
+ goto changed;
+ if (!q->key32)
+ goto no_block;
+ }
+ res = fs32_to_cpu(sb, q->key32);
+ goto found;
+ufs2:
+ if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q))
+ goto changed;
+ if (!q->key64)
+ goto no_block;
while (--depth) {
+ __fs64 *ptr;
struct buffer_head *bh;
- sector_t n = *p++;
-
+ unsigned n = *p++;
- temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block);
- bh = sb_bread(sb, temp +(u64) (n>>shift));
+ bh = sb_bread(sb, uspi->s_sbbase +
+ fs64_to_cpu(sb, q->key64) + (n>>shift));
if (!bh)
- goto out;
- u2_block = ((__fs64 *)bh->b_data)[n & mask];
- brelse(bh);
- if (!u2_block)
- goto out;
+ goto no_block;
+ ptr = (__fs64 *)bh->b_data + (n & mask);
+ if (!grow_chain64(ufsi, bh, ptr, chain, ++q))
+ goto changed;
+ if (!q->key64)
+ goto no_block;
}
- temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block);
- ret = temp + (u64) (frag & uspi->s_fpbmask);
+ res = fs64_to_cpu(sb, q->key64);
+found:
+ res += uspi->s_sbbase;
+no_block:
+ while (q > chain) {
+ brelse(q->bh);
+ q--;
+ }
+ return res;
-out:
- if (needs_lock)
- unlock_ufs(sb);
- return ret;
+changed:
+ while (q > chain) {
+ brelse(q->bh);
+ q--;
+ }
+ goto again;
+}
+
+/*
+ * Unpacking tails: we have a file with partial final block and
+ * we had been asked to extend it. If the fragment being written
+ * is within the same block, we need to extend the tail just to cover
+ * that fragment. Otherwise the tail is extended to full block.
+ *
+ * Note that we might need to create a _new_ tail, but that will
+ * be handled elsewhere; this is strictly for resizing old
+ * ones.
+ */
+static bool
+ufs_extend_tail(struct inode *inode, u64 writes_to,
+ int *err, struct page *locked_page)
+{
+ struct ufs_inode_info *ufsi = UFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned lastfrag = ufsi->i_lastfrag; /* it's a short file, so unsigned is enough */
+ unsigned block = ufs_fragstoblks(lastfrag);
+ unsigned new_size;
+ void *p;
+ u64 tmp;
+
+ if (writes_to < (lastfrag | uspi->s_fpbmask))
+ new_size = (writes_to & uspi->s_fpbmask) + 1;
+ else
+ new_size = uspi->s_fpb;
+
+ p = ufs_get_direct_data_ptr(uspi, ufsi, block);
+ tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p),
+ new_size, err, locked_page);
+ return tmp != 0;
}
/**
* ufs_inode_getfrag() - allocate new fragment(s)
* @inode: pointer to inode
- * @fragment: number of `fragment' which hold pointer
- * to new allocated fragment(s)
+ * @index: number of block pointer within the inode's array.
* @new_fragment: number of new allocated fragment(s)
- * @required: how many fragment(s) we require
* @err: we set it if something wrong
- * @phys: pointer to where we save physical number of new allocated fragments,
- * NULL if we allocate not data(indirect blocks for example).
* @new: we set it if we allocate new block
* @locked_page: for ufs_new_fragments()
*/
-static struct buffer_head *
-ufs_inode_getfrag(struct inode *inode, u64 fragment,
- sector_t new_fragment, unsigned int required, int *err,
- long *phys, int *new, struct page *locked_page)
+static u64
+ufs_inode_getfrag(struct inode *inode, unsigned index,
+ sector_t new_fragment, int *err,
+ int *new, struct page *locked_page)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct buffer_head * result;
- unsigned blockoff, lastblockoff;
- u64 tmp, goal, lastfrag, block, lastblock;
- void *p, *p2;
-
- UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, "
- "metadata %d\n", inode->i_ino, (unsigned long long)fragment,
- (unsigned long long)new_fragment, required, !phys);
+ u64 tmp, goal, lastfrag;
+ unsigned nfrags = uspi->s_fpb;
+ void *p;
/* TODO : to be done for write support
if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
goto ufs2;
*/
- block = ufs_fragstoblks (fragment);
- blockoff = ufs_fragnum (fragment);
- p = ufs_get_direct_data_ptr(uspi, ufsi, block);
-
- goal = 0;
-
-repeat:
+ p = ufs_get_direct_data_ptr(uspi, ufsi, index);
tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (tmp)
+ goto out;
lastfrag = ufsi->i_lastfrag;
- if (tmp && fragment < lastfrag) {
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- if (tmp == ufs_data_ptr_to_cpu(sb, p)) {
- UFSD("EXIT, result %llu\n",
- (unsigned long long)tmp + blockoff);
- return result;
- }
- brelse (result);
- goto repeat;
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
- return NULL;
- }
- }
- lastblock = ufs_fragstoblks (lastfrag);
- lastblockoff = ufs_fragnum (lastfrag);
- /*
- * We will extend file into new block beyond last allocated block
- */
- if (lastblock < block) {
- /*
- * We must reallocate last allocated block
- */
- if (lastblockoff) {
- p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock);
- tmp = ufs_new_fragments(inode, p2, lastfrag,
- ufs_data_ptr_to_cpu(sb, p2),
- uspi->s_fpb - lastblockoff,
- err, locked_page);
- if (!tmp) {
- if (lastfrag != ufsi->i_lastfrag)
- goto repeat;
- else
- return NULL;
- }
- lastfrag = ufsi->i_lastfrag;
-
- }
- tmp = ufs_data_ptr_to_cpu(sb,
- ufs_get_direct_data_ptr(uspi, ufsi,
- lastblock));
- if (tmp)
- goal = tmp + uspi->s_fpb;
- tmp = ufs_new_fragments (inode, p, fragment - blockoff,
- goal, required + blockoff,
- err,
- phys != NULL ? locked_page : NULL);
- } else if (lastblock == block) {
- /*
- * We will extend last allocated block
- */
- tmp = ufs_new_fragments(inode, p, fragment -
- (blockoff - lastblockoff),
- ufs_data_ptr_to_cpu(sb, p),
- required + (blockoff - lastblockoff),
- err, phys != NULL ? locked_page : NULL);
- } else /* (lastblock > block) */ {
- /*
- * We will allocate new block before last allocated block
- */
- if (block) {
- tmp = ufs_data_ptr_to_cpu(sb,
- ufs_get_direct_data_ptr(uspi, ufsi, block - 1));
- if (tmp)
- goal = tmp + uspi->s_fpb;
- }
- tmp = ufs_new_fragments(inode, p, fragment - blockoff,
- goal, uspi->s_fpb, err,
- phys != NULL ? locked_page : NULL);
+ /* will that be a new tail? */
+ if (new_fragment < UFS_NDIR_FRAGMENT && new_fragment >= lastfrag)
+ nfrags = (new_fragment & uspi->s_fpbmask) + 1;
+
+ goal = 0;
+ if (index) {
+ goal = ufs_data_ptr_to_cpu(sb,
+ ufs_get_direct_data_ptr(uspi, ufsi, index - 1));
+ if (goal)
+ goal += uspi->s_fpb;
}
+ tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment),
+ goal, uspi->s_fpb, err, locked_page);
+
if (!tmp) {
- if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) ||
- (blockoff && lastfrag != ufsi->i_lastfrag))
- goto repeat;
*err = -ENOSPC;
- return NULL;
+ return 0;
}
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
- result = NULL;
- *err = 0;
+ if (new)
*new = 1;
- }
-
inode->i_ctime = CURRENT_TIME_SEC;
if (IS_SYNC(inode))
ufs_sync_inode (inode);
mark_inode_dirty(inode);
- UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff);
- return result;
+out:
+ return tmp + uspi->s_sbbase;
/* This part : To be implemented ....
Required only for writing, not required for READ-ONLY.
@@ -316,95 +319,70 @@ repeat2:
/**
* ufs_inode_getblock() - allocate new block
* @inode: pointer to inode
- * @bh: pointer to block which hold "pointer" to new allocated block
- * @fragment: number of `fragment' which hold pointer
- * to new allocated block
+ * @ind_block: block number of the indirect block
+ * @index: number of pointer within the indirect block
* @new_fragment: number of new allocated fragment
* (block will hold this fragment and also uspi->s_fpb-1)
* @err: see ufs_inode_getfrag()
- * @phys: see ufs_inode_getfrag()
* @new: see ufs_inode_getfrag()
* @locked_page: see ufs_inode_getfrag()
*/
-static struct buffer_head *
-ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
- u64 fragment, sector_t new_fragment, int *err,
- long *phys, int *new, struct page *locked_page)
+static u64
+ufs_inode_getblock(struct inode *inode, u64 ind_block,
+ unsigned index, sector_t new_fragment, int *err,
+ int *new, struct page *locked_page)
{
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct buffer_head * result;
- unsigned blockoff;
- u64 tmp, goal, block;
+ int shift = uspi->s_apbshift - uspi->s_fpbshift;
+ u64 tmp = 0, goal;
+ struct buffer_head *bh;
void *p;
- block = ufs_fragstoblks (fragment);
- blockoff = ufs_fragnum (fragment);
-
- UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n",
- inode->i_ino, (unsigned long long)fragment,
- (unsigned long long)new_fragment, !phys);
+ if (!ind_block)
+ return 0;
- result = NULL;
- if (!bh)
- goto out;
- if (!buffer_uptodate(bh)) {
- ll_rw_block (READ, 1, &bh);
- wait_on_buffer (bh);
- if (!buffer_uptodate(bh))
- goto out;
+ bh = sb_bread(sb, ind_block + (index >> shift));
+ if (unlikely(!bh)) {
+ *err = -EIO;
+ return 0;
}
+
+ index &= uspi->s_apbmask >> uspi->s_fpbshift;
if (uspi->fs_magic == UFS2_MAGIC)
- p = (__fs64 *)bh->b_data + block;
+ p = (__fs64 *)bh->b_data + index;
else
- p = (__fs32 *)bh->b_data + block;
-repeat:
+ p = (__fs32 *)bh->b_data + index;
+
tmp = ufs_data_ptr_to_cpu(sb, p);
- if (tmp) {
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- if (tmp == ufs_data_ptr_to_cpu(sb, p))
- goto out;
- brelse (result);
- goto repeat;
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
- goto out;
- }
- }
+ if (tmp)
+ goto out;
- if (block && (uspi->fs_magic == UFS2_MAGIC ?
- (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) :
- (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1]))))
+ if (index && (uspi->fs_magic == UFS2_MAGIC ?
+ (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[index-1])) :
+ (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[index-1]))))
goal = tmp + uspi->s_fpb;
else
goal = bh->b_blocknr + uspi->s_fpb;
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
uspi->s_fpb, err, locked_page);
- if (!tmp) {
- if (ufs_data_ptr_to_cpu(sb, p))
- goto repeat;
+ if (!tmp)
goto out;
- }
-
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
+ if (new)
*new = 1;
- }
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- UFSD("result %llu\n", (unsigned long long)tmp + blockoff);
out:
brelse (bh);
UFSD("EXIT\n");
- return result;
+ if (tmp)
+ tmp += uspi->s_sbbase;
+ return tmp;
}
/**
@@ -412,103 +390,64 @@ out:
* readpage, writepage and so on
*/
-int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
{
- struct super_block * sb = inode->i_sb;
- struct ufs_sb_info * sbi = UFS_SB(sb);
- struct ufs_sb_private_info * uspi = sbi->s_uspi;
- struct buffer_head * bh;
- int ret, err, new;
- unsigned long ptr,phys;
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ int err = 0, new = 0;
+ unsigned offsets[4];
+ int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets);
u64 phys64 = 0;
- bool needs_lock = (sbi->mutex_owner != current);
-
+ unsigned frag = fragment & uspi->s_fpbmask;
+
if (!create) {
- phys64 = ufs_frag_map(inode, fragment, needs_lock);
- UFSD("phys64 = %llu\n", (unsigned long long)phys64);
- if (phys64)
- map_bh(bh_result, sb, phys64);
- return 0;
+ phys64 = ufs_frag_map(inode, offsets, depth);
+ goto out;
}
/* This code entered only while writing ....? */
- err = -EIO;
- new = 0;
- ret = 0;
- bh = NULL;
-
- if (needs_lock)
- lock_ufs(sb);
+ mutex_lock(&UFS_I(inode)->truncate_mutex);
UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
- if (fragment >
- ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
- << uspi->s_fpbshift))
- goto abort_too_big;
-
- err = 0;
- ptr = fragment;
-
- /*
- * ok, these macros clean the logic up a bit and make
- * it much more readable:
- */
-#define GET_INODE_DATABLOCK(x) \
- ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\
- bh_result->b_page)
-#define GET_INODE_PTR(x) \
- ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\
- bh_result->b_page)
-#define GET_INDIRECT_DATABLOCK(x) \
- ufs_inode_getblock(inode, bh, x, fragment, \
- &err, &phys, &new, bh_result->b_page)
-#define GET_INDIRECT_PTR(x) \
- ufs_inode_getblock(inode, bh, x, fragment, \
- &err, NULL, NULL, NULL)
-
- if (ptr < UFS_NDIR_FRAGMENT) {
- bh = GET_INODE_DATABLOCK(ptr);
+ if (unlikely(!depth)) {
+ ufs_warning(sb, "ufs_get_block", "block > big");
+ err = -EIO;
goto out;
}
- ptr -= UFS_NDIR_FRAGMENT;
- if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) {
- bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift));
- goto get_indirect;
- }
- ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift);
- if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) {
- bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift));
- goto get_double;
- }
- ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift);
- bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift));
- bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask);
-get_double:
- bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask);
-get_indirect:
- bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask);
-
-#undef GET_INODE_DATABLOCK
-#undef GET_INODE_PTR
-#undef GET_INDIRECT_DATABLOCK
-#undef GET_INDIRECT_PTR
-out:
- if (err)
- goto abort;
- if (new)
- set_buffer_new(bh_result);
- map_bh(bh_result, sb, phys);
-abort:
- if (needs_lock)
- unlock_ufs(sb);
+ if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) {
+ unsigned lastfrag = UFS_I(inode)->i_lastfrag;
+ unsigned tailfrags = lastfrag & uspi->s_fpbmask;
+ if (tailfrags && fragment >= lastfrag) {
+ if (!ufs_extend_tail(inode, fragment,
+ &err, bh_result->b_page))
+ goto out;
+ }
+ }
+ if (depth == 1) {
+ phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
+ &err, &new, bh_result->b_page);
+ } else {
+ int i;
+ phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
+ &err, NULL, NULL);
+ for (i = 1; i < depth - 1; i++)
+ phys64 = ufs_inode_getblock(inode, phys64, offsets[i],
+ fragment, &err, NULL, NULL);
+ phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1],
+ fragment, &err, &new, bh_result->b_page);
+ }
+out:
+ if (phys64) {
+ phys64 += frag;
+ map_bh(bh_result, sb, phys64);
+ if (new)
+ set_buffer_new(bh_result);
+ }
+ mutex_unlock(&UFS_I(inode)->truncate_mutex);
return err;
-
-abort_too_big:
- ufs_warning(sb, "ufs_get_block", "block > big");
- goto abort;
}
static int ufs_writepage(struct page *page, struct writeback_control *wbc)
@@ -526,12 +465,16 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ufs_getfrag_block);
}
+static void ufs_truncate_blocks(struct inode *);
+
static void ufs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
- if (to > inode->i_size)
+ if (to > inode->i_size) {
truncate_pagecache(inode, inode->i_size);
+ ufs_truncate_blocks(inode);
+ }
}
static int ufs_write_begin(struct file *file, struct address_space *mapping,
@@ -548,6 +491,18 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int ufs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ret < len)
+ ufs_write_failed(mapping, pos + len);
+ return ret;
+}
+
static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,ufs_getfrag_block);
@@ -557,7 +512,7 @@ const struct address_space_operations ufs_aops = {
.readpage = ufs_readpage,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
- .write_end = generic_write_end,
+ .write_end = ufs_write_end,
.bmap = ufs_bmap
};
@@ -573,11 +528,12 @@ static void ufs_set_inode_ops(struct inode *inode)
inode->i_mapping->a_ops = &ufs_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (!inode->i_blocks) {
- inode->i_op = &ufs_fast_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+ inode->i_op = &simple_symlink_inode_operations;
} else {
- inode->i_op = &ufs_symlink_inode_operations;
inode->i_mapping->a_ops = &ufs_aops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
}
} else
init_special_inode(inode, inode->i_mode,
@@ -599,7 +555,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
return -1;
}
-
+
/*
* Linux now has 32-bit uid and gid, so we can support EFT.
*/
@@ -619,7 +575,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
-
+
if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr,
sizeof(ufs_inode->ui_u2.ui_addr));
@@ -753,7 +709,7 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode));
ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
-
+
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
ufs_inode->ui_atime.tv_usec = 0;
@@ -855,23 +811,19 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
}
-
+
mark_buffer_dirty(bh);
if (do_sync)
sync_dirty_buffer(bh);
brelse (bh);
-
+
UFSD("EXIT\n");
return 0;
}
int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- int ret;
- lock_ufs(inode->i_sb);
- ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
- unlock_ufs(inode->i_sb);
- return ret;
+ return ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
int ufs_sync_inode (struct inode *inode)
@@ -888,24 +840,389 @@ void ufs_evict_inode(struct inode * inode)
truncate_inode_pages_final(&inode->i_data);
if (want_delete) {
- loff_t old_i_size;
- /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
- lock_ufs(inode->i_sb);
- mark_inode_dirty(inode);
- ufs_update_inode(inode, IS_SYNC(inode));
- old_i_size = inode->i_size;
inode->i_size = 0;
- if (inode->i_blocks && ufs_truncate(inode, old_i_size))
- ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
- unlock_ufs(inode->i_sb);
+ if (inode->i_blocks)
+ ufs_truncate_blocks(inode);
}
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete) {
- lock_ufs(inode->i_sb);
+ if (want_delete)
ufs_free_inode(inode);
- unlock_ufs(inode->i_sb);
+}
+
+struct to_free {
+ struct inode *inode;
+ u64 to;
+ unsigned count;
+};
+
+static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
+{
+ if (ctx->count && ctx->to != from) {
+ ufs_free_blocks(ctx->inode, ctx->to - ctx->count, ctx->count);
+ ctx->count = 0;
+ }
+ ctx->count += count;
+ ctx->to = from + count;
+}
+
+#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
+#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
+
+static void ufs_trunc_direct(struct inode *inode)
+{
+ struct ufs_inode_info *ufsi = UFS_I(inode);
+ struct super_block * sb;
+ struct ufs_sb_private_info * uspi;
+ void *p;
+ u64 frag1, frag2, frag3, frag4, block1, block2;
+ struct to_free ctx = {.inode = inode};
+ unsigned i, tmp;
+
+ UFSD("ENTER: ino %lu\n", inode->i_ino);
+
+ sb = inode->i_sb;
+ uspi = UFS_SB(sb)->s_uspi;
+
+ frag1 = DIRECT_FRAGMENT;
+ frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+ frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
+ frag3 = frag4 & ~uspi->s_fpbmask;
+ block1 = block2 = 0;
+ if (frag2 > frag3) {
+ frag2 = frag4;
+ frag3 = frag4 = 0;
+ } else if (frag2 < frag3) {
+ block1 = ufs_fragstoblks (frag2);
+ block2 = ufs_fragstoblks (frag3);
+ }
+
+ UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
+ " frag3 %llu, frag4 %llu\n", inode->i_ino,
+ (unsigned long long)frag1, (unsigned long long)frag2,
+ (unsigned long long)block1, (unsigned long long)block2,
+ (unsigned long long)frag3, (unsigned long long)frag4);
+
+ if (frag1 >= frag2)
+ goto next1;
+
+ /*
+ * Free first free fragments
+ */
+ p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
+ tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp )
+ ufs_panic (sb, "ufs_trunc_direct", "internal error");
+ frag2 -= frag1;
+ frag1 = ufs_fragnum (frag1);
+
+ ufs_free_fragments(inode, tmp + frag1, frag2);
+
+next1:
+ /*
+ * Free whole blocks
+ */
+ for (i = block1 ; i < block2; i++) {
+ p = ufs_get_direct_data_ptr(uspi, ufsi, i);
+ tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ continue;
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+
+ free_data(&ctx, tmp, uspi->s_fpb);
+ }
+
+ free_data(&ctx, 0, 0);
+
+ if (frag3 >= frag4)
+ goto next3;
+
+ /*
+ * Free last free fragments
+ */
+ p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
+ tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp )
+ ufs_panic(sb, "ufs_truncate_direct", "internal error");
+ frag4 = ufs_fragnum (frag4);
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+
+ ufs_free_fragments (inode, tmp, frag4);
+ next3:
+
+ UFSD("EXIT: ino %lu\n", inode->i_ino);
+}
+
+static void free_full_branch(struct inode *inode, u64 ind_block, int depth)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ struct ufs_buffer_head *ubh = ubh_bread(sb, ind_block, uspi->s_bsize);
+ unsigned i;
+
+ if (!ubh)
+ return;
+
+ if (--depth) {
+ for (i = 0; i < uspi->s_apb; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block)
+ free_full_branch(inode, block, depth);
+ }
+ } else {
+ struct to_free ctx = {.inode = inode};
+
+ for (i = 0; i < uspi->s_apb; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block)
+ free_data(&ctx, block, uspi->s_fpb);
+ }
+ free_data(&ctx, 0, 0);
+ }
+
+ ubh_bforget(ubh);
+ ufs_free_blocks(inode, ind_block, uspi->s_fpb);
+}
+
+static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned i;
+
+ if (--depth) {
+ for (i = from; i < uspi->s_apb ; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block) {
+ write_seqlock(&UFS_I(inode)->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
+ ubh_mark_buffer_dirty(ubh);
+ free_full_branch(inode, block, depth);
+ }
+ }
+ } else {
+ struct to_free ctx = {.inode = inode};
+
+ for (i = from; i < uspi->s_apb; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block) {
+ write_seqlock(&UFS_I(inode)->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
+ ubh_mark_buffer_dirty(ubh);
+ free_data(&ctx, block, uspi->s_fpb);
+ }
+ }
+ free_data(&ctx, 0, 0);
+ }
+ if (IS_SYNC(inode) && ubh_buffer_dirty(ubh))
+ ubh_sync_block(ubh);
+ ubh_brelse(ubh);
+}
+
+static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
+{
+ int err = 0;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned i, end;
+ sector_t lastfrag;
+ struct page *lastpage;
+ struct buffer_head *bh;
+ u64 phys64;
+
+ lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
+
+ if (!lastfrag)
+ goto out;
+
+ lastfrag--;
+
+ lastpage = ufs_get_locked_page(mapping, lastfrag >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits));
+ if (IS_ERR(lastpage)) {
+ err = -EIO;
+ goto out;
+ }
+
+ end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+ bh = page_buffers(lastpage);
+ for (i = 0; i < end; ++i)
+ bh = bh->b_this_page;
+
+
+ err = ufs_getfrag_block(inode, lastfrag, bh, 1);
+
+ if (unlikely(err))
+ goto out_unlock;
+
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ /*
+ * we do not zeroize fragment, because of
+ * if it maped to hole, it already contains zeroes
+ */
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ set_page_dirty(lastpage);
+ }
+
+ if (lastfrag >= UFS_IND_FRAGMENT) {
+ end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1;
+ phys64 = bh->b_blocknr + 1;
+ for (i = 0; i < end; ++i) {
+ bh = sb_getblk(sb, i + phys64);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+ }
+ }
+out_unlock:
+ ufs_put_locked_page(lastpage);
+out:
+ return err;
+}
+
+static void __ufs_truncate_blocks(struct inode *inode)
+{
+ struct ufs_inode_info *ufsi = UFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned offsets[4];
+ int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets);
+ int depth2;
+ unsigned i;
+ struct ufs_buffer_head *ubh[3];
+ void *p;
+ u64 block;
+
+ if (!depth)
+ return;
+
+ /* find the last non-zero in offsets[] */
+ for (depth2 = depth - 1; depth2; depth2--)
+ if (offsets[depth2])
+ break;
+
+ mutex_lock(&ufsi->truncate_mutex);
+ if (depth == 1) {
+ ufs_trunc_direct(inode);
+ offsets[0] = UFS_IND_BLOCK;
+ } else {
+ /* get the blocks that should be partially emptied */
+ p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]);
+ for (i = 0; i < depth2; i++) {
+ offsets[i]++; /* next branch is fully freed */
+ block = ufs_data_ptr_to_cpu(sb, p);
+ if (!block)
+ break;
+ ubh[i] = ubh_bread(sb, block, uspi->s_bsize);
+ if (!ubh[i]) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ break;
+ }
+ p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]);
+ }
+ while (i--)
+ free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1);
+ }
+ for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) {
+ p = ufs_get_direct_data_ptr(uspi, ufsi, i);
+ block = ufs_data_ptr_to_cpu(sb, p);
+ if (block) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ free_full_branch(inode, block, i - UFS_IND_BLOCK + 1);
+ }
}
+ ufsi->i_lastfrag = DIRECT_FRAGMENT;
+ mark_inode_dirty(inode);
+ mutex_unlock(&ufsi->truncate_mutex);
+}
+
+static int ufs_truncate(struct inode *inode, loff_t size)
+{
+ int err = 0;
+
+ UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n",
+ inode->i_ino, (unsigned long long)size,
+ (unsigned long long)i_size_read(inode));
+
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return -EINVAL;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ err = ufs_alloc_lastblock(inode, size);
+
+ if (err)
+ goto out;
+
+ block_truncate_page(inode->i_mapping, size, ufs_getfrag_block);
+
+ truncate_setsize(inode, size);
+
+ __ufs_truncate_blocks(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+out:
+ UFSD("EXIT: err %d\n", err);
+ return err;
+}
+
+void ufs_truncate_blocks(struct inode *inode)
+{
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+ __ufs_truncate_blocks(inode);
+}
+
+int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ unsigned int ia_valid = attr->ia_valid;
+ int error;
+
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ error = ufs_truncate(inode, attr->ia_size);
+ if (error)
+ return error;
+ }
+
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
}
+
+const struct inode_operations ufs_file_inode_operations = {
+ .setattr = ufs_setattr,
+};
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 4796655..acf4a3b 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -123,14 +123,15 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
- inode->i_op = &ufs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &ufs_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
} else {
/* fast symlink */
- inode->i_op = &ufs_fast_symlink_inode_operations;
+ inode->i_op = &simple_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 250579a..442fd52 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -94,22 +94,6 @@
#include "swab.h"
#include "util.h"
-void lock_ufs(struct super_block *sb)
-{
- struct ufs_sb_info *sbi = UFS_SB(sb);
-
- mutex_lock(&sbi->mutex);
- sbi->mutex_owner = current;
-}
-
-void unlock_ufs(struct super_block *sb)
-{
- struct ufs_sb_info *sbi = UFS_SB(sb);
-
- sbi->mutex_owner = NULL;
- mutex_unlock(&sbi->mutex);
-}
-
static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
{
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -694,7 +678,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
struct ufs_super_block_third * usb3;
unsigned flags;
- lock_ufs(sb);
mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
@@ -714,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
UFSD("EXIT\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return 0;
}
@@ -758,7 +740,6 @@ static void ufs_put_super(struct super_block *sb)
ubh_brelse_uspi (sbi->s_uspi);
kfree (sbi->s_uspi);
- mutex_destroy(&sbi->mutex);
kfree (sbi);
sb->s_fs_info = NULL;
UFSD("EXIT\n");
@@ -801,7 +782,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
- mutex_init(&sbi->mutex);
mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
@@ -1257,7 +1237,6 @@ magic_found:
return 0;
failed:
- mutex_destroy(&sbi->mutex);
if (ubh)
ubh_brelse_uspi (uspi);
kfree (uspi);
@@ -1280,7 +1259,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
unsigned flags;
sync_filesystem(sb);
- lock_ufs(sb);
mutex_lock(&UFS_SB(sb)->s_lock);
uspi = UFS_SB(sb)->s_uspi;
flags = UFS_SB(sb)->s_flags;
@@ -1296,7 +1274,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
}
if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1304,14 +1281,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
pr_err("ufstype can't be changed during remount\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return 0;
}
@@ -1335,7 +1310,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#ifndef CONFIG_UFS_FS_WRITE
pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
#else
if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1345,13 +1319,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
pr_err("this ufstype is read-only supported\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
pr_err("failed during remounting\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EPERM;
}
sb->s_flags &= ~MS_RDONLY;
@@ -1359,7 +1331,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return 0;
}
@@ -1391,8 +1362,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ufs_super_block_third *usb3;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- lock_ufs(sb);
-
+ mutex_lock(&UFS_SB(sb)->s_lock);
usb3 = ubh_get_usb_third(uspi);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1413,7 +1383,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
@@ -1429,6 +1399,8 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
+ seqlock_init(&ei->meta_lock);
+ mutex_init(&ei->truncate_mutex);
return &ei->vfs_inode;
}
@@ -1455,7 +1427,7 @@ static int __init init_inodecache(void)
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ufs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
deleted file mode 100644
index 874480b..0000000
--- a/fs/ufs/symlink.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * linux/fs/ufs/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@emai.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- * from
- *
- * linux/fs/ext2/symlink.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/symlink.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext2 symlink handling code
- */
-
-#include "ufs_fs.h"
-#include "ufs.h"
-
-const struct inode_operations ufs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = simple_follow_link,
- .setattr = ufs_setattr,
-};
-
-const struct inode_operations ufs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
- .setattr = ufs_setattr,
-};
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
deleted file mode 100644
index 2115470..0000000
--- a/fs/ufs/truncate.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * linux/fs/ufs/truncate.c
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@email.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- * from
- *
- * linux/fs/ext2/truncate.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/truncate.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-/*
- * Real random numbers for secure rm added 94/02/18
- * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr>
- */
-
-/*
- * Adoptation to use page cache and UFS2 write support by
- * Evgeniy Dushistov <dushistov@mail.ru>, 2006-2007
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/fcntl.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include <linux/sched.h>
-
-#include "ufs_fs.h"
-#include "ufs.h"
-#include "swab.h"
-#include "util.h"
-
-/*
- * Secure deletion currently doesn't work. It interacts very badly
- * with buffers shared with memory mappings, and for that reason
- * can't be done in the truncate() routines. It should instead be
- * done separately in "release()" before calling the truncate routines
- * that will release the actual file blocks.
- *
- * Linus
- */
-
-#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
-#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
-
-
-static int ufs_trunc_direct(struct inode *inode)
-{
- struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- void *p;
- u64 frag1, frag2, frag3, frag4, block1, block2;
- unsigned frag_to_free, free_count;
- unsigned i, tmp;
- int retry;
-
- UFSD("ENTER: ino %lu\n", inode->i_ino);
-
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- frag_to_free = 0;
- free_count = 0;
- retry = 0;
-
- frag1 = DIRECT_FRAGMENT;
- frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
- frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
- frag3 = frag4 & ~uspi->s_fpbmask;
- block1 = block2 = 0;
- if (frag2 > frag3) {
- frag2 = frag4;
- frag3 = frag4 = 0;
- } else if (frag2 < frag3) {
- block1 = ufs_fragstoblks (frag2);
- block2 = ufs_fragstoblks (frag3);
- }
-
- UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
- " frag3 %llu, frag4 %llu\n", inode->i_ino,
- (unsigned long long)frag1, (unsigned long long)frag2,
- (unsigned long long)block1, (unsigned long long)block2,
- (unsigned long long)frag3, (unsigned long long)frag4);
-
- if (frag1 >= frag2)
- goto next1;
-
- /*
- * Free first free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic (sb, "ufs_trunc_direct", "internal error");
- frag2 -= frag1;
- frag1 = ufs_fragnum (frag1);
-
- ufs_free_fragments(inode, tmp + frag1, frag2);
- mark_inode_dirty(inode);
- frag_to_free = tmp + frag1;
-
-next1:
- /*
- * Free whole blocks
- */
- for (i = block1 ; i < block2; i++) {
- p = ufs_get_direct_data_ptr(uspi, ufsi, i);
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp)
- continue;
- ufs_data_ptr_clear(uspi, p);
-
- if (free_count == 0) {
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- } else if (free_count > 0 && frag_to_free == tmp - free_count)
- free_count += uspi->s_fpb;
- else {
- ufs_free_blocks (inode, frag_to_free, free_count);
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- }
- mark_inode_dirty(inode);
- }
-
- if (free_count > 0)
- ufs_free_blocks (inode, frag_to_free, free_count);
-
- if (frag3 >= frag4)
- goto next3;
-
- /*
- * Free last free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic(sb, "ufs_truncate_direct", "internal error");
- frag4 = ufs_fragnum (frag4);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_fragments (inode, tmp, frag4);
- mark_inode_dirty(inode);
- next3:
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
- return retry;
-}
-
-
-static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
-{
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- struct ufs_buffer_head * ind_ubh;
- void *ind;
- u64 tmp, indirect_block, i, frag_to_free;
- unsigned free_count;
- int retry;
-
- UFSD("ENTER: ino %lu, offset %llu, p: %p\n",
- inode->i_ino, (unsigned long long)offset, p);
-
- BUG_ON(!p);
-
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- frag_to_free = 0;
- free_count = 0;
- retry = 0;
-
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp)
- return 0;
- ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize);
- if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
- ubh_brelse (ind_ubh);
- return 1;
- }
- if (!ind_ubh) {
- ufs_data_ptr_clear(uspi, p);
- return 0;
- }
-
- indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0;
- for (i = indirect_block; i < uspi->s_apb; i++) {
- ind = ubh_get_data_ptr(uspi, ind_ubh, i);
- tmp = ufs_data_ptr_to_cpu(sb, ind);
- if (!tmp)
- continue;
-
- ufs_data_ptr_clear(uspi, ind);
- ubh_mark_buffer_dirty(ind_ubh);
- if (free_count == 0) {
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- } else if (free_count > 0 && frag_to_free == tmp - free_count)
- free_count += uspi->s_fpb;
- else {
- ufs_free_blocks (inode, frag_to_free, free_count);
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- }
-
- mark_inode_dirty(inode);
- }
-
- if (free_count > 0) {
- ufs_free_blocks (inode, frag_to_free, free_count);
- }
- for (i = 0; i < uspi->s_apb; i++)
- if (!ufs_is_data_ptr_zero(uspi,
- ubh_get_data_ptr(uspi, ind_ubh, i)))
- break;
- if (i >= uspi->s_apb) {
- tmp = ufs_data_ptr_to_cpu(sb, p);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_blocks (inode, tmp, uspi->s_fpb);
- mark_inode_dirty(inode);
- ubh_bforget(ind_ubh);
- ind_ubh = NULL;
- }
- if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
- ubh_sync_block(ind_ubh);
- ubh_brelse (ind_ubh);
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
-
- return retry;
-}
-
-static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
-{
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- struct ufs_buffer_head *dind_bh;
- u64 i, tmp, dindirect_block;
- void *dind;
- int retry = 0;
-
- UFSD("ENTER: ino %lu\n", inode->i_ino);
-
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- dindirect_block = (DIRECT_BLOCK > offset)
- ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0;
- retry = 0;
-
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp)
- return 0;
- dind_bh = ubh_bread(sb, tmp, uspi->s_bsize);
- if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
- ubh_brelse (dind_bh);
- return 1;
- }
- if (!dind_bh) {
- ufs_data_ptr_clear(uspi, p);
- return 0;
- }
-
- for (i = dindirect_block ; i < uspi->s_apb ; i++) {
- dind = ubh_get_data_ptr(uspi, dind_bh, i);
- tmp = ufs_data_ptr_to_cpu(sb, dind);
- if (!tmp)
- continue;
- retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind);
- ubh_mark_buffer_dirty(dind_bh);
- }
-
- for (i = 0; i < uspi->s_apb; i++)
- if (!ufs_is_data_ptr_zero(uspi,
- ubh_get_data_ptr(uspi, dind_bh, i)))
- break;
- if (i >= uspi->s_apb) {
- tmp = ufs_data_ptr_to_cpu(sb, p);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_blocks(inode, tmp, uspi->s_fpb);
- mark_inode_dirty(inode);
- ubh_bforget(dind_bh);
- dind_bh = NULL;
- }
- if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
- ubh_sync_block(dind_bh);
- ubh_brelse (dind_bh);
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
-
- return retry;
-}
-
-static int ufs_trunc_tindirect(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct ufs_inode_info *ufsi = UFS_I(inode);
- struct ufs_buffer_head * tind_bh;
- u64 tindirect_block, tmp, i;
- void *tind, *p;
- int retry;
-
- UFSD("ENTER: ino %lu\n", inode->i_ino);
-
- retry = 0;
-
- tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb))
- ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0;
-
- p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK);
- if (!(tmp = ufs_data_ptr_to_cpu(sb, p)))
- return 0;
- tind_bh = ubh_bread (sb, tmp, uspi->s_bsize);
- if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
- ubh_brelse (tind_bh);
- return 1;
- }
- if (!tind_bh) {
- ufs_data_ptr_clear(uspi, p);
- return 0;
- }
-
- for (i = tindirect_block ; i < uspi->s_apb ; i++) {
- tind = ubh_get_data_ptr(uspi, tind_bh, i);
- retry |= ufs_trunc_dindirect(inode, UFS_NDADDR +
- uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind);
- ubh_mark_buffer_dirty(tind_bh);
- }
- for (i = 0; i < uspi->s_apb; i++)
- if (!ufs_is_data_ptr_zero(uspi,
- ubh_get_data_ptr(uspi, tind_bh, i)))
- break;
- if (i >= uspi->s_apb) {
- tmp = ufs_data_ptr_to_cpu(sb, p);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_blocks(inode, tmp, uspi->s_fpb);
- mark_inode_dirty(inode);
- ubh_bforget(tind_bh);
- tind_bh = NULL;
- }
- if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
- ubh_sync_block(tind_bh);
- ubh_brelse (tind_bh);
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
- return retry;
-}
-
-static int ufs_alloc_lastblock(struct inode *inode)
-{
- int err = 0;
- struct super_block *sb = inode->i_sb;
- struct address_space *mapping = inode->i_mapping;
- struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- unsigned i, end;
- sector_t lastfrag;
- struct page *lastpage;
- struct buffer_head *bh;
- u64 phys64;
-
- lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
-
- if (!lastfrag)
- goto out;
-
- lastfrag--;
-
- lastpage = ufs_get_locked_page(mapping, lastfrag >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits));
- if (IS_ERR(lastpage)) {
- err = -EIO;
- goto out;
- }
-
- end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
- bh = page_buffers(lastpage);
- for (i = 0; i < end; ++i)
- bh = bh->b_this_page;
-
-
- err = ufs_getfrag_block(inode, lastfrag, bh, 1);
-
- if (unlikely(err))
- goto out_unlock;
-
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
- /*
- * we do not zeroize fragment, because of
- * if it maped to hole, it already contains zeroes
- */
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- set_page_dirty(lastpage);
- }
-
- if (lastfrag >= UFS_IND_FRAGMENT) {
- end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1;
- phys64 = bh->b_blocknr + 1;
- for (i = 0; i < end; ++i) {
- bh = sb_getblk(sb, i + phys64);
- lock_buffer(bh);
- memset(bh->b_data, 0, sb->s_blocksize);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- unlock_buffer(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
- }
- }
-out_unlock:
- ufs_put_locked_page(lastpage);
-out:
- return err;
-}
-
-int ufs_truncate(struct inode *inode, loff_t old_i_size)
-{
- struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block *sb = inode->i_sb;
- struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- int retry, err = 0;
-
- UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n",
- inode->i_ino, (unsigned long long)i_size_read(inode),
- (unsigned long long)old_i_size);
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return -EINVAL;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return -EPERM;
-
- err = ufs_alloc_lastblock(inode);
-
- if (err) {
- i_size_write(inode, old_i_size);
- goto out;
- }
-
- block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
-
- while (1) {
- retry = ufs_trunc_direct(inode);
- retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
- ufs_get_direct_data_ptr(uspi, ufsi,
- UFS_IND_BLOCK));
- retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb,
- ufs_get_direct_data_ptr(uspi, ufsi,
- UFS_DIND_BLOCK));
- retry |= ufs_trunc_tindirect (inode);
- if (!retry)
- break;
- if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
- ufs_sync_inode (inode);
- yield();
- }
-
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
- ufsi->i_lastfrag = DIRECT_FRAGMENT;
- mark_inode_dirty(inode);
-out:
- UFSD("EXIT: err %d\n", err);
- return err;
-}
-
-int ufs_setattr(struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- unsigned int ia_valid = attr->ia_valid;
- int error;
-
- error = inode_change_ok(inode, attr);
- if (error)
- return error;
-
- if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
- loff_t old_i_size = inode->i_size;
-
- /* XXX(truncate): truncate_setsize should be called last */
- truncate_setsize(inode, attr->ia_size);
-
- lock_ufs(inode->i_sb);
- error = ufs_truncate(inode, old_i_size);
- unlock_ufs(inode->i_sb);
- if (error)
- return error;
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-const struct inode_operations ufs_file_inode_operations = {
- .setattr = ufs_setattr,
-};
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 2e31ea2..c87f4c3 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,8 +24,6 @@ struct ufs_sb_info {
unsigned s_cgno[UFS_MAX_GROUP_LOADED];
unsigned short s_cg_loaded;
unsigned s_mount_opt;
- struct mutex mutex;
- struct task_struct *mutex_owner;
struct super_block *sb;
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
@@ -46,6 +44,8 @@ struct ufs_inode_info {
__u32 i_oeftflag;
__u16 i_osync;
__u64 i_lastfrag;
+ seqlock_t meta_lock;
+ struct mutex truncate_mutex;
__u32 i_dir_start_lookup;
struct inode vfs_inode;
};
@@ -122,7 +122,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
extern int ufs_write_inode (struct inode *, struct writeback_control *);
extern int ufs_sync_inode (struct inode *);
extern void ufs_evict_inode (struct inode *);
-extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
+extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
/* namei.c */
extern const struct file_operations ufs_dir_operations;
@@ -136,14 +136,6 @@ extern __printf(3, 4)
void ufs_panic(struct super_block *, const char *, const char *, ...);
void ufs_mark_sb_dirty(struct super_block *sb);
-/* symlink.c */
-extern const struct inode_operations ufs_fast_symlink_inode_operations;
-extern const struct inode_operations ufs_symlink_inode_operations;
-
-/* truncate.c */
-extern int ufs_truncate (struct inode *, loff_t);
-extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
-
static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -170,7 +162,4 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
return do_div(b, uspi->s_fpg);
}
-extern void lock_ufs(struct super_block *sb);
-extern void unlock_ufs(struct super_block *sb);
-
#endif /* _UFS_UFS_H */
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 0000000..5031170
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1332 @@
+/*
+ * fs/userfaultfd.c
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Some part derived from fs/eventfd.c (anon inode setup) and
+ * mm/ksm.c (mm hashing).
+ */
+
+#include <linux/hashtable.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
+#include <linux/ioctl.h>
+#include <linux/security.h>
+
+static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+
+enum userfaultfd_state {
+ UFFD_STATE_WAIT_API,
+ UFFD_STATE_RUNNING,
+};
+
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ struct seqcount refile_seq;
+ /* pseudo fd refcounting */
+ atomic_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* state machine */
+ enum userfaultfd_state state;
+ /* released */
+ bool released;
+ /* mm with one ore more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
+struct userfaultfd_wait_queue {
+ struct uffd_msg msg;
+ wait_queue_t wq;
+ struct userfaultfd_ctx *ctx;
+};
+
+struct userfaultfd_wake_range {
+ unsigned long start;
+ unsigned long len;
+};
+
+static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+ int wake_flags, void *key)
+{
+ struct userfaultfd_wake_range *range = key;
+ int ret;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long start, len;
+
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ ret = 0;
+ /* len == 0 means wake all */
+ start = range->start;
+ len = range->len;
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
+ goto out;
+ ret = wake_up_state(wq->private, mode);
+ if (ret)
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the
+ * other CPUs, the waitqueue may disappear from under
+ * us, see the !list_empty_careful() in
+ * handle_userfault(). try_to_wake_up() has an
+ * implicit smp_mb__before_spinlock, and the
+ * wq->private is read before calling the extern
+ * function "wake_up_state" (which in turns calls
+ * try_to_wake_up). While the spin_lock;spin_unlock;
+ * wouldn't be enough, the smp_mb__before_spinlock is
+ * enough to avoid an explicit smp_mb() here.
+ */
+ list_del_init(&wq->task_list);
+out:
+ return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to the userfaultfd context.
+ *
+ * Returns: In case of success, returns not zero.
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+ if (!atomic_inc_not_zero(&ctx->refcount))
+ BUG();
+}
+
+/**
+ * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to userfaultfd context.
+ *
+ * The userfaultfd context reference must have been previously acquired either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ mmput(ctx->mm);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
+}
+
+static inline void msg_init(struct uffd_msg *msg)
+{
+ BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+ /*
+ * Must use memset to zero out the paddings or kernel data is
+ * leaked to userland.
+ */
+ memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ struct uffd_msg msg;
+ msg_init(&msg);
+ msg.event = UFFD_EVENT_PAGEFAULT;
+ msg.arg.pagefault.address = address;
+ if (flags & FAULT_FLAG_WRITE)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+ * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+ * was a read fault, otherwise if set it means it's
+ * a write fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+ if (reason & VM_UFFD_WP)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+ * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+ * a missing fault, otherwise if set it means it's a
+ * write protect fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ return msg;
+}
+
+/*
+ * Verify the pagetables are still not ok after having reigstered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ bool ret = true;
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+ pmd = pmd_offset(pud, address);
+ /*
+ * READ_ONCE must function as a barrier with narrower scope
+ * and it must be equivalent to:
+ * _pmd = *pmd; barrier();
+ *
+ * This is to deal with the instability (as in
+ * pmd_trans_unstable) of the pmd.
+ */
+ _pmd = READ_ONCE(*pmd);
+ if (!pmd_present(_pmd))
+ goto out;
+
+ ret = false;
+ if (pmd_trans_huge(_pmd))
+ goto out;
+
+ /*
+ * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
+ * and use the standard pte_offset_map() instead of parsing _pmd.
+ */
+ pte = pte_offset_map(pmd, address);
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ if (pte_none(*pte))
+ ret = true;
+ pte_unmap(pte);
+
+out:
+ return ret;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_sem must be released before
+ * returning it.
+ */
+int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue uwq;
+ int ret;
+ bool must_wait, return_to_userland;
+
+ BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ ret = VM_FAULT_SIGBUS;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx)
+ goto out;
+
+ BUG_ON(ctx->mm != mm);
+
+ VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
+ VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+
+ /*
+ * If it's already released don't get it. This avoids to loop
+ * in __get_user_pages if userfaultfd_release waits on the
+ * caller of handle_userfault to release the mmap_sem.
+ */
+ if (unlikely(ACCESS_ONCE(ctx->released)))
+ goto out;
+
+ /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+ * even if FAULT_FLAG_TRIED is set without leading to gup()
+ * -EBUSY failures, if the userfaultfd is to be extended for
+ * VM_UFFD_WP tracking and we intend to arm the userfault
+ * without first stopping userland access to the memory. For
+ * VM_UFFD_MISSING userfaults this is enough for now.
+ */
+ if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ /*
+ * Validate the invariant that nowait must allow retry
+ * to be sure not to return SIGBUS erroneously on
+ * nowait invocations.
+ */
+ BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ dump_stack();
+ }
+#endif
+ goto out;
+ }
+
+ /*
+ * Handle nowait, not much to do other than tell it to retry
+ * and wait.
+ */
+ ret = VM_FAULT_RETRY;
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
+ /* take the reference before dropping the mmap_sem */
+ userfaultfd_ctx_get(ctx);
+
+ init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+ uwq.wq.private = current;
+ uwq.msg = userfault_msg(address, flags, reason);
+ uwq.ctx = ctx;
+
+ return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() after __set_current_state prevents the reads
+ * following the spin_unlock to happen before the list_add in
+ * __add_wait_queue.
+ */
+ set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+ TASK_KILLABLE);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ up_read(&mm->mmap_sem);
+
+ if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ (return_to_userland ? !signal_pending(current) :
+ !fatal_signal_pending(current)))) {
+ wake_up_poll(&ctx->fd_wqh, POLLIN);
+ schedule();
+ ret |= VM_FAULT_MAJOR;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (return_to_userland) {
+ if (signal_pending(current) &&
+ !fatal_signal_pending(current)) {
+ /*
+ * If we got a SIGSTOP or SIGCONT and this is
+ * a normal userland page fault, just let
+ * userland return so the signal will be
+ * handled and gdb debugging works. The page
+ * fault code immediately after we return from
+ * this function is going to release the
+ * mmap_sem and it's not depending on it
+ * (unlike gup would if we were not to return
+ * VM_FAULT_RETRY).
+ *
+ * If a fatal signal is pending we still take
+ * the streamlined VM_FAULT_RETRY failure path
+ * and there's no need to retake the mmap_sem
+ * in such case.
+ */
+ down_read(&mm->mmap_sem);
+ ret = 0;
+ }
+ }
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(), however because we don't ever run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let any of the two pointers to point to
+ * self. So list_empty_careful won't risk to see both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function and there we don't re-list_add
+ * and it's fine not to block on the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
+ if (!list_empty_careful(&uwq.wq.task_list)) {
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need of list_del_init(), the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.task_list);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+ userfaultfd_ctx_put(ctx);
+
+out:
+ return ret;
+}
+
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev;
+ /* len == 0 means wake all */
+ struct userfaultfd_wake_range range = { .len = 0, };
+ unsigned long new_flags;
+
+ ACCESS_ONCE(ctx->released) = true;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+ * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
+ * changes while handle_userfault released the mmap_sem. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ up_write(&mm->mmap_sem);
+
+ /*
+ * After no new page faults can wait on this fault_*wqh, flush
+ * the last page faults that may have been already waiting on
+ * the fault_*wqh.
+ */
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, POLLHUP);
+ userfaultfd_ctx_put(ctx);
+ return 0;
+}
+
+/* fault_pending_wqh.lock must be hold by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+ struct userfaultfd_ctx *ctx)
+{
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+
+ VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+
+ uwq = NULL;
+ if (!waitqueue_active(&ctx->fault_pending_wqh))
+ goto out;
+ /* walk in reverse to provide FIFO behavior to read userfaults */
+ wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+ typeof(*wq), task_list);
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+ return uwq;
+}
+
+static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ unsigned int ret;
+
+ poll_wait(file, &ctx->fd_wqh, wait);
+
+ switch (ctx->state) {
+ case UFFD_STATE_WAIT_API:
+ return POLLERR;
+ case UFFD_STATE_RUNNING:
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be waken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
+ return POLLERR;
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = POLLIN;
+ return ret;
+ default:
+ BUG();
+ }
+}
+
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+ struct uffd_msg *msg)
+{
+ ssize_t ret;
+ DECLARE_WAITQUEUE(wait, current);
+ struct userfaultfd_wait_queue *uwq;
+
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
+ spin_lock(&ctx->fd_wqh.lock);
+ __add_wait_queue(&ctx->fd_wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
+ /*
+ * Use a seqcount to repeat the lockless check
+ * in wake_userfault() to avoid missing
+ * wakeups because during the refile both
+ * waitqueue could become empty if this is the
+ * only userfault.
+ */
+ write_seqcount_begin(&ctx->refile_seq);
+
+ /*
+ * The fault_pending_wqh.lock prevents the uwq
+ * to disappear from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh, it's not
+ * pending anymore after we read it.
+ *
+ * Use list_del() by hand (as
+ * userfaultfd_wake_function also uses
+ * list_del_init() by hand) to be sure nobody
+ * changes __remove_wait_queue() to use
+ * list_del_init() in turn breaking the
+ * !list_empty_careful() check in
+ * handle_userfault(). The uwq->wq.task_list
+ * must never be empty at any time during the
+ * refile, or the waitqueue could disappear
+ * from under us. The "wait_queue_head_t"
+ * parameter of __remove_wait_queue() is unused
+ * anyway.
+ */
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+ write_seqcount_end(&ctx->refile_seq);
+
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (no_wait) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_unlock(&ctx->fd_wqh.lock);
+ schedule();
+ spin_lock(&ctx->fd_wqh.lock);
+ }
+ __remove_wait_queue(&ctx->fd_wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock(&ctx->fd_wqh.lock);
+
+ return ret;
+}
+
+static ssize_t userfaultfd_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ ssize_t _ret, ret = 0;
+ struct uffd_msg msg;
+ int no_wait = file->f_flags & O_NONBLOCK;
+
+ if (ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ for (;;) {
+ if (count < sizeof(msg))
+ return ret ? ret : -EINVAL;
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+ if (_ret < 0)
+ return ret ? ret : _ret;
+ if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+ return ret ? ret : -EFAULT;
+ ret += sizeof(msg);
+ buf += sizeof(msg);
+ count -= sizeof(msg);
+ /*
+ * Allow to read more than one fault at time but only
+ * block if waiting for the very first one.
+ */
+ no_wait = O_NONBLOCK;
+ }
+}
+
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned long start, end;
+
+ start = range->start;
+ end = range->start + range->len;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /* wake all in the range and autoremove */
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
+ range);
+ if (waitqueue_active(&ctx->fault_wqh))
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+}
+
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned seq;
+ bool need_wakeup;
+
+ /*
+ * To be sure waitqueue_active() is not reordered by the CPU
+ * before the pagetable update, use an explicit SMP memory
+ * barrier here. PT lock release or up_read(mmap_sem) still
+ * have release semantics that can allow the
+ * waitqueue_active() to be reordered before the pte update.
+ */
+ smp_mb();
+
+ /*
+ * Use waitqueue_active because it's very frequent to
+ * change the address space atomically even if there are no
+ * userfaults yet. So we take the spinlock only when we're
+ * sure we've userfaults to wake.
+ */
+ do {
+ seq = read_seqcount_begin(&ctx->refile_seq);
+ need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh);
+ cond_resched();
+ } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ if (need_wakeup)
+ __wake_userfault(ctx, range);
+}
+
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ __u64 task_size = mm->task_size;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (len & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return -EINVAL;
+ if (start < mmap_min_addr)
+ return -EINVAL;
+ if (start >= task_size)
+ return -EINVAL;
+ if (len > task_size - start)
+ return -EINVAL;
+ return 0;
+}
+
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_register uffdio_register;
+ struct uffdio_register __user *user_uffdio_register;
+ unsigned long vm_flags, new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+
+ user_uffdio_register = (struct uffdio_register __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_register, user_uffdio_register,
+ sizeof(uffdio_register)-sizeof(__u64)))
+ goto out;
+
+ ret = -EINVAL;
+ if (!uffdio_register.mode)
+ goto out;
+ if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
+ UFFDIO_REGISTER_MODE_WP))
+ goto out;
+ vm_flags = 0;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+ vm_flags |= VM_UFFD_MISSING;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ vm_flags |= VM_UFFD_WP;
+ /*
+ * FIXME: remove the below error constraint by
+ * implementing the wprotect tracking mode.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_register.range.start;
+ end = start + uffdio_register.range.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /* check not compatible vmas */
+ ret = -EINVAL;
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Check that this vma isn't already owned by a
+ * different userfaultfd. We can't allow more than one
+ * userfaultfd to own a single vma simultaneously or we
+ * wouldn't know which one to deliver the userfaults to.
+ */
+ ret = -EBUSY;
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ ((struct vm_userfaultfd_ctx){ ctx }));
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+ if (!ret) {
+ /*
+ * Now that we scanned all vmas we can already tell
+ * userland which ioctls methods are guaranteed to
+ * succeed on this range.
+ */
+ if (put_user(UFFD_API_RANGE_IOCTLS,
+ &user_uffdio_register->ioctls))
+ ret = -EFAULT;
+ }
+out:
+ return ret;
+}
+
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_range uffdio_unregister;
+ unsigned long new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_unregister.start;
+ end = start + uffdio_unregister.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ ret = -EINVAL;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /*
+ * Check not compatible vmas, not strictly required
+ * here as not compatible vmas cannot have an
+ * userfaultfd_ctx registered on them, but this
+ * provides for more strict behavior to notice
+ * unregistration errors.
+ */
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (!vma->vm_userfaultfd_ctx.ctx)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+out:
+ return ret;
+}
+
+/*
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_wake;
+ struct userfaultfd_wake_range range;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ if (ret)
+ goto out;
+
+ range.start = uffdio_wake.start;
+ range.len = uffdio_wake.len;
+
+ /*
+ * len == 0 means wake all and we don't want to wake all here,
+ * so check it again to be sure.
+ */
+ VM_BUG_ON(!range.len);
+
+ wake_userfault(ctx, &range);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_copy __user *user_uffdio_copy;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_copy = (struct uffdio_copy __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+ /* don't copy "copy" last field */
+ sizeof(uffdio_copy)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ if (ret)
+ goto out;
+ /*
+ * double check for wraparound just in case. copy_from_user()
+ * will later check uffdio_copy.src + uffdio_copy.len to fit
+ * in the userland range.
+ */
+ ret = -EINVAL;
+ if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
+ goto out;
+ if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+ goto out;
+
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len);
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ BUG_ON(!ret);
+ /* len == 0 would wake all */
+ range.len = ret;
+ if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+ range.start = uffdio_copy.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_zeropage uffdio_zeropage;
+ struct uffdio_zeropage __user *user_uffdio_zeropage;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+ /* don't copy "zeropage" last field */
+ sizeof(uffdio_zeropage)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (ret)
+ goto out;
+ ret = -EINVAL;
+ if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+ goto out;
+
+ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+ range.start = uffdio_zeropage.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+/*
+ * userland asks for a certain API version and we return which bits
+ * and ioctl commands are implemented in this kernel for such API
+ * version or -EINVAL if unknown.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_api uffdio_api;
+ void __user *buf = (void __user *)arg;
+ int ret;
+
+ ret = -EINVAL;
+ if (ctx->state != UFFD_STATE_WAIT_API)
+ goto out;
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
+ goto out;
+ if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ret = -EINVAL;
+ goto out;
+ }
+ uffdio_api.features = UFFD_API_FEATURES;
+ uffdio_api.ioctls = UFFD_API_IOCTLS;
+ ret = -EFAULT;
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ctx->state = UFFD_STATE_RUNNING;
+ ret = 0;
+out:
+ return ret;
+}
+
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+ struct userfaultfd_ctx *ctx = file->private_data;
+
+ if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ switch(cmd) {
+ case UFFDIO_API:
+ ret = userfaultfd_api(ctx, arg);
+ break;
+ case UFFDIO_REGISTER:
+ ret = userfaultfd_register(ctx, arg);
+ break;
+ case UFFDIO_UNREGISTER:
+ ret = userfaultfd_unregister(ctx, arg);
+ break;
+ case UFFDIO_WAKE:
+ ret = userfaultfd_wake(ctx, arg);
+ break;
+ case UFFDIO_COPY:
+ ret = userfaultfd_copy(ctx, arg);
+ break;
+ case UFFDIO_ZEROPAGE:
+ ret = userfaultfd_zeropage(ctx, arg);
+ break;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct userfaultfd_ctx *ctx = f->private_data;
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long pending = 0, total = 0;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ pending++;
+ total++;
+ }
+ list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ total++;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ /*
+ * If more protocols will be added, there will be all shown
+ * separated by a space. Like this:
+ * protocols: aa:... bb:...
+ */
+ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+ pending, total, UFFD_API, UFFD_API_FEATURES,
+ UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = userfaultfd_show_fdinfo,
+#endif
+ .release = userfaultfd_release,
+ .poll = userfaultfd_poll,
+ .read = userfaultfd_read,
+ .unlocked_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = userfaultfd_ioctl,
+ .llseek = noop_llseek,
+};
+
+static void init_once_userfaultfd_ctx(void *mem)
+{
+ struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
+
+ init_waitqueue_head(&ctx->fault_pending_wqh);
+ init_waitqueue_head(&ctx->fault_wqh);
+ init_waitqueue_head(&ctx->fd_wqh);
+ seqcount_init(&ctx->refile_seq);
+}
+
+/**
+ * userfaultfd_file_create - Creates an userfaultfd file pointer.
+ * @flags: Flags for the userfaultfd file.
+ *
+ * This function creates an userfaultfd file pointer, w/out installing
+ * it into the fd table. This is useful when the userfaultfd file is
+ * used during the initialization of data structures that require
+ * extra setup after the userfaultfd creation. So the userfaultfd
+ * creation is split into the file pointer creation phase, and the
+ * file descriptor installation phase. In this way races with
+ * userspace closing the newly installed file descriptor can be
+ * avoided. Returns an userfaultfd file pointer, or a proper error
+ * pointer.
+ */
+static struct file *userfaultfd_file_create(int flags)
+{
+ struct file *file;
+ struct userfaultfd_ctx *ctx;
+
+ BUG_ON(!current->mm);
+
+ /* Check the UFFD_* constants for consistency. */
+ BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
+
+ file = ERR_PTR(-EINVAL);
+ if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+ goto out;
+
+ file = ERR_PTR(-ENOMEM);
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto out;
+
+ atomic_set(&ctx->refcount, 1);
+ ctx->flags = flags;
+ ctx->state = UFFD_STATE_WAIT_API;
+ ctx->released = false;
+ ctx->mm = current->mm;
+ /* prevent the mm struct to be freed */
+ atomic_inc(&ctx->mm->mm_users);
+
+ file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ if (IS_ERR(file)) {
+ mmput(ctx->mm);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
+out:
+ return file;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ int fd, error;
+ struct file *file;
+
+ error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+ if (error < 0)
+ return error;
+ fd = error;
+
+ file = userfaultfd_file_create(flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_put_unused_fd;
+ }
+ fd_install(fd, file);
+
+ return fd;
+
+err_put_unused_fd:
+ put_unused_fd(fd);
+
+ return error;
+}
+
+static int __init userfaultfd_init(void)
+{
+ userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+ sizeof(struct userfaultfd_ctx),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ init_once_userfaultfd_ctx);
+ return 0;
+}
+__initcall(userfaultfd_init);
diff --git a/fs/xattr.c b/fs/xattr.c
index 072fee1..d5dd6c8 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -208,25 +208,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
return error;
}
-/* Compare an extended attribute value with the given value */
-int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
- const char *value, size_t size, gfp_t flags)
-{
- char *xattr_value = NULL;
- int rc;
-
- rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
- if (rc < 0)
- return rc;
-
- if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
- rc = -EINVAL;
- else
- rc = 0;
- kfree(xattr_value);
- return rc;
-}
-
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
@@ -324,7 +305,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
{
int error;
void *kvalue = NULL;
- void *vvalue = NULL; /* If non-NULL, we used vmalloc() */
char kname[XATTR_NAME_MAX + 1];
if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
@@ -341,10 +321,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
return -E2BIG;
kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
@@ -357,10 +336,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
error = vfs_setxattr(d, kname, kvalue, size, flags);
out:
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+ kvfree(kvalue);
+
return error;
}
@@ -428,7 +405,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
{
ssize_t error;
void *kvalue = NULL;
- void *vvalue = NULL;
char kname[XATTR_NAME_MAX + 1];
error = strncpy_from_user(kname, name, sizeof(kname));
@@ -442,10 +418,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
size = XATTR_SIZE_MAX;
kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
}
@@ -461,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+
+ kvfree(kvalue);
+
return error;
}
@@ -521,17 +495,15 @@ listxattr(struct dentry *d, char __user *list, size_t size)
{
ssize_t error;
char *klist = NULL;
- char *vlist = NULL; /* If non-NULL, we used vmalloc() */
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
if (!klist) {
- vlist = vmalloc(size);
- if (!vlist)
+ klist = vmalloc(size);
+ if (!klist)
return -ENOMEM;
- klist = vlist;
}
}
@@ -544,10 +516,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vlist)
- vfree(vlist);
- else
- kfree(klist);
+
+ kvfree(klist);
+
return error;
}
@@ -700,13 +671,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
return NULL;
for_each_xattr_handler(handlers, handler) {
- const char *n = strcmp_prefix(*name, handler->prefix);
+ const char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
*name = n;
- break;
+ return handler;
}
}
- return handler;
+ return ERR_PTR(-EOPNOTSUPP);
}
/*
@@ -718,9 +696,9 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -735,19 +713,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!buffer) {
for_each_xattr_handler(handlers, handler) {
- size += handler->list(dentry, NULL, 0, NULL, 0,
- handler->flags);
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ size += strlen(handler->name) + 1;
}
} else {
char *buf = buffer;
+ size_t len;
for_each_xattr_handler(handlers, handler) {
- size = handler->list(dentry, buf, buffer_size,
- NULL, 0, handler->flags);
- if (size > buffer_size)
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ len = strlen(handler->name);
+ if (len + 1 > buffer_size)
return -ERANGE;
- buf += size;
- buffer_size -= size;
+ memcpy(buf, handler->name, len + 1);
+ buf += len + 1;
+ buffer_size -= len + 1;
}
size = buf - buffer;
}
@@ -765,9 +749,9 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
if (size == 0)
value = ""; /* empty EA, do not remove */
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -780,10 +764,9 @@ generic_removexattr(struct dentry *dentry, const char *name)
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0,
- XATTR_REPLACE, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(generic_getxattr);
@@ -791,6 +774,30 @@ EXPORT_SYMBOL(generic_listxattr);
EXPORT_SYMBOL(generic_setxattr);
EXPORT_SYMBOL(generic_removexattr);
+/**
+ * xattr_full_name - Compute full attribute name from suffix
+ *
+ * @handler: handler of the xattr_handler operation
+ * @name: name passed to the xattr_handler operation
+ *
+ * The get and set xattr handler operations are called with the remainder of
+ * the attribute name after skipping the handler's prefix: for example, "foo"
+ * is passed to the get operation of a handler with prefix "user." to get
+ * attribute "user.foo". The full name is still "there" in the name though.
+ *
+ * Note: the list xattr handler operation when called from the vfs is passed a
+ * NULL name; some file systems use this operation internally, with varying
+ * semantics.
+ */
+const char *xattr_full_name(const struct xattr_handler *handler,
+ const char *name)
+{
+ size_t prefix_len = strlen(xattr_prefix(handler));
+
+ return name - prefix_len;
+}
+EXPORT_SYMBOL(xattr_full_name);
+
/*
* Allocate new xattr and copy in the value; but leave the name to callers.
*/
@@ -840,8 +847,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
return ret;
}
-static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
+/**
+ * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
+ * @xattrs: target simple_xattr list
+ * @name: name of the extended attribute
+ * @value: value of the xattr. If %NULL, will remove the attribute.
+ * @size: size of the new xattr
+ * @flags: %XATTR_{CREATE|REPLACE}
+ *
+ * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
+ * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
+ * otherwise, fails with -ENODATA.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+ const void *value, size_t size, int flags)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
@@ -891,73 +912,64 @@ out:
}
-/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the new extended attribute
- * @value: value of the new xattr. If %NULL, will remove the attribute
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- *
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
- *
- * Returns 0 on success, -errno on failure.
- */
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
-{
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- return __simple_xattr_set(xattrs, name, value, size, flags);
-}
-
-/*
- * xattr REMOVE operation for in-memory/pseudo filesystems
- */
-int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
+static bool xattr_is_trusted(const char *name)
{
- return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
+ return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static bool xattr_is_trusted(const char *name)
+static int xattr_list_one(char **buffer, ssize_t *remaining_size,
+ const char *name)
{
- return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ size_t len = strlen(name) + 1;
+ if (*buffer) {
+ if (*remaining_size < len)
+ return -ERANGE;
+ memcpy(*buffer, name, len);
+ *buffer += len;
+ }
+ *remaining_size -= len;
+ return 0;
}
/*
* xattr LIST operation for in-memory/pseudo filesystems
*/
-ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
- size_t size)
+ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
+ char *buffer, size_t size)
{
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
- size_t used = 0;
+ ssize_t remaining_size = size;
+ int err;
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (inode->i_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err)
+ return err;
+ }
+ if (inode->i_default_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err)
+ return err;
+ }
+#endif
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
- size_t len;
-
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
- len = strlen(xattr->name) + 1;
- used += len;
- if (buffer) {
- if (size < used) {
- used = -ERANGE;
- break;
- }
- memcpy(buffer, xattr->name, len);
- buffer += len;
- }
+ err = xattr_list_one(&buffer, &remaining_size, xattr->name);
+ if (err)
+ return err;
}
spin_unlock(&xattrs->lock);
- return used;
+ return size - remaining_size;
}
/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index df68285..f646391 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_attr.o \
xfs_attr_leaf.o \
xfs_attr_remote.o \
+ xfs_bit.o \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
@@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs-y += xfs_aops.o \
xfs_attr_inactive.o \
xfs_attr_list.o \
- xfs_bit.o \
xfs_bmap_util.o \
xfs_buf.o \
xfs_dir2_readdir.o \
@@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_stats.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
@@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
-xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63..686ba6f 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
+ current->comm, current->pid,
+ (unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
+ __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768..d1c66e4 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index f9e9ffe..a708e38 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -464,7 +464,7 @@ xfs_agfl_verify(
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
int i;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
return false;
@@ -482,7 +482,9 @@ xfs_agfl_verify(
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return false;
}
- return true;
+
+ return xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
}
static void
@@ -533,6 +535,7 @@ xfs_agfl_write_verify(
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .name = "xfs_agfl",
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
};
@@ -651,8 +654,8 @@ xfs_alloc_ag_vextent(
-((long)(args->len)));
}
- XFS_STATS_INC(xs_allocx);
- XFS_STATS_ADD(xs_allocb, args->len);
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
return error;
}
@@ -1808,8 +1811,8 @@ xfs_free_ag_extent(
if (!isfl)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
- XFS_STATS_INC(xs_freex);
- XFS_STATS_ADD(xs_freeb, len);
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
@@ -1924,7 +1927,7 @@ xfs_alloc_space_available(
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_fix_freelist(
struct xfs_alloc_arg *args, /* allocation argument structure */
int flags) /* XFS_ALLOC_FLAG_... */
@@ -1937,7 +1940,7 @@ xfs_alloc_fix_freelist(
struct xfs_alloc_arg targs; /* local allocation arguments */
xfs_agblock_t bno; /* freelist block */
xfs_extlen_t need; /* total blocks needed in freelist */
- int error;
+ int error = 0;
if (!pag->pagf_init) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
@@ -2259,9 +2262,13 @@ xfs_agf_verify(
{
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
return false;
+ }
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
@@ -2333,6 +2340,7 @@ xfs_agf_write_verify(
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .name = "xfs_agf",
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
};
@@ -2503,7 +2511,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2634,6 +2642,14 @@ xfs_alloc_vextent(
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
args->len);
#endif
+
+ /* Zero the extent if we were asked to do so */
+ if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+ if (error)
+ goto error0;
+ }
+
}
xfs_perag_put(args->pag);
return 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ca1c816..135eb3d 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
+ struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* set if this is user data */
+ char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
* Defines for userdata
*/
-#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
@@ -233,5 +235,6 @@ xfs_alloc_get_rec(
int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 59d521c..444626d 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -293,14 +293,7 @@ xfs_allocbt_verify(
level = be16_to_cpu(block->bb_level);
switch (block->bb_magic) {
case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_ABTB_MAGIC):
@@ -311,14 +304,7 @@ xfs_allocbt_verify(
return false;
break;
case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_ABTC_MAGIC):
@@ -332,21 +318,7 @@ xfs_allocbt_verify(
return false;
}
- /* numrecs verification */
- if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
- return false;
-
- /* sibling pointer verification */
- if (!block->bb_u.s.bb_leftsib ||
- (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- if (!block->bb_u.s.bb_rightsib ||
- (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
-
- return true;
+ return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
static void
@@ -379,6 +351,7 @@ xfs_allocbt_write_verify(
}
const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .name = "xfs_allocbt",
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 3349c9a..fa3b948 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -125,7 +125,7 @@ xfs_attr_get(
uint lock_mode;
int error;
- XFS_STATS_INC(xs_attr_get);
+ XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
@@ -139,6 +139,8 @@ xfs_attr_get(
args.value = value;
args.valuelen = *valuelenp;
+ /* Entirely possible to look up a name which doesn't exist */
+ args.op_flags = XFS_DA_OP_OKNOENT;
lock_mode = xfs_ilock_attr_map_shared(ip);
if (!xfs_inode_hasattr(ip))
@@ -205,9 +207,9 @@ xfs_attr_set(
struct xfs_trans_res tres;
xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
- int error, err2, committed, local;
+ int error, err2, local;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(mp, xs_attr_set);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -332,25 +334,15 @@ xfs_attr_set(
*/
xfs_bmap_init(args.flist, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
- if (!error) {
- error = xfs_bmap_finish(&args.trans, args.flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args.trans, args.flist, dp);
if (error) {
- ASSERT(committed);
args.trans = NULL;
xfs_bmap_cancel(&flist);
goto out;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args.trans, dp, 0);
-
- /*
* Commit the leaf transformation. We'll need another (linked)
* transaction to add the new attribute to the leaf.
*/
@@ -410,7 +402,7 @@ xfs_attr_remove(
xfs_fsblock_t firstblock;
int error;
- XFS_STATS_INC(xs_attr_remove);
+ XFS_STATS_INC(mp, xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -566,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
{
xfs_inode_t *dp;
struct xfs_buf *bp;
- int retval, error, committed, forkoff;
+ int retval, error, forkoff;
trace_xfs_attr_leaf_addname(args);
@@ -626,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- /*
* Commit the current trans (including the inode) and start
* a new one.
*/
@@ -727,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -773,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
{
xfs_inode_t *dp;
struct xfs_buf *bp;
- int error, committed, forkoff;
+ int error, forkoff;
trace_xfs_attr_leaf_removename(args);
@@ -801,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
return 0;
}
@@ -875,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args)
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
xfs_mount_t *mp;
- int committed, retval, error;
+ int retval, error;
trace_xfs_attr_node_addname(args);
@@ -936,27 +897,16 @@ restart:
state = NULL;
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
/*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- /*
* Commit the node conversion and start the next
* trans in the chain.
*/
@@ -975,23 +925,13 @@ restart:
*/
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_split(state);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1084,25 +1024,14 @@ restart:
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_join(state);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -1144,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
struct xfs_buf *bp;
- int retval, error, committed, forkoff;
+ int retval, error, forkoff;
trace_xfs_attr_node_removename(args);
@@ -1218,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_join(state);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
/*
* Commit the Btree join operation and start a new trans.
*/
@@ -1263,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
} else
xfs_trans_brelse(args->trans, bp);
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index e9d401c..01a5ecf 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
#include "xfs_dir2.h"
+#include "xfs_log.h"
/*
@@ -262,10 +263,12 @@ xfs_attr3_leaf_verify(
if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
return false;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
@@ -325,6 +328,7 @@ xfs_attr3_leaf_read_verify(
}
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .name = "xfs_attr3_leaf",
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
};
@@ -1056,7 +1060,7 @@ xfs_attr3_leaf_create(
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
} else {
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index dd71403..a572532 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -100,14 +100,14 @@ xfs_attr3_rmt_verify(
return false;
if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
return false;
- if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(rmt->rm_blkno) != bno)
return false;
if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
- be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
return false;
if (rmt->rm_owner == 0)
return false;
@@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify(
}
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .name = "xfs_attr3_rmt",
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
};
@@ -222,7 +223,7 @@ xfs_attr3_rmt_hdr_set(
rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
rmt->rm_offset = cpu_to_be32(offset);
rmt->rm_bytes = cpu_to_be32(size);
- uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid);
rmt->rm_owner = cpu_to_be64(ino);
rmt->rm_blkno = cpu_to_be64(bno);
@@ -447,8 +448,6 @@ xfs_attr_rmtval_set(
* Roll through the "value", allocating blocks on disk as required.
*/
while (blkcnt > 0) {
- int committed;
-
/*
* Allocate a single extent, up to the size of the value.
*
@@ -466,24 +465,14 @@ xfs_attr_rmtval_set(
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
args->total, &map, &nmap, args->flist);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -614,32 +603,20 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
done = 0;
while (!done) {
- int committed;
-
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- 1, args->firstblock, args->flist,
- &done);
- if (!error) {
+ XFS_BMAPI_ATTRFORK, 1, args->firstblock,
+ args->flist, &done);
+ if (!error)
error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ args->dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, args->dp, 0);
-
- /*
* Close out trans and start the next one in the chain.
*/
error = xfs_trans_roll(&args->trans, args->dp);
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 0e8885a..0a94cce 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -32,13 +32,13 @@ int
xfs_bitmap_empty(uint *map, uint size)
{
uint i;
- uint ret = 0;
for (i = 0; i < size; i++) {
- ret |= map[i];
+ if (map[i] != 0)
+ return 0;
}
- return (ret == 0);
+ return 1;
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 63e05b6..ef00156 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -325,9 +325,11 @@ xfs_check_block(
/*
* Check that the extents for the inode ip are in the right order in all
- * btree leaves.
+ * btree leaves. THis becomes prohibitively expensive for large extent count
+ * files, so don't bother with inodes that have more than 10,000 extents in
+ * them. The btree record ordering checks will still be done, so for such large
+ * bmapbt constructs that is going to catch most corruptions.
*/
-
STATIC void
xfs_bmap_check_leaf_extents(
xfs_btree_cur_t *cur, /* btree cursor or null */
@@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents(
return;
}
+ /* skip large extent count inodes */
+ if (ip->i_d.di_nextents > 10000)
+ return;
+
bno = NULLFSBLOCK;
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -948,14 +954,16 @@ xfs_bmap_local_to_extents(
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
/*
- * Initialise the block and copy the data
+ * Initialize the block, copy the data and log the remote buffer.
*
- * Note: init_fn must set the buffer log item type correctly!
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
*/
init_fn(tp, bp, ip, ifp);
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ /* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
@@ -1109,7 +1117,6 @@ xfs_bmap_add_attrfork(
xfs_trans_t *tp; /* transaction pointer */
int blks; /* space reservation */
int version = 1; /* superblock attr version */
- int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1212,7 +1219,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_bmap_finish(&tp, &flist, &committed);
+ error = xfs_bmap_finish(&tp, &flist, NULL);
if (error)
goto bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1435,7 +1442,7 @@ xfs_bmap_search_extents(
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- XFS_STATS_INC(xs_look_exlist);
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
ifp = XFS_IFORK_PTR(ip, fork);
ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
@@ -1721,10 +1728,11 @@ xfs_bmap_add_extent_delay_real(
xfs_filblks_t temp=0; /* value for da_new calculations */
xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
+ int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp;
- mp = bma->tp ? bma->tp->t_mountp : NULL;
- ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+ mp = bma->ip->i_mount;
+ ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1732,7 +1740,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -1783,7 +1791,7 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
@@ -2014,10 +2022,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
- &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+ &bma->cur, 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2098,10 +2106,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
- &tmp_rval, XFS_DATA_FORK);
+ &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2167,10 +2175,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
- 1, &tmp_rval, XFS_DATA_FORK);
+ 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2213,13 +2221,13 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
- da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+ da_old > 0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
goto done;
@@ -2240,7 +2248,7 @@ xfs_bmap_add_extent_delay_real(
if (bma->cur)
bma->cur->bc_private.b.allocated = 0;
- xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
bma->logflags |= rval;
return error;
@@ -2286,7 +2294,7 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
ASSERT(!isnullstartblock(new->br_startblock));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2937,7 +2945,7 @@ xfs_bmap_add_extent_hole_real(
int state; /* state bits, accessed thru macros */
struct xfs_mount *mp;
- mp = bma->tp ? bma->tp->t_mountp : NULL;
+ mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
@@ -2946,7 +2954,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -3800,8 +3808,13 @@ xfs_bmap_btalloc(
args.wasdel = ap->wasdel;
args.isfl = 0;
args.userdata = ap->userdata;
- if ((error = xfs_alloc_vextent(&args)))
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.ip = ap->ip;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
return error;
+
if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
* Exact allocation failed. Now try with alignment
@@ -4036,7 +4049,7 @@ xfs_bmapi_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapr);
+ XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4221,7 +4234,7 @@ xfs_bmapi_delay(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
@@ -4300,11 +4313,14 @@ xfs_bmapi_allocate(
/*
* Indicate if this is the first user data in the file, or just any
- * user data.
+ * user data. And if it is userdata, indicate whether it needs to
+ * be initialised to zero during allocation.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ if (bma->flags & XFS_BMAPI_ZERO)
+ bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4419,6 +4435,17 @@ xfs_bmapi_convert_unwritten(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
@@ -4512,6 +4539,18 @@ xfs_bmapi_write(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /* zeroing is for currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * we can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice, but it still being an
+ * unwritten extent....
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
@@ -4525,7 +4564,7 @@ xfs_bmapi_write(
ifp = XFS_IFORK_PTR(ip, whichfork);
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
@@ -4718,12 +4757,12 @@ xfs_bmap_del_extent(
xfs_filblks_t temp2; /* for indirect length calculations */
int state = 0;
- XFS_STATS_INC(xs_del_exlist);
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
- mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
@@ -5070,7 +5109,7 @@ xfs_bunmapi(
*done = 1;
return 0;
}
- XFS_STATS_INC(xs_blk_unmap);
+ XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
@@ -5917,7 +5956,6 @@ xfs_bmap_split_extent(
struct xfs_trans *tp;
struct xfs_bmap_free free_list;
xfs_fsblock_t firstfsb;
- int committed;
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
@@ -5938,13 +5976,14 @@ xfs_bmap_split_extent(
if (error)
goto out;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out;
return xfs_trans_commit(tp);
out:
+ xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 6aaa0c1..423a34e 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
xfs_extlen_t minleft; /* amount must be left after alloc */
bool eof; /* set if allocating past last extent */
bool wasdel; /* replacing a delayed allocation */
- bool userdata;/* set if is user data */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
+ char userdata;/* userdata mask */
int flags;
};
@@ -109,6 +109,14 @@ typedef struct xfs_bmap_free
*/
#define XFS_BMAPI_CONVERT 0x040
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO 0x080
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -116,7 +124,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }
static inline int xfs_bmapi_aflag(int w)
@@ -186,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
struct xfs_bmap_free *flist, struct xfs_mount *mp);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
- int *committed);
+ struct xfs_inode *ip);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 2c44c8e..1637c37 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
- ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid));
ASSERT(rblock->bb_u.l.bb_blkno ==
cpu_to_be64(XFS_BUF_DADDR_NULL));
} else
@@ -647,7 +648,7 @@ xfs_bmbt_verify(
case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
return false;
@@ -719,6 +720,7 @@ xfs_bmbt_write_verify(
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .name = "xfs_bmbt",
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c72283d..a0eb18c 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_alloc.h"
+#include "xfs_log.h"
/*
* Cursor allocation zone.
@@ -65,7 +66,8 @@ xfs_btree_check_lblock(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
lblock_ok = lblock_ok &&
- uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+ uuid_equal(&block->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid) &&
block->bb_u.l.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
@@ -115,7 +117,8 @@ xfs_btree_check_sblock(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
sblock_ok = sblock_ok &&
- uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+ uuid_equal(&block->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid) &&
block->bb_u.s.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
@@ -220,7 +223,7 @@ xfs_btree_check_ptr(
* long-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -241,8 +244,14 @@ bool
xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
return true;
}
@@ -252,7 +261,7 @@ xfs_btree_lblock_verify_crc(
* short-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -273,8 +282,14 @@ bool
xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
return true;
}
@@ -1000,7 +1015,7 @@ xfs_btree_init_block_int(
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
- uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.l.bb_pad = 0;
buf->bb_u.l.bb_lsn = 0;
}
@@ -1013,7 +1028,7 @@ xfs_btree_init_block_int(
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
- uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.s.bb_lsn = 0;
}
}
@@ -4065,3 +4080,61 @@ xfs_btree_change_owner(
return 0;
}
+
+/**
+ * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
+ * @pag_max_level: pointer to the per-ag max level field
+ */
+bool
+xfs_btree_sblock_v5hdr_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ return true;
+}
+
+/**
+ * xfs_btree_sblock_verify() -- verify a short-format btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: maximum records allowed in this btree node
+ */
+bool
+xfs_btree_sblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return false;
+
+ /* sibling pointer verification */
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 8f18bab..2e874be 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -84,31 +84,38 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(type, stat) \
- XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat) \
+#define __XFS_BTREE_STATS_INC(mp, type, stat) \
+ XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
- XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
+ XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ case XFS_BTNUM_BNO: \
+ __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: \
+ __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: \
+ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: \
+ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -465,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
#define XFS_BTREE_TRACE_ARGR(c, r)
#define XFS_BTREE_TRACE_CURSOR(c, t)
+bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
+bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 2385f8c..097bf77 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -39,6 +39,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
* xfs_da_btree.c
@@ -146,10 +147,12 @@ xfs_da3_node_verify(
if (ichdr.magic != XFS_DA3_NODE_MAGIC)
return false;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_DA_NODE_MAGIC)
return false;
@@ -233,6 +236,7 @@ xfs_da3_node_read_verify(
bp->b_ops->verify_read(bp);
return;
default:
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
break;
}
@@ -241,6 +245,7 @@ xfs_da3_node_read_verify(
}
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .name = "xfs_da3_node",
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
};
@@ -321,10 +326,11 @@ xfs_da3_node_create(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(bp->b_bn);
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
- uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
ichdr.magic = XFS_DA_NODE_MAGIC;
}
@@ -1822,6 +1828,7 @@ xfs_da3_path_shift(
struct xfs_da_args *args;
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_buf *bp;
xfs_dablk_t blkno = 0;
int level;
int error;
@@ -1866,20 +1873,24 @@ xfs_da3_path_shift(
*/
for (blk++, level++; level < path->active; blk++, level++) {
/*
- * Release the old block.
- * (if it's dirty, trans won't actually let go)
+ * Read the next child block into a local buffer.
*/
- if (release)
- xfs_trans_brelse(args->trans, blk->bp);
+ error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
+ args->whichfork);
+ if (error)
+ return error;
/*
- * Read the next child block.
+ * Release the old block (if it's dirty, the trans doesn't
+ * actually let go) and swap the local buffer into the path
+ * structure. This ensures failure of the above read doesn't set
+ * a NULL buffer in an active slot in the path.
*/
+ if (release)
+ xfs_trans_brelse(args->trans, blk->bp);
blk->blkno = blkno;
- error = xfs_da3_node_read(args->trans, dp, blkno, -1,
- &blk->bp, args->whichfork);
- if (error)
- return error;
+ blk->bp = bp;
+
info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2351,8 +2362,8 @@ xfs_da_shrink_inode(
* the last block to the place we want to kill.
*/
error = xfs_bunmapi(tp, dp, dead_blkno, count,
- xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist, &done);
+ xfs_bmapi_aflag(w), 0, args->firstblock,
+ args->flist, &done);
if (error == -ENOSPC) {
if (w != XFS_DATA_FORK)
break;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 74bcbab..b14bbd6 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -680,8 +680,15 @@ typedef struct xfs_attr_leaf_name_remote {
typedef struct xfs_attr_leafblock {
xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
- xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
- xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced and definining them can actually make gcc optimize away
+ * accesses to the 'entries' array above index 0 so don't do that.
+ *
+ * xfs_attr_leaf_name_local_t namelist;
+ * xfs_attr_leaf_name_remote_t valuelist;
+ */
} xfs_attr_leafblock_t;
/*
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index a69fb3a..2fb53a5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -271,7 +271,7 @@ xfs_dir_createname(
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
return rval;
- XFS_STATS_INC(xs_dir_create);
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -362,9 +362,10 @@ xfs_dir_lookup(
struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
+ int lock_mode;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_lookup);
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
/*
* We need to use KM_NOFS here so that lockdep will not throw false
@@ -387,6 +388,7 @@ xfs_dir_lookup(
if (ci_name)
args->op_flags |= XFS_DA_OP_CILOOKUP;
+ lock_mode = xfs_ilock_data_map_shared(dp);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_lookup(args);
goto out_check_rval;
@@ -419,6 +421,7 @@ out_check_rval:
}
}
out_free:
+ xfs_iunlock(dp, lock_mode);
kmem_free(args);
return rval;
}
@@ -441,7 +444,7 @@ xfs_dir_removename(
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_remove);
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
@@ -674,25 +677,22 @@ xfs_dir2_shrink_inode(
mp = dp->i_mount;
tp = args->trans;
da = xfs_dir2_db_to_da(args->geo, db);
- /*
- * Unmap the fsblock(s).
- */
- if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
- XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
- &done))) {
+
+ /* Unmap the fsblock(s). */
+ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
+ args->firstblock, args->flist, &done);
+ if (error) {
/*
- * ENOSPC actually can happen if we're in a removename with
- * no space reservation, and the resulting block removal
- * would cause a bmap btree split or conversion from extents
- * to btree. This can only happen for un-fragmented
- * directory blocks, since you need to be punching out
- * the middle of an extent.
- * In this case we need to leave the block in the file,
- * and not binval it.
- * So the block has to be in a consistent empty state
- * and appropriately logged.
- * We don't free up the buffer, the caller can tell it
- * hasn't happened since it got an error back.
+ * ENOSPC actually can happen if we're in a removename with no
+ * space reservation, and the resulting block removal would
+ * cause a bmap btree split or conversion from extents to btree.
+ * This can only happen for un-fragmented directory blocks,
+ * since you need to be punching out the middle of an extent.
+ * In this case we need to leave the block in the file, and not
+ * binval it. So the block has to be in a consistent empty
+ * state and appropriately logged. We don't free up the buffer,
+ * the caller can tell it hasn't happened since it got an error
+ * back.
*/
return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9354e19..aa17cb7 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function prototypes.
@@ -67,10 +68,12 @@ xfs_dir3_block_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
return false;
@@ -120,6 +123,7 @@ xfs_dir3_block_write_verify(
}
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .name = "xfs_dir3_block",
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
};
@@ -157,7 +161,7 @@ xfs_dir3_block_init(
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index de1ea16..725fc78 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -31,6 +31,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Check the consistency of the data block.
@@ -220,10 +221,12 @@ xfs_dir3_data_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
return false;
@@ -252,7 +255,8 @@ xfs_dir3_data_reada_verify(
return;
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
- xfs_dir3_data_verify(bp);
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ bp->b_ops->verify_read(bp);
return;
default:
xfs_buf_ioerror(bp, -EFSCORRUPTED);
@@ -301,11 +305,13 @@ xfs_dir3_data_write_verify(
}
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .name = "xfs_dir3_data",
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
};
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .name = "xfs_dir3_data_reada",
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
@@ -604,7 +610,7 @@ xfs_dir3_data_init(
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1061199..b887fb2 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function declarations.
@@ -160,10 +161,12 @@ xfs_dir3_leaf_verify(
if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
return false;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+ return false;
} else {
if (leaf->hdr.info.magic != cpu_to_be16(magic))
return false;
@@ -242,11 +245,13 @@ xfs_dir3_leafn_write_verify(
}
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .name = "xfs_dir3_leaf1",
.verify_read = xfs_dir3_leaf1_read_verify,
.verify_write = xfs_dir3_leaf1_write_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .name = "xfs_dir3_leafn",
.verify_read = xfs_dir3_leafn_read_verify,
.verify_write = xfs_dir3_leafn_write_verify,
};
@@ -310,7 +315,7 @@ xfs_dir3_leaf_init(
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
leaf3->info.blkno = cpu_to_be64(bp->b_bn);
leaf3->info.owner = cpu_to_be64(owner);
- uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
memset(leaf, 0, sizeof(*leaf));
leaf->hdr.info.magic = cpu_to_be16(type);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 41b80d3..63ee03d 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Function declarations.
@@ -93,10 +94,12 @@ xfs_dir3_free_verify(
if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
return false;
@@ -147,6 +150,7 @@ xfs_dir3_free_write_verify(
}
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .name = "xfs_dir3_free",
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
};
@@ -226,7 +230,7 @@ xfs_dir3_free_get_buf(
hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
@@ -1845,8 +1849,7 @@ xfs_dir2_node_addname_int(
if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
xfs_alert(mp,
- "%s: dir ino %llu needed freesp block %lld for\n"
- " data block %lld, got %lld ifbno %llu lastfbno %d",
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
__func__, (unsigned long long)dp->i_ino,
(long long)dp->d_ops->db_to_fdb(
args->geo, dbno),
@@ -2132,6 +2135,7 @@ xfs_dir2_node_replace(
int error; /* error return value */
int i; /* btree level */
xfs_ino_t inum; /* new inode number */
+ int ftype; /* new file type */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
int rval; /* internal return value */
@@ -2145,7 +2149,14 @@ xfs_dir2_node_replace(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
+
+ /*
+ * We have to save new inode number and ftype since
+ * xfs_da3_node_lookup_int() is going to overwrite them
+ */
inum = args->inumber;
+ ftype = args->filetype;
+
/*
* Lookup the entry to change in the btree.
*/
@@ -2183,7 +2194,7 @@ xfs_dir2_node_replace(
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
- args->dp->d_ops->data_put_ftype(dep, args->filetype);
+ args->dp->d_ops->data_put_ftype(dep, ftype);
xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 6fbf2d8..3cc3cf7 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -54,7 +54,7 @@ xfs_dqcheck(
xfs_dqid_t id,
uint type, /* used only when IO_dorepair is true */
uint flags,
- char *str)
+ const char *str)
{
xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
int errs = 0;
@@ -163,7 +163,7 @@ xfs_dqcheck(
d->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc(
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF))
return false;
- if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
return false;
}
return true;
@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
STATIC bool
xfs_dquot_buf_verify(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ int warn)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
xfs_dqid_t id = 0;
@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_buf_verify");
+ error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
if (error)
return false;
}
@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
if (!xfs_dquot_buf_verify_crc(mp, bp))
xfs_buf_ioerror(bp, -EFSBADCRC);
- else if (!xfs_dquot_buf_verify(mp, bp))
+ else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
}
/*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp) ||
+ !xfs_dquot_buf_verify(mp, bp, 0)) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ }
+}
+
+/*
* we don't calculate the CRC here as that is done when the dquot is flushed to
* the buffer after the update is done. This ensures that the dquot in the
* buffer always has an up-to-date CRC value.
@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if (!xfs_dquot_buf_verify(mp, bp)) {
+ if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
@@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify(
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .name = "xfs_dquot",
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+ .name = "xfs_dquot_ra",
+ .verify_read = xfs_dquot_buf_readahead_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index a0ae572..e2536bb 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -60,6 +60,14 @@ struct xfs_ifork;
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
* Supported feature bit list is just all bits in the versionnum field because
* we've used them all up and understand them all. Except, of course, for the
* shared superblock bit, which nobody knows what it does and so is unsupported.
@@ -100,7 +108,7 @@ typedef struct xfs_sb {
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
xfs_rtblock_t sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
+ uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
@@ -174,6 +182,7 @@ typedef struct xfs_sb {
xfs_ino_t sb_pquotino; /* project quota inode */
xfs_lsn_t sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -190,7 +199,7 @@ typedef struct xfs_dsb {
__be64 sb_dblocks; /* number of data blocks */
__be64 sb_rblocks; /* number of realtime blocks */
__be64 sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
+ uuid_t sb_uuid; /* user-visible file system unique id */
__be64 sb_logstart; /* starting block of log if internal */
__be64 sb_rootino; /* root inode number */
__be64 sb_rbmino; /* bitmap inode for realtime extents */
@@ -260,6 +269,7 @@ typedef struct xfs_dsb {
__be64 sb_pquotino; /* project quota inode */
__be64 sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
@@ -458,9 +468,11 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
- XFS_SB_FEAT_INCOMPAT_SPINODES)
+ XFS_SB_FEAT_INCOMPAT_SPINODES| \
+ XFS_SB_FEAT_INCOMPAT_META_UUID)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -515,6 +527,18 @@ static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
}
/*
+ * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID
+ * is stored separately from the user-visible UUID; this allows the
+ * user-visible UUID to be changed on V5 filesystems which have a
+ * filesystem UUID stamped into every piece of metadata.
+ */
+static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
+}
+
+/*
* end of superblock version macros
*/
@@ -762,7 +786,7 @@ typedef struct xfs_agfl {
__be64 agfl_lsn;
__be32 agfl_crc;
__be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+} __attribute__((packed)) xfs_agfl_t;
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
@@ -1467,13 +1491,17 @@ struct xfs_acl {
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
(xfs_sb_version_hascrc(&mp->m_sb) \
- ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
-#define XFS_ACL_MAX_SIZE(mp) \
+#define XFS_ACL_SIZE(cnt) \
(sizeof(struct xfs_acl) + \
- sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE "SGI_ACL_FILE"
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 89689c6..b2b73a9 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -490,6 +490,16 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 66efc70..66d702e 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
/*
@@ -338,7 +339,8 @@ xfs_ialloc_inode_init(
if (version == 3) {
free->di_ino = cpu_to_be64(ino);
ino++;
- uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&free->di_uuid,
+ &mp->m_sb.sb_meta_uuid);
xfs_dinode_calc_crc(mp, free);
} else if (tp) {
/* just log the inode core */
@@ -2232,7 +2234,7 @@ xfs_imap_lookup(
}
xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
if (error)
return error;
@@ -2499,9 +2501,14 @@ xfs_agi_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
+ return false;
+ }
+
/*
* Validate the magic number of the agi block.
*/
@@ -2565,6 +2572,7 @@ xfs_agi_write_verify(
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .name = "xfs_agi",
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 674ad8f..c679f3c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -221,7 +221,6 @@ xfs_inobt_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_perag *pag = bp->b_pag;
unsigned int level;
/*
@@ -237,14 +236,7 @@ xfs_inobt_verify(
switch (block->bb_magic) {
case cpu_to_be32(XFS_IBT_CRC_MAGIC):
case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_IBT_MAGIC):
@@ -254,24 +246,12 @@ xfs_inobt_verify(
return 0;
}
- /* numrecs and level verification */
+ /* level verification */
level = be16_to_cpu(block->bb_level);
if (level >= mp->m_in_maxlevels)
return false;
- if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
- return false;
-
- /* sibling pointer verification */
- if (!block->bb_u.s.bb_leftsib ||
- (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- if (!block->bb_u.s.bb_rightsib ||
- (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- return true;
+ return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
}
static void
@@ -304,6 +284,7 @@ xfs_inobt_write_verify(
}
const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .name = "xfs_inobt",
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 6526e76..1aabfda 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -62,11 +62,14 @@ xfs_inobp_check(
* has not had the inode cores stamped into it. Hence for readahead, the buffer
* may be potentially invalid.
*
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get an unnecessary
- * warnings during log recovery and we don't get unnecssary panics on debug
- * kernels.
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise to avoid warnings during log
+ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behavour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
*/
static void
xfs_inode_buf_verify(
@@ -93,6 +96,7 @@ xfs_inode_buf_verify(
XFS_RANDOM_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
return;
}
@@ -132,11 +136,13 @@ xfs_inode_buf_write_verify(
}
const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .name = "xfs_inode",
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .name = "xxfs_inode_ra",
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
@@ -304,7 +310,7 @@ xfs_dinode_verify(
return false;
if (be64_to_cpu(dip->di_ino) != ip->i_ino)
return false;
- if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
return true;
}
@@ -366,7 +372,7 @@ xfs_iread(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
ip->i_d.di_version = 3;
ip->i_d.di_ino = ip->i_ino;
- uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
} else
ip->i_d.di_version = 2;
return 0;
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccb..8e385f9 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -60,6 +60,7 @@ typedef struct xlog_recover {
*/
#define XLOG_BC_TABLE_SIZE 64
+#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1b0a083..f51078f 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
- xfs_dqid_t id, uint type, uint flags, char *str);
+ xfs_dqid_t id, uint type, uint flags, const char *str);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index df9851c..8a53eaa 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -131,10 +132,11 @@ xfs_mount_validate_sb(
if (xfs_sb_has_compat_feature(sbp,
XFS_SB_FEAT_COMPAT_UNKNOWN)) {
xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
+"Superblock has unknown compatible features (0x%x) enabled.",
(sbp->sb_features_compat &
XFS_SB_FEAT_COMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Using a more recent kernel is recommended.");
}
if (xfs_sb_has_ro_compat_feature(sbp,
@@ -145,20 +147,32 @@ xfs_mount_validate_sb(
XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
+"Attempted to mount read-only compatible filesystem read-write.");
+ xfs_warn(mp,
"Filesystem can only be safely mounted read only.");
+
return -EINVAL;
}
}
if (xfs_sb_has_incompat_feature(sbp,
XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
+"Superblock has unknown incompatible features (0x%x) enabled.",
(sbp->sb_features_incompat &
XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Filesystem can not be safely mounted by this kernel.");
return -EINVAL;
}
+ } else if (xfs_sb_version_hascrc(sbp)) {
+ /*
+ * We can't read verify the sb LSN because the read verifier is
+ * called before the log is allocated and processed. We know the
+ * log is set up before write verifier (!check_version) calls,
+ * so just check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
}
if (xfs_sb_version_has_pquotino(sbp)) {
@@ -182,9 +196,6 @@ xfs_mount_validate_sb(
if (xfs_sb_version_hassparseinodes(sbp)) {
uint32_t align;
- xfs_alert(mp,
- "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
>> sbp->sb_blocklog;
if (sbp->sb_inoalignmt != align) {
@@ -398,6 +409,14 @@ __xfs_sb_from_disk(
to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /*
+ * sb_meta_uuid is only on disk if it differs from sb_uuid and the
+ * feature flag is set; if not set we keep it only in memory.
+ */
+ if (xfs_sb_version_hasmetauuid(to))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+ else
+ uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
@@ -539,6 +558,8 @@ xfs_sb_to_disk(
cpu_to_be32(from->sb_features_log_incompat);
to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (xfs_sb_version_hasmetauuid(from))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
}
}
@@ -658,11 +679,13 @@ xfs_sb_write_verify(
}
const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .name = "xfs_sb",
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .name = "xfs_sb_quiet",
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 5be5297..15c3ceb 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index e7e26bd..2e2c671 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -31,6 +31,7 @@
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
@@ -60,10 +61,11 @@ xfs_symlink_hdr_set(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return 0;
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
dsl->sl_offset = cpu_to_be32(offset);
dsl->sl_bytes = cpu_to_be32(size);
- uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
dsl->sl_owner = cpu_to_be64(ino);
dsl->sl_blkno = cpu_to_be64(bp->b_bn);
bp->b_ops = &xfs_symlink_buf_ops;
@@ -107,7 +109,7 @@ xfs_symlink_verify(
return false;
if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
return false;
- if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
return false;
@@ -116,6 +118,8 @@ xfs_symlink_verify(
return false;
if (dsl->sl_owner == 0)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return false;
return true;
}
@@ -164,6 +168,7 @@ xfs_symlink_write_verify(
}
const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .name = "xfs_symlink",
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
};
@@ -183,6 +188,7 @@ xfs_symlink_local_to_remote(
if (!xfs_sb_version_hascrc(&mp->m_sb)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -198,4 +204,6 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4b64167..2d5df1f 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,16 +37,19 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
- struct xfs_acl *aclp,
- int max_entries)
+ const struct xfs_acl *aclp,
+ int len,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
- struct xfs_acl_entry *ace;
+ const struct xfs_acl_entry *ace;
unsigned int count, i;
+ if (len < sizeof(*aclp))
+ return ERR_PTR(-EFSCORRUPTED);
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -160,10 +163,11 @@ xfs_get_acl(struct inode *inode, int type)
*/
if (error == -ENOATTR)
goto out_update_cache;
+ acl = ERR_PTR(error);
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
@@ -248,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode)
return error;
}
-static int
-xfs_acl_exists(struct inode *inode, unsigned char *name)
-{
- int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
-
- return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
- ATTR_ROOT|ATTR_KERNOVAL) == 0);
-}
-
-int
-posix_acl_access_exists(struct inode *inode)
-{
- return xfs_acl_exists(inode, SGI_ACL_FILE);
-}
-
-int
-posix_acl_default_exists(struct inode *inode)
-{
- if (!S_ISDIR(inode->i_mode))
- return 0;
- return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
-}
-
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 3841b07..286fa89 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -20,20 +20,18 @@
struct inode;
struct posix_acl;
-struct xfs_inode;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int posix_acl_access_exists(struct inode *inode);
-extern int posix_acl_default_exists(struct inode *inode);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
# define xfs_set_acl NULL
-# define posix_acl_access_exists(inode) 0
-# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
+
+extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
+
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3859f5e..379c089 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc(
* We may pass freeze protection with a transaction. So tell lockdep
* we released it.
*/
- rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
/*
* We hand off the transaction to the completion thread now, so
* clear the flag here.
@@ -171,8 +170,13 @@ xfs_setfilesize_ioend(
* Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
- rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
+
+ /* we abort the update if there was an IO error */
+ if (ioend->io_error) {
+ xfs_trans_cancel(tp);
+ return ioend->io_error;
+ }
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
@@ -214,14 +218,17 @@ xfs_end_io(
ioend->io_error = -EIO;
goto done;
}
- if (ioend->io_error)
- goto done;
/*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
+ * Detecting and handling completion IO errors is done individually
+ * for each case as different cleanup operations need to be performed
+ * on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ if (ioend->io_error)
+ goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
@@ -351,12 +358,12 @@ xfs_imap_valid(
*/
STATIC void
xfs_end_bio(
- struct bio *bio,
- int error)
+ struct bio *bio)
{
xfs_ioend_t *ioend = bio->bi_private;
- ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+ if (!ioend->io_error)
+ ioend->io_error = bio->bi_error;
/* Toss bio and pass work off to an xfsdatad thread */
bio->bi_private = NULL;
@@ -382,8 +389,7 @@ STATIC struct bio *
xfs_alloc_ioend_bio(
struct buffer_head *bh)
{
- int nvecs = bio_get_nr_vecs(bh->b_bdev);
- struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
+ struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
ASSERT(bio->bi_private == NULL);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
@@ -1253,13 +1259,28 @@ xfs_vm_releasepage(
* the DIO. There is only going to be one reference to the ioend and its life
* cycle is constrained by the DIO completion code. hence we don't need
* reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
*/
+
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset)
+ xfs_off_t offset,
+ bool dax_fault)
{
struct xfs_ioend *ioend;
xfs_off_t size = bh_result->b_size;
@@ -1272,6 +1293,13 @@ xfs_map_direct(
trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+ if (dax_fault) {
+ ASSERT(type == XFS_IO_OVERWRITE);
+ trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+ imap);
+ return;
+ }
+
if (bh_result->b_private) {
ioend = bh_result->b_private;
ASSERT(ioend->io_size > 0);
@@ -1286,7 +1314,8 @@ xfs_map_direct(
ioend->io_size, ioend->io_type,
imap);
} else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode)) {
+ offset + size > i_size_read(inode) ||
+ offset + size < 0) {
ioend = xfs_alloc_ioend(inode, type);
ioend->io_offset = offset;
ioend->io_size = size;
@@ -1348,7 +1377,8 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- bool direct)
+ bool direct,
+ bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1396,18 +1426,20 @@ __xfs_get_blocks(
if (error)
goto out_unlock;
+ /* for DAX, we convert unwritten extents directly */
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK))) {
+ imap.br_startblock == DELAYSTARTBLOCK) ||
+ (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
if (direct || xfs_get_extsz_hint(ip)) {
/*
- * Drop the ilock in preparation for starting the block
- * allocation transaction. It will be retaken
- * exclusively inside xfs_iomap_write_direct for the
- * actual allocation.
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
*/
- xfs_iunlock(ip, lockmode);
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
@@ -1444,6 +1476,12 @@ __xfs_get_blocks(
goto out_unlock;
}
+ if (IS_DAX(inode) && create) {
+ ASSERT(!ISUNWRITTEN(&imap));
+ /* zeroing is not needed at a higher layer */
+ new = 0;
+ }
+
/* trim mapping down to size requested */
if (direct || size > (1 << inode->i_blkbits))
xfs_map_trim_size(inode, iblock, bh_result,
@@ -1461,7 +1499,8 @@ __xfs_get_blocks(
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset);
+ xfs_map_direct(inode, bh_result, &imap, offset,
+ dax_fault);
}
/*
@@ -1508,7 +1547,7 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}
int
@@ -1518,7 +1557,17 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
static void
@@ -1617,45 +1666,6 @@ xfs_end_io_direct_write(
__xfs_end_io_direct_write(inode, ioend, offset, size);
}
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
- struct buffer_head *bh,
- int uptodate)
-{
- struct xfs_ioend *ioend = bh->b_private;
- struct inode *inode = ioend->io_inode;
- ssize_t size = ioend->io_size;
-
- ASSERT(IS_DAX(ioend->io_inode));
-
- /* if there was an error zeroing, then don't convert it */
- if (!uptodate)
- ioend->io_error = -EIO;
-
- /*
- * Trim update to EOF, so we don't extend EOF during unwritten extent
- * conversion of partial EOF blocks.
- */
- spin_lock(&XFS_I(inode)->i_flags_lock);
- if (ioend->io_offset + size > i_size_read(inode))
- size = i_size_read(inode) - ioend->io_offset;
- spin_unlock(&XFS_I(inode)->i_flags_lock);
-
- __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
static inline ssize_t
xfs_vm_do_dio(
struct inode *inode,
@@ -1907,6 +1917,7 @@ xfs_vm_readpage(
struct file *unused,
struct page *page)
{
+ trace_xfs_vm_readpage(page->mapping->host, 1);
return mpage_readpage(page, xfs_get_blocks);
}
@@ -1917,6 +1928,7 @@ xfs_vm_readpages(
struct list_head *pages,
unsigned nr_pages)
{
+ trace_xfs_vm_readpages(mapping->host, nr_pages);
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1a..f6ffc9a 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -58,7 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
+int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 65fb37a..0ef7c2e 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -511,7 +511,7 @@ xfs_attr_list_int(
xfs_inode_t *dp = context->dp;
uint lock_mode;
- XFS_STATS_INC(xs_attr_list);
+ XFS_STATS_INC(dp->i_mount, xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0f34886c..45ec9e4 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -57,71 +57,104 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
+
+ if (IS_DAX(VFS_I(ip)))
+ return dax_clear_blocks(VFS_I(ip), block, size);
+
+ /*
+ * let the block layer decide on the fastest method of
+ * implementing the zeroing.
+ */
+ return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
+/*
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
* last due to locking considerations. We never free any extents in
* the first transaction.
*
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
+ * If an inode *ip is provided, rejoin it to the transaction if
+ * the transaction was committed.
*/
int /* error */
xfs_bmap_finish(
- xfs_trans_t **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed) /* xact committed or not */
+ struct xfs_trans **tp, /* transaction pointer addr */
+ struct xfs_bmap_free *flist, /* i/o: list extents to free */
+ struct xfs_inode *ip)
{
- xfs_efd_log_item_t *efd; /* extent free data */
- xfs_efi_log_item_t *efi; /* extent free intention */
- int error; /* error return value */
- xfs_bmap_free_item_t *free; /* free extent item */
- xfs_mount_t *mp; /* filesystem mount structure */
- xfs_bmap_free_item_t *next; /* next item on free list */
+ struct xfs_efd_log_item *efd; /* extent free data */
+ struct xfs_efi_log_item *efi; /* extent free intention */
+ int error; /* error return value */
+ int committed;/* xact committed or not */
+ struct xfs_bmap_free_item *free; /* free extent item */
+ struct xfs_bmap_free_item *next; /* next item on free list */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0) {
- *committed = 0;
+ if (flist->xbf_count == 0)
return 0;
- }
+
efi = xfs_trans_get_efi(*tp, flist->xbf_count);
for (free = flist->xbf_first; free; free = free->xbfi_next)
xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- error = xfs_trans_roll(tp, NULL);
- *committed = 1;
- /*
- * We have a new transaction, so we should return committed=1,
- * even though we're returning an error.
- */
- if (error)
+ error = __xfs_trans_roll(tp, ip, &committed);
+ if (error) {
+ /*
+ * If the transaction was committed, drop the EFD reference
+ * since we're bailing out of here. The other reference is
+ * dropped when the EFI hits the AIL.
+ *
+ * If the transaction was not committed, the EFI is freed by the
+ * EFI item unlock handler on abort. Also, we have a new
+ * transaction so we should return committed=1 even though we're
+ * returning an error.
+ */
+ if (committed) {
+ xfs_efi_release(efi);
+ xfs_force_shutdown((*tp)->t_mountp,
+ (error == -EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ }
return error;
+ }
+ /*
+ * Get an EFD and free each extent in the list, logging to the EFD in
+ * the process. The remaining bmap free list is cleaned up by the caller
+ * on error.
+ */
efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
for (free = flist->xbf_first; free != NULL; free = next) {
next = free->xbfi_next;
- if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
- free->xbfi_blockcount))) {
- /*
- * The bmap free list will be cleaned up at a
- * higher level. The EFI will be canceled when
- * this transaction is aborted.
- * Need to force shutdown here to make sure it
- * happens, since this transaction may not be
- * dirty yet.
- */
- mp = (*tp)->t_mountp;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp,
- (error == -EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
+
+ error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ if (error)
return error;
- }
- xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
+
xfs_bmap_del_free(flist, NULL, free);
}
+
return 0;
}
@@ -222,6 +255,13 @@ xfs_bmap_rtalloc(
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+ /* Zero the extent if we were asked to do so */
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+ if (error)
+ return error;
+ }
} else {
ap->length = 0;
}
@@ -926,7 +966,6 @@ xfs_alloc_file_space(
xfs_bmbt_irec_t imaps[1], *imapp;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
- int committed;
int error;
trace_xfs_alloc_file_space(ip);
@@ -1020,24 +1059,21 @@ xfs_alloc_file_space(
xfs_bmap_init(&free_list, &firstfsb);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, &firstfsb,
- 0, imapp, &nimaps, &free_list);
- if (error) {
+ resblks, imapp, &nimaps, &free_list);
+ if (error)
goto error0;
- }
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
goto error0;
- }
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error) {
+ if (error)
break;
- }
allocated_fsb = imapp->br_blockcount;
@@ -1163,7 +1199,6 @@ xfs_free_file_space(
xfs_off_t offset,
xfs_off_t len)
{
- int committed;
int done;
xfs_fileoff_t endoffset_fsb;
int error;
@@ -1303,17 +1338,15 @@ xfs_free_file_space(
error = xfs_bunmapi(tp, ip, startoffset_fsb,
endoffset_fsb - startoffset_fsb,
0, 2, &firstfsb, &free_list, &done);
- if (error) {
+ if (error)
goto error0;
- }
/*
* complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
goto error0;
- }
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1391,7 +1424,6 @@ xfs_shift_file_space(
int error;
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
- int committed;
xfs_fileoff_t stop_fsb;
xfs_fileoff_t next_fsb;
xfs_fileoff_t shift_fsb;
@@ -1467,7 +1499,7 @@ xfs_shift_file_space(
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
XFS_QMOPT_RES_REGBLKS);
if (error)
- goto out;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1481,18 +1513,20 @@ xfs_shift_file_space(
&done, stop_fsb, &first_block, &free_list,
direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
if (error)
- goto out;
+ goto out_bmap_cancel;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
- goto out;
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp);
}
return error;
-out:
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a4b7d92..daed4bfb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -201,7 +201,7 @@ _xfs_buf_alloc(
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(xb_create);
+ XFS_STATS_INC(target->bt_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -354,15 +354,16 @@ retry:
*/
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(xb_page_retries);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(xb_page_found);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -438,7 +439,6 @@ _xfs_buf_find(
xfs_buf_flags_t flags,
xfs_buf_t *new_bp)
{
- size_t numbytes;
struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent;
@@ -450,10 +450,9 @@ _xfs_buf_find(
for (i = 0; i < nmaps; i++)
numblks += map[i].bm_len;
- numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
/*
@@ -518,7 +517,7 @@ _xfs_buf_find(
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
} else {
- XFS_STATS_INC(xb_miss_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
}
@@ -531,11 +530,11 @@ found:
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
- XFS_STATS_INC(xb_busy_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return NULL;
}
xfs_buf_lock(bp);
- XFS_STATS_INC(xb_get_locked_waited);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
/*
@@ -551,7 +550,7 @@ found:
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(xb_get_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
return bp;
}
@@ -605,7 +604,14 @@ found:
}
}
- XFS_STATS_INC(xb_get);
+ /*
+ * Clear b_error if this is a lookup from a caller that doesn't expect
+ * valid data to be found in the buffer.
+ */
+ if (!(flags & XBF_READ))
+ xfs_buf_ioerror(bp, 0);
+
+ XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
}
@@ -645,7 +651,7 @@ xfs_buf_read_map(
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!XFS_BUF_ISDONE(bp)) {
- XFS_STATS_INC(xb_get_read);
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
@@ -1046,7 +1052,7 @@ xfs_buf_ioend_work(
xfs_buf_ioend(bp);
}
-void
+static void
xfs_buf_ioend_async(
struct xfs_buf *bp)
{
@@ -1096,8 +1102,7 @@ xfs_bwrite(
STATIC void
xfs_buf_bio_end_io(
- struct bio *bio,
- int error)
+ struct bio *bio)
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
@@ -1105,10 +1110,10 @@ xfs_buf_bio_end_io(
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
*/
- if (error) {
+ if (bio->bi_error) {
spin_lock(&bp->b_lock);
if (!bp->b_io_error)
- bp->b_io_error = error;
+ bp->b_io_error = bio->bi_error;
spin_unlock(&bp->b_lock);
}
@@ -1533,9 +1538,10 @@ xfs_wait_buftarg(
list_del_init(&bp->b_lru);
if (bp->b_flags & XBF_WRITE_FAIL) {
xfs_alert(btp->bt_mount,
-"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
-"Please run xfs_repair to determine the extent of the problem.",
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
(long long)bp->b_bn);
+ xfs_alert(btp->bt_mount,
+"Please run xfs_repair to determine the extent of the problem.");
}
xfs_buf_rele(bp);
}
@@ -1633,13 +1639,9 @@ xfs_setsize_buftarg(
btp->bt_meta_sectormask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
- char name[BDEVNAME_SIZE];
-
- bdevname(btp->bt_bdev, name);
-
xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %s",
- sectorsize, name);
+ "Cannot set_blocksize to %u on device %pg",
+ sectorsize, btp->bt_bdev);
return -EINVAL;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1cc..c75721a 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
@@ -131,6 +132,7 @@ struct xfs_buf_map {
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
struct xfs_buf_ops {
+ char *name;
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
};
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 092d652..7e986da 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -647,11 +647,7 @@ xfs_buf_item_unlock(
xfs_buf_item_relse(bp);
else if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
- if (lip->li_flags & XFS_LI_IN_AIL) {
- spin_lock(&lip->li_ailp->xa_lock);
- xfs_trans_ail_delete(lip->li_ailp, lip,
- SHUTDOWN_LOG_IO_ERROR);
- }
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
}
}
@@ -750,13 +746,13 @@ xfs_buf_item_free_format(
* buffer (see xfs_buf_attach_iodone() below), then put the
* buf log item at the front.
*/
-void
+int
xfs_buf_item_init(
- xfs_buf_t *bp,
- xfs_mount_t *mp)
+ struct xfs_buf *bp,
+ struct xfs_mount *mp)
{
- xfs_log_item_t *lip = bp->b_fspriv;
- xfs_buf_log_item_t *bip;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip;
int chunks;
int map_size;
int error;
@@ -770,12 +766,11 @@ xfs_buf_item_init(
*/
ASSERT(bp->b_target->bt_mount == mp);
if (lip != NULL && lip->li_type == XFS_LI_BUF)
- return;
+ return 0;
bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
- xfs_buf_hold(bp);
/*
* chunks is the number of XFS_BLF_CHUNK size pieces the buffer
@@ -788,6 +783,11 @@ xfs_buf_item_init(
*/
error = xfs_buf_item_get_format(bip, bp->b_map_count);
ASSERT(error == 0);
+ if (error) { /* to stop gcc throwing set-but-unused warnings */
+ kmem_zone_free(xfs_buf_item_zone, bip);
+ return error;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
@@ -807,6 +807,8 @@ xfs_buf_item_init(
if (bp->b_fspriv)
bip->bli_item.li_bio_list = bp->b_fspriv;
bp->b_fspriv = bip;
+ xfs_buf_hold(bp);
+ return 0;
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 3f3455a..f7eba99 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item {
struct xfs_buf_log_format __bli_format; /* embedded in-log header */
} xfs_buf_log_item_t;
-void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 098cd78..642d55d 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -171,6 +171,7 @@ xfs_dir2_block_getdents(
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
+ int lock_mode;
/*
* If the block number in the offset is out of range, we're done.
@@ -178,7 +179,9 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir3_block_read(NULL, dp, &bp);
+ xfs_iunlock(dp, lock_mode);
if (error)
return error;
@@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents(
* current buffer, need to get another one.
*/
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+ int lock_mode;
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
&curoff, &bp);
+ xfs_iunlock(dp, lock_mode);
if (error || !map_info->map_valid)
break;
@@ -653,7 +659,6 @@ xfs_readdir(
struct xfs_da_args args = { NULL };
int rval;
int v;
- uint lock_mode;
trace_xfs_readdir(dp);
@@ -661,12 +666,12 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_getdents);
+ XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
- lock_mode = xfs_ilock_data_map_shared(dp);
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -675,7 +680,7 @@ xfs_readdir(
rval = xfs_dir2_block_getdents(&args, ctx);
else
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
- xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return rval;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 4143dc7..9c44d38 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -75,9 +75,9 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_qm_dqzone, dqp);
- XFS_STATS_DEC(xs_qm_dquot);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
}
/*
@@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -306,7 +306,7 @@ xfs_qm_dqalloc(
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
xfs_bmbt_irec_t map;
- int nmaps, error, committed;
+ int nmaps, error;
xfs_buf_t *bp;
xfs_trans_t *tp = *tpp;
@@ -379,11 +379,12 @@ xfs_qm_dqalloc(
xfs_trans_bhold(tp, bp);
- if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+ error = xfs_bmap_finish(tpp, &flist, NULL);
+ if (error)
goto error1;
- }
- if (committed) {
+ /* Transaction was committed? */
+ if (*tpp != tp) {
tp = *tpp;
xfs_trans_bjoin(tp, bp);
} else {
@@ -393,9 +394,9 @@ xfs_qm_dqalloc(
*O_bpp = bp;
return 0;
- error1:
+error1:
xfs_bmap_cancel(&flist);
- error0:
+error0:
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return error;
@@ -605,7 +606,7 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(xs_qm_dquot);
+ XFS_STATS_INC(mp, xs_qm_dquot);
trace_xfs_dqread(dqp);
@@ -747,12 +748,12 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(xs_qm_dqcachehits);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
*O_dqpp = dqp;
return 0;
}
mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(xs_qm_dqcachemisses);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -806,7 +807,7 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(xs_qm_dquot_dups);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
@@ -846,7 +847,7 @@ xfs_qm_dqput(
trace_xfs_dqput_free(dqp);
if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
}
@@ -954,12 +955,8 @@ xfs_qm_dqflush(
struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
dqp->dq_flags &= ~XFS_DQ_DIRTY;
- spin_lock(&mp->m_ail->xa_lock);
- if (lip->li_flags & XFS_LI_IN_AIL)
- xfs_trans_ail_delete(mp->m_ail, lip,
- SHUTDOWN_CORRUPT_INCORE);
- else
- spin_unlock(&mp->m_ail->xa_lock);
+ xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE);
+
error = -EIO;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 74d0e59..88693a9 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -164,9 +164,9 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- __return_address, bp->b_bn);
+ __return_address, bp->b_ops->name, bp->b_bn);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index adc8f8f..4aa0153 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -47,28 +47,6 @@ xfs_efi_item_free(
}
/*
- * Freeing the efi requires that we remove it from the AIL if it has already
- * been placed there. However, the EFI may not yet have been placed in the AIL
- * when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the EFI.
- */
-STATIC void
-__xfs_efi_release(
- struct xfs_efi_log_item *efip)
-{
- struct xfs_ail *ailp = efip->efi_item.li_ailp;
-
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- spin_lock(&ailp->xa_lock);
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &efip->efi_item,
- SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
-}
-
-/*
* This returns the number of iovecs needed to log the given efi item.
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
@@ -128,12 +106,12 @@ xfs_efi_item_pin(
}
/*
- * While EFIs cannot really be pinned, the unpin operation is the last place at
- * which the EFI is manipulated during a transaction. If we are being asked to
- * remove the EFI it's because the transaction has been cancelled and by
- * definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it. Otherwise coordinate with xfs_efi_release()
- * to determine who gets to free the EFI.
+ * The unpin operation is the last place an EFI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the EFI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the EFI to either construct
+ * and commit the EFD or drop the EFD's reference in the event of error. Simply
+ * drop the log's EFI reference now that the log is done with it.
*/
STATIC void
xfs_efi_item_unpin(
@@ -141,15 +119,7 @@ xfs_efi_item_unpin(
int remove)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-
- if (remove) {
- ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
- if (lip->li_desc)
- xfs_trans_del_item(lip);
- xfs_efi_item_free(efip);
- return;
- }
- __xfs_efi_release(efip);
+ xfs_efi_release(efip);
}
/*
@@ -167,6 +137,11 @@ xfs_efi_item_push(
return XFS_ITEM_PINNED;
}
+/*
+ * The EFI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an EFD isn't going to be
+ * constructed and thus we free the EFI here directly.
+ */
STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
@@ -301,23 +276,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
/*
- * This is called by the efd item code below to release references to the given
- * efi item. Each efd calls this with the number of extents that it has
- * logged, and when the sum of these reaches the total number of extents logged
- * by this efi item we can free the efi item.
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
*/
void
-xfs_efi_release(xfs_efi_log_item_t *efip,
- uint nextents)
+xfs_efi_release(
+ struct xfs_efi_log_item *efip)
{
- ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
- if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
- /* recovery needs us to drop the EFI reference, too */
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
- __xfs_efi_release(efip);
-
- __xfs_efi_release(efip);
- /* efip may now have been freed, do not reference it again. */
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
+ xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_efi_item_free(efip);
}
}
@@ -415,20 +386,27 @@ xfs_efd_item_push(
return XFS_ITEM_PINNED;
}
+/*
+ * The EFD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the EFI and free the EFD.
+ */
STATIC void
xfs_efd_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_efd_item_free(EFD_ITEM(lip));
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ if (lip->li_flags & XFS_LI_ABORTED) {
+ xfs_efi_release(efdp->efd_efip);
+ xfs_efd_item_free(efdp);
+ }
}
/*
- * When the efd item is committed to disk, all we need to do
- * is delete our reference to our partner efi item and then
- * free ourselves. Since we're freeing ourselves we must
- * return -1 to keep the transaction code from further referencing
- * this item.
+ * When the efd item is committed to disk, all we need to do is delete our
+ * reference to our partner efi item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from further
+ * referencing this item.
*/
STATIC xfs_lsn_t
xfs_efd_item_committed(
@@ -438,13 +416,14 @@ xfs_efd_item_committed(
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
/*
- * If we got a log I/O error, it's always the case that the LR with the
- * EFI got unpinned and freed before the EFD got aborted.
+ * Drop the EFI reference regardless of whether the EFD has been
+ * aborted. Once the EFD transaction is constructed, it is the sole
+ * responsibility of the EFD to release the EFI (even if the EFI is
+ * aborted due to log I/O error).
*/
- if (!(lip->li_flags & XFS_LI_ABORTED))
- xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
-
+ xfs_efi_release(efdp->efd_efip);
xfs_efd_item_free(efdp);
+
return (xfs_lsn_t)-1;
}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0ffbce3..8fa8651 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -39,9 +39,28 @@ struct kmem_zone;
* "extent free done" log item described below.
*
* The EFI is reference counted so that it is not freed prior to both the EFI
- * and EFD being committed and unpinned. This ensures that when the last
- * reference goes away the EFI will always be in the AIL as it has been
- * unpinned, regardless of whether the EFD is processed before or after the EFI.
+ * and EFD being committed and unpinned. This ensures the EFI is inserted into
+ * the AIL even in the event of out of order EFI/EFD processing. In other words,
+ * an EFI is born with two references:
+ *
+ * 1.) an EFI held reference to track EFI AIL insertion
+ * 2.) an EFD held reference to track EFD commit
+ *
+ * On allocation, both references are the responsibility of the caller. Once the
+ * EFI is added to and dirtied in a transaction, ownership of reference one
+ * transfers to the transaction. The reference is dropped once the EFI is
+ * inserted to the AIL or in the event of failure along the way (e.g., commit
+ * failure, log I/O error, etc.). Note that the caller remains responsible for
+ * the EFD reference under all circumstances to this point. The caller has no
+ * means to detect failure once the transaction is committed, however.
+ * Therefore, an EFD is required after this point, even in the event of
+ * unrelated failure.
+ *
+ * Once an EFD is allocated and dirtied in a transaction, reference two
+ * transfers to the transaction. The EFD reference is dropped once it reaches
+ * the unpin handler. Similar to the EFI, the reference also drops in the event
+ * of commit failure or log I/O errors. Note that the EFD is not inserted in the
+ * AIL, so at this point both the EFI and EFD are freed.
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
@@ -77,5 +96,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
+void xfs_efi_release(struct xfs_efi_log_item *);
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index db4acc1..ebe9b82 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -242,19 +242,30 @@ xfs_file_fsync(
}
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to
+ * flush the log up to the latest LSN that touched the inode. If we have
+ * concurrent fsync/fdatasync() calls, we need them to all block on the
+ * log force before we clear the ili_fsync_fields field. This ensures
+ * that we don't get a racing sync operation that does not wait for the
+ * metadata to hit the journal before returning. If we race with
+ * clearing the ili_fsync_fields, then all that will happen is the log
+ * force will do nothing as the lsn will already be on disk. We can't
+ * race with setting ili_fsync_fields because that is done under
+ * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+ * until after the ili_fsync_fields is cleared.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (lsn)
+ if (lsn) {
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ ip->i_itemp->ili_fsync_fields = 0;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
/*
* If we only have a single device, and the log force about was
@@ -287,7 +298,7 @@ xfs_file_read_iter(
xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(mp, xs_read_calls);
if (unlikely(iocb->ki_flags & IOCB_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
@@ -317,24 +328,33 @@ xfs_file_read_iter(
return -EIO;
/*
- * Locking is a bit tricky here. If we take an exclusive lock
- * for direct IO, we effectively serialise all new concurrent
- * read IO to this file and block it behind IO that is currently in
- * progress because IO in progress holds the IO lock shared. We only
- * need to hold the lock exclusive to blow away the page cache, so
- * only take lock exclusively if the page cache needs invalidation.
- * This allows the normal direct IO case of no page cache pages to
- * proceeed concurrently without serialisation.
+ * Locking is a bit tricky here. If we take an exclusive lock for direct
+ * IO, we effectively serialise all new concurrent read IO to this file
+ * and block it behind IO that is currently in progress because IO in
+ * progress holds the IO lock shared. We only need to hold the lock
+ * exclusive to blow away the page cache, so only take lock exclusively
+ * if the page cache needs invalidation. This allows the normal direct
+ * IO case of no page cache pages to proceeed concurrently without
+ * serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ /*
+ * The generic dio code only flushes the range of the particular
+ * I/O. Because we take an exclusive lock here, this whole
+ * sequence is considerably more expensive for us. This has a
+ * noticeable performance impact for any file with cached pages,
+ * even when outside of the range of the particular I/O.
+ *
+ * Hence, amortize the cost of the lock against a full file
+ * flush and reduce the chances of repeated iolock cycles going
+ * forward.
+ */
if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait_range(
- VFS_I(ip)->i_mapping,
- pos, pos + size - 1);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
@@ -345,9 +365,7 @@ xfs_file_read_iter(
* we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -358,7 +376,7 @@ xfs_file_read_iter(
ret = generic_file_read_iter(iocb, to);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -376,7 +394,7 @@ xfs_file_splice_read(
int ioflags = 0;
ssize_t ret;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(ip->i_mount, xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
@@ -384,19 +402,26 @@ xfs_file_splice_read(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- /* for dax, we need to avoid the page cache */
- if (IS_DAX(VFS_I(ip)))
- ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
- else
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ /*
+ * DAX inodes cannot ues the page cache for splice, so we have to push
+ * them through the VFS IO path. This means it goes through
+ * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
+ * cannot lock the splice operation at this level for DAX inodes.
+ */
+ if (IS_DAX(VFS_I(ip))) {
+ ret = default_file_splice_read(infilp, ppos, pipe, count,
+ flags);
+ goto out;
+ }
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+out:
+ if (ret > 0)
+ XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
return ret;
}
@@ -475,6 +500,8 @@ xfs_zero_eof(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
+ trace_xfs_zero_eof(ip, isize, offset - isize);
+
/*
* First handle zeroing the block on which isize resides.
*
@@ -567,6 +594,7 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
ssize_t error = 0;
size_t count = iov_iter_count(from);
+ bool drained_dio = false;
restart:
error = generic_write_checks(iocb, from);
@@ -604,12 +632,13 @@ restart:
bool zero = false;
spin_unlock(&ip->i_flags_lock);
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
-
+ if (!drained_dio) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
@@ -619,6 +648,7 @@ restart:
* no-op.
*/
inode_dio_wait(inode);
+ drained_dio = true;
goto restart;
}
error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -733,19 +763,19 @@ xfs_file_dio_aio_write(
pos = iocb->ki_pos;
end = pos + count - 1;
+ /*
+ * See xfs_file_read_iter() for why we do a full-file flush here.
+ */
if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, end);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret)
goto out;
/*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -860,7 +890,7 @@ xfs_file_write_iter(
ssize_t ret;
size_t ocount = iov_iter_count(from);
- XFS_STATS_INC(xs_write_calls);
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
if (ocount == 0)
return 0;
@@ -876,7 +906,7 @@ xfs_file_write_iter(
if (ret > 0) {
ssize_t err;
- XFS_STATS_ADD(xs_write_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1470,7 +1500,7 @@ xfs_file_llseek(
*
* mmap_sem (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmap_lock (XFS - truncate serialisation)
+ * i_mmaplock (XFS - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1496,10 +1526,9 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else {
- ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
}
@@ -1531,7 +1560,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1539,10 +1568,85 @@ xfs_filemap_fault(
return ret;
}
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases here. @flags carries the information on the type of fault
+ * occuring.
+ */
+STATIC int
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+ NULL);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * pfn_mkwrite was originally inteneded to ensure we capture time stamp
+ * updates on write faults. In reality, it's need to serialise against
+ * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+ * barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ trace_xfs_filemap_pfn_mkwrite(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+
+ /* check if the faulting page hasn't raced with truncate */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
STATIC int
@@ -1553,7 +1657,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9b3438a..ee3aaa0a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -250,7 +250,7 @@ xfs_growfs_data_private(
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
@@ -273,7 +273,7 @@ xfs_growfs_data_private(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
agfl->agfl_seqno = cpu_to_be32(agno);
- uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
}
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
@@ -309,7 +309,7 @@ xfs_growfs_data_private(
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
agi->agi_free_level = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 76a9f278..d7a490f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,7 +63,7 @@ xfs_inode_alloc(
return NULL;
}
- XFS_STATS_INC(vn_active);
+ XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
@@ -129,7 +129,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(vn_active);
+ XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
@@ -159,7 +159,7 @@ xfs_iget_cache_hit(
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -177,7 +177,7 @@ xfs_iget_cache_hit(
*/
if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -259,7 +259,7 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
+ XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -342,7 +342,7 @@ xfs_iget_cache_miss(
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
+ XFS_STATS_INC(mp, xs_ig_dup);
error = -EAGAIN;
goto out_preload_end;
}
@@ -412,6 +412,8 @@ xfs_iget(
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
+ XFS_STATS_INC(mp, xs_ig_attempts);
+
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
@@ -427,7 +429,7 @@ again:
goto out_error_or_again;
} else {
rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
+ XFS_STATS_INC(mp, xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
@@ -963,7 +965,7 @@ reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- XFS_STATS_INC(xs_ig_reclaims);
+ XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
/*
* Remove the inode from the per-AG radix tree.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3da9f4d..ae3758a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -164,7 +164,7 @@ xfs_ilock(
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
if (lock_flags & XFS_IOLOCK_EXCL)
mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
@@ -212,7 +212,7 @@ xfs_ilock_nowait(
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!mrtryupdate(&ip->i_iolock))
@@ -281,7 +281,7 @@ xfs_iunlock(
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
ASSERT(lock_flags != 0);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -363,31 +363,57 @@ int xfs_lock_delays;
#endif
/*
+ * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
+ * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
+ * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
+ * errors and warnings.
+ */
+#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
+static bool
+xfs_lockdep_subclass_ok(
+ int subclass)
+{
+ return subclass < MAX_LOCKDEP_SUBCLASSES;
+}
+#else
+#define xfs_lockdep_subclass_ok(subclass) (true)
+#endif
+
+/*
* Bump the subclass so xfs_lock_inodes() acquires each lock with a different
- * value. This shouldn't be called for page fault locking, but we also need to
- * ensure we don't overrun the number of lockdep subclasses for the iolock or
- * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
+ * value. This can be called for any type of inode lock combination, including
+ * parent locking. Care must be taken to ensure we don't overrun the subclass
+ * storage fields in the class mask we build.
*/
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
+ int class = 0;
+
+ ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
+ XFS_ILOCK_RTSUM)));
+ ASSERT(xfs_lockdep_subclass_ok(subclass));
+
if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
- ASSERT(subclass + XFS_LOCK_INUMORDER <
- (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+ ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
+ ASSERT(xfs_lockdep_subclass_ok(subclass +
+ XFS_IOLOCK_PARENT_VAL));
+ class += subclass << XFS_IOLOCK_SHIFT;
+ if (lock_mode & XFS_IOLOCK_PARENT)
+ class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
}
if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
- ASSERT(subclass + XFS_LOCK_INUMORDER <
- (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
- XFS_MMAPLOCK_SHIFT;
+ ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
+ class += subclass << XFS_MMAPLOCK_SHIFT;
}
- if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+ if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
+ ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
+ class += subclass << XFS_ILOCK_SHIFT;
+ }
- return lock_mode;
+ return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}
/*
@@ -399,6 +425,11 @@ xfs_lock_inumorder(int lock_mode, int subclass)
* transaction (such as truncate). This can result in deadlock since the long
* running trans might need to wait for the inode we just locked in order to
* push the tail and free space in the log.
+ *
+ * xfs_lock_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
*/
void
xfs_lock_inodes(
@@ -409,8 +440,29 @@ xfs_lock_inodes(
int attempts = 0, i, j, try_lock;
xfs_log_item_t *lp;
- /* currently supports between 2 and 5 inodes */
+ /*
+ * Currently supports between 2 and 5 inodes with exclusive locking. We
+ * support an arbitrary depth of locking here, but absolute limits on
+ * inodes depend on the the type of locking and the limits placed by
+ * lockdep annotations in xfs_lock_inumorder. These are all checked by
+ * the asserts.
+ */
ASSERT(ips && inodes >= 2 && inodes <= 5);
+ ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
+ XFS_ILOCK_EXCL));
+ ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
+ XFS_ILOCK_SHARED)));
+ ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
+ inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
+ ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
+ inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
+ ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
+ inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
+
+ if (lock_mode & XFS_IOLOCK_EXCL) {
+ ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
+ } else if (lock_mode & XFS_MMAPLOCK_EXCL)
+ ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
try_lock = 0;
i = 0;
@@ -629,30 +681,29 @@ xfs_lookup(
{
xfs_ino_t inum;
int error;
- uint lock_mode;
trace_xfs_lookup(dp, name);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- lock_mode = xfs_ilock_data_map_shared(dp);
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
- xfs_iunlock(dp, lock_mode);
-
if (error)
- goto out;
+ goto out_unlock;
error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
if (error)
goto out_free_name;
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return 0;
out_free_name:
if (ci_name)
kmem_free(ci_name->name);
-out:
+out_unlock:
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
*ipp = NULL;
return error;
}
@@ -787,7 +838,7 @@ xfs_ialloc(
if (ip->i_d.di_version == 3) {
ASSERT(ip->i_d.di_ino == ino);
- ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
ip->i_d.di_crc = 0;
ip->i_d.di_changecount = 1;
ip->i_d.di_lsn = 0;
@@ -1092,7 +1143,6 @@ xfs_create(
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- int committed;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1149,7 +1199,8 @@ xfs_create(
goto out_trans_cancel;
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+ XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
xfs_bmap_init(&free_list, &first_block);
@@ -1174,12 +1225,9 @@ xfs_create(
* pointing to itself.
*/
error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
- prid, resblks > 0, &ip, &committed);
- if (error) {
- if (error == -ENOSPC)
- goto out_trans_cancel;
+ prid, resblks > 0, &ip, NULL);
+ if (error)
goto out_trans_cancel;
- }
/*
* Now we join the directory inode to the transaction. We do not do it
@@ -1188,7 +1236,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1226,7 +1274,7 @@ xfs_create(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -1261,7 +1309,7 @@ xfs_create(
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return error;
}
@@ -1318,11 +1366,8 @@ xfs_create_tmpfile(
error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
prid, resblks > 0, &ip, NULL);
- if (error) {
- if (error == -ENOSPC)
- goto out_trans_cancel;
+ if (error)
goto out_trans_cancel;
- }
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -1381,7 +1426,6 @@ xfs_link(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
int resblks;
trace_xfs_link(tdp, target_name);
@@ -1409,10 +1453,11 @@ xfs_link(
if (error)
goto error_return;
+ xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow hard link
@@ -1455,11 +1500,10 @@ xfs_link(
* link transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- }
- error = xfs_bmap_finish (&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error) {
xfs_bmap_cancel(&free_list);
goto error_return;
@@ -1508,7 +1552,6 @@ xfs_itruncate_extents(
xfs_fileoff_t first_unmap_block;
xfs_fileoff_t last_block;
xfs_filblks_t unmap_len;
- int committed;
int error = 0;
int done = 0;
@@ -1554,9 +1597,7 @@ xfs_itruncate_extents(
* Duplicate the transaction that has the permanent
* reservation and commit the old transaction.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (committed)
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_bmap_finish(&tp, &free_list, ip);
if (error)
goto out_bmap_cancel;
@@ -1727,7 +1768,6 @@ xfs_inactive_ifree(
{
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error;
@@ -1791,14 +1831,15 @@ xfs_inactive_ifree(
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
/*
- * Just ignore errors at this point. There is nothing we can
- * do except to try to keep going. Make sure it's not a silent
- * error.
+ * Just ignore errors at this point. There is nothing we can do except
+ * to try to keep going. Make sure it's not a silent error.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error) {
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
+ xfs_bmap_cancel(&free_list);
+ }
error = xfs_trans_commit(tp);
if (error)
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
@@ -2317,6 +2358,7 @@ retry:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -2474,7 +2516,6 @@ xfs_remove(
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
uint resblks;
trace_xfs_remove(dp, name);
@@ -2515,9 +2556,10 @@ xfs_remove(
goto out_trans_cancel;
}
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
@@ -2574,7 +2616,7 @@ xfs_remove(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -2651,7 +2693,6 @@ xfs_finish_rename(
struct xfs_trans *tp,
struct xfs_bmap_free *free_list)
{
- int committed = 0;
int error;
/*
@@ -2661,7 +2702,7 @@ xfs_finish_rename(
if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, free_list, &committed);
+ error = xfs_bmap_finish(&tp, free_list, NULL);
if (error) {
xfs_bmap_cancel(free_list);
xfs_trans_cancel(tp);
@@ -2898,6 +2939,12 @@ xfs_rename(
* whether the target directory is the same as the source
* directory, we can lock from 2 to 4 inodes.
*/
+ if (!new_parent)
+ xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+ else
+ xfs_lock_two_inodes(src_dp, target_dp,
+ XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
@@ -2905,9 +2952,9 @@ xfs_rename(
* we can rely on either trans_commit or trans_cancel to unlock
* them.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
@@ -3216,8 +3263,8 @@ xfs_iflush_cluster(
}
if (clcount) {
- XFS_STATS_INC(xs_icluster_flushcnt);
- XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
}
out_free:
@@ -3290,7 +3337,7 @@ xfs_iflush(
struct xfs_dinode *dip;
int error;
- XFS_STATS_INC(xs_iflush_count);
+ XFS_STATS_INC(mp, xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
@@ -3505,6 +3552,7 @@ xfs_iflush_int(
*/
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8f22d20..ca9e119 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -284,9 +284,9 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* Flags for lockdep annotations.
*
* XFS_LOCK_PARENT - for directory operations that require locking a
- * parent directory inode and a child entry inode. The parent gets locked
- * with this flag so it gets a lockdep subclass of 1 and the child entry
- * lock will have a lockdep subclass of 0.
+ * parent directory inode and a child entry inode. IOLOCK requires nesting,
+ * MMAPLOCK does not support this class, ILOCK requires a single subclass
+ * to differentiate parent from child.
*
* XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
* inodes do not participate in the normal lock order, and thus have their
@@ -295,30 +295,63 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* XFS_LOCK_INUMORDER - for locking several inodes at the some time
* with xfs_lock_inodes(). This flag is used as the starting subclass
* and each subsequent lock acquired will increment the subclass by one.
- * So the first lock acquired will have a lockdep subclass of 4, the
- * second lock will have a lockdep subclass of 5, and so on. It is
- * the responsibility of the class builder to shift this to the correct
- * portion of the lock_mode lockdep mask.
+ * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly
+ * limited to the subclasses we can represent via nesting. We need at least
+ * 5 inodes nest depth for the ILOCK through rename, and we also have to support
+ * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP
+ * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all
+ * 8 subclasses supported by lockdep.
+ *
+ * This also means we have to number the sub-classes in the lowest bits of
+ * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep
+ * mask and we can't use bit-masking to build the subclasses. What a mess.
+ *
+ * Bit layout:
+ *
+ * Bit Lock Region
+ * 16-19 XFS_IOLOCK_SHIFT dependencies
+ * 20-23 XFS_MMAPLOCK_SHIFT dependencies
+ * 24-31 XFS_ILOCK_SHIFT dependencies
+ *
+ * IOLOCK values
+ *
+ * 0-3 subclass value
+ * 4-7 PARENT subclass values
+ *
+ * MMAPLOCK values
+ *
+ * 0-3 subclass value
+ * 4-7 unused
+ *
+ * ILOCK values
+ * 0-4 subclass values
+ * 5 PARENT subclass (not nestable)
+ * 6 RTBITMAP subclass (not nestable)
+ * 7 RTSUM subclass (not nestable)
+ *
*/
-#define XFS_LOCK_PARENT 1
-#define XFS_LOCK_RTBITMAP 2
-#define XFS_LOCK_RTSUM 3
-#define XFS_LOCK_INUMORDER 4
-
-#define XFS_IOLOCK_SHIFT 16
-#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
-
-#define XFS_MMAPLOCK_SHIFT 20
-
-#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
-#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
-#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
-
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
-#define XFS_ILOCK_DEP_MASK 0xff000000
-#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \
+#define XFS_IOLOCK_SHIFT 16
+#define XFS_IOLOCK_PARENT_VAL 4
+#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
+
+#define XFS_MMAPLOCK_SHIFT 20
+#define XFS_MMAPLOCK_NUMORDER 0
+#define XFS_MMAPLOCK_MAX_SUBCLASS 3
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+
+#define XFS_ILOCK_SHIFT 24
+#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
+#define XFS_ILOCK_RTBITMAP_VAL 6
+#define XFS_ILOCK_RTSUM_VAL 7
+#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
+
+#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \
XFS_MMAPLOCK_DEP_MASK | \
XFS_ILOCK_DEP_MASK)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index bf13a5a..d14b12b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -703,17 +703,10 @@ xfs_iflush_abort(
xfs_inode_log_item_t *iip = ip->i_itemp;
if (iip) {
- struct xfs_ail *ailp = iip->ili_item.li_ailp;
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- spin_lock(&ailp->xa_lock);
- if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &iip->ili_item,
- stale ?
- SHUTDOWN_LOG_IO_ERROR :
+ xfs_trans_ail_remove(&iip->ili_item,
+ stale ? SHUTDOWN_LOG_IO_ERROR :
SHUTDOWN_CORRUPT_INCORE);
- } else
- spin_unlock(&ailp->xa_lock);
}
iip->ili_logged = 0;
/*
@@ -726,6 +719,7 @@ xfs_iflush_abort(
* attempted.
*/
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 488d812..4c7722e 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
} xfs_inode_log_item_t;
static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85a..d42738d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_pnfs.h"
+#include "xfs_acl.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -411,7 +412,7 @@ xfs_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
@@ -455,7 +456,7 @@ xfs_attrmulti_attr_get(
unsigned char *kbuf;
int error = -EFAULT;
- if (*len > XATTR_SIZE_MAX)
+ if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = kmem_zalloc_large(*len, KM_SLEEP);
if (!kbuf)
@@ -482,17 +483,22 @@ xfs_attrmulti_attr_set(
__uint32_t flags)
{
unsigned char *kbuf;
+ int error;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XATTR_SIZE_MAX)
+ if (len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = memdup_user(ubuf, len);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ kfree(kbuf);
+ return error;
}
int
@@ -501,9 +507,14 @@ xfs_attrmulti_attr_remove(
unsigned char *name,
__uint32_t flags)
{
+ int error;
+
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- return xfs_attr_remove(XFS_I(inode), name, flags);
+ error = xfs_attr_remove(XFS_I(inode), name, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ return error;
}
STATIC int
@@ -1028,7 +1039,7 @@ xfs_ioctl_setattr_xflags(
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
return 0;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index b88bdc8..1a05d8a 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle(
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033..d81bdc0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -129,22 +129,31 @@ xfs_iomap_write_direct(
xfs_trans_t *tp;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
- int committed;
int error;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
+ int lockmode;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
+ lockmode = XFS_ILOCK_SHARED; /* locked by caller */
+
+ ASSERT(xfs_isilocked(ip, lockmode));
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
if ((offset + count) > XFS_ISIZE(ip)) {
+ /*
+ * Assert that the in-core extent list is present since this can
+ * call xfs_iread_extents() and we only have the ilock shared.
+ * This should be safe because the lock was held around a bmapi
+ * call in the caller and we only need it to access the in-core
+ * list.
+ */
+ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
+ XFS_IFEXTENTS);
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- return error;
+ goto out_unlock;
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -174,9 +183,40 @@ xfs_iomap_write_direct(
}
/*
+ * Drop the shared lock acquired by the caller, attach the dquot if
+ * necessary and move on to transaction setup.
+ */
+ xfs_iunlock(ip, lockmode);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
* Allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+ /*
+ * For DAX, we do not allocate unwritten extents, but instead we zero
+ * the block before we commit the transaction. Ideally we'd like to do
+ * this outside the transaction context, but if we commit and then crash
+ * we may not have zeroed the blocks and this will be exposed on
+ * recovery of the allocation. Hence we must zero before commit.
+ *
+ * Further, if we are mapping unwritten extents here, we need to zero
+ * and convert them to written so that we don't need an unwritten extent
+ * callback for DAX. This also means that we need to be able to dip into
+ * the reserve block pool for bmbt block allocation if there is no space
+ * left but we need to do unwritten extent conversion.
+ */
+
+ if (IS_DAX(VFS_I(ip))) {
+ bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+ if (ISUNWRITTEN(imap)) {
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ }
+ }
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, resrtextents);
/*
@@ -187,7 +227,8 @@ xfs_iomap_write_direct(
return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
@@ -202,17 +243,18 @@ xfs_iomap_write_direct(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_PREALLOC, &firstfsb, 0,
- imap, &nimaps, &free_list);
+ bmapi_flags, &firstfsb, resblks, imap,
+ &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
+
error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -229,7 +271,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
out_bmap_cancel:
@@ -655,7 +697,7 @@ xfs_iomap_write_allocate(
xfs_bmap_free_t free_list;
xfs_filblks_t count_fsb;
xfs_trans_t *tp;
- int nimaps, committed;
+ int nimaps;
int error = 0;
int nres;
@@ -670,7 +712,7 @@ xfs_iomap_write_allocate(
count_fsb = imap->br_blockcount;
map_start_fsb = imap->br_startoff;
- XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
while (count_fsb != 0) {
/*
@@ -750,13 +792,13 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0,
- &first_block, 1,
- imap, &nimaps, &free_list);
+ count_fsb, 0, &first_block,
+ nres, imap, &nimaps,
+ &free_list);
if (error)
goto trans_cancel;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto trans_cancel;
@@ -777,7 +819,7 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff +
imap->br_blockcount))) {
- XFS_STATS_INC(xs_xstrat_quick);
+ XFS_STATS_INC(mp, xs_xstrat_quick);
return 0;
}
@@ -814,7 +856,6 @@ xfs_iomap_write_unwritten(
xfs_bmap_free_t free_list;
xfs_fsize_t i_size;
uint resblks;
- int committed;
int error;
trace_xfs_unwritten_convert(ip, offset, count);
@@ -866,8 +907,8 @@ xfs_iomap_write_unwritten(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, &firstfsb,
- 1, &imap, &nimaps, &free_list);
+ XFS_BMAPI_CONVERT, &firstfsb, resblks,
+ &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
@@ -886,7 +927,7 @@ xfs_iomap_write_unwritten(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 766b23f..06eafaf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -414,13 +414,17 @@ xfs_vn_rename(
* uio is kmalloced for this reason...
*/
STATIC const char *
-xfs_vn_follow_link(
+xfs_vn_get_link(
struct dentry *dentry,
- void **cookie)
+ struct inode *inode,
+ struct delayed_call *done)
{
char *link;
int error = -ENOMEM;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
if (!link)
goto out_err;
@@ -429,7 +433,8 @@ xfs_vn_follow_link(
if (unlikely(error))
goto out_kfree;
- return *cookie = link;
+ set_delayed_call(done, kfree_link, link);
+ return link;
out_kfree:
kfree(link);
@@ -609,7 +614,7 @@ xfs_setattr_nonsize(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error)
- goto out_dqrele;
+ goto out_trans_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -640,7 +645,7 @@ xfs_setattr_nonsize(
NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
- goto out_trans_cancel;
+ goto out_unlock;
}
}
@@ -695,7 +700,7 @@ xfs_setattr_nonsize(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -729,10 +734,10 @@ xfs_setattr_nonsize(
return 0;
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_trans_cancel:
xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
return error;
@@ -922,7 +927,7 @@ xfs_setattr_size(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -1172,8 +1177,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = xfs_vn_follow_link,
- .put_link = kfree_put_link,
+ .get_link = xfs_vn_get_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f41b0c3..930ebd8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -473,7 +473,8 @@ xfs_bulkstat(
* pending error, then we are done.
*/
del_cursor:
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, error ?
+ XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
xfs_buf_relse(agbp);
if (error)
break;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 85f883d..ec0e239 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -171,6 +171,13 @@ struct xfs_kobj {
struct completion complete;
};
+struct xstats {
+ struct xfsstats __percpu *xs_stats;
+ struct xfs_kobj xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
/* Kernel uid/gid conversion. These are used to convert to/from the on disk
* uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
* The conversion here is type only, the value will remain the same since we
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 08d4fe4..9c9a1c9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -268,7 +268,7 @@ xlog_grant_head_wait(
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&head->lock);
- XFS_STATS_INC(xs_sleep_logspace);
+ XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
trace_xfs_log_grant_sleep(log, tic);
schedule();
@@ -379,7 +379,7 @@ xfs_log_regrant(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
/*
* This is a new transaction on the ticket, so we need to change the
@@ -448,7 +448,7 @@ xfs_log_reserve(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
@@ -668,9 +668,9 @@ xfs_log_mount(
ASSERT(0);
goto out_free_log;
}
+ xfs_crit(mp, "Log size out of supported range.");
xfs_crit(mp,
-"Log size out of supported range. Continuing onwards, but if log hangs are\n"
-"experienced then please report this message in the bug report.");
+"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
}
/*
@@ -700,6 +700,7 @@ xfs_log_mount(
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
+ xlog_recover_cancel(mp->m_log);
goto out_destroy_ail;
}
}
@@ -740,18 +741,35 @@ out:
* it.
*/
int
-xfs_log_mount_finish(xfs_mount_t *mp)
+xfs_log_mount_finish(
+ struct xfs_mount *mp)
{
int error = 0;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
- error = xlog_recover_finish(mp->m_log);
- if (!error)
- xfs_log_work_queue(mp);
- } else {
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ return 0;
}
+ error = xlog_recover_finish(mp->m_log);
+ if (!error)
+ xfs_log_work_queue(mp);
+
+ return error;
+}
+
+/*
+ * The mount has failed. Cancel the recovery if it hasn't completed and destroy
+ * the log.
+ */
+int
+xfs_log_mount_cancel(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xlog_recover_cancel(mp->m_log);
+ xfs_log_unmount(mp);
return error;
}
@@ -1142,11 +1160,13 @@ xlog_space_left(
* In this case we just want to return the size of the
* log as the amount of space left.
*/
+ xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
xfs_alert(log->l_mp,
- "xlog_space_left: head behind tail\n"
- " tail_cycle = %d, tail_bytes = %d\n"
- " GH cycle = %d, GH bytes = %d",
- tail_cycle, tail_bytes, head_cycle, head_bytes);
+ " tail_cycle = %d, tail_bytes = %d",
+ tail_cycle, tail_bytes);
+ xfs_alert(log->l_mp,
+ " GH cycle = %d, GH bytes = %d",
+ head_cycle, head_bytes);
ASSERT(0);
free_bytes = log->l_logsize;
}
@@ -1168,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp)
int aborted = 0;
/*
- * Race to shutdown the filesystem if we see an error.
+ * Race to shutdown the filesystem if we see an error or the iclog is in
+ * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
+ * CRC errors into log recovery.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
- XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR,
+ XFS_RANDOM_IODONE_IOERR) ||
+ iclog->ic_state & XLOG_STATE_IOABORT) {
+ if (iclog->ic_state & XLOG_STATE_IOABORT)
+ iclog->ic_state &= ~XLOG_STATE_IOABORT;
+
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1652,8 +1678,13 @@ xlog_cksum(
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
int i;
+ int xheads;
- for (i = 1; i < log->l_iclog_heads; i++) {
+ xheads = size / XLOG_HEADER_CYCLE_SIZE;
+ if (size % XLOG_HEADER_CYCLE_SIZE)
+ xheads++;
+
+ for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
sizeof(struct xlog_rec_ext_header));
}
@@ -1743,7 +1774,7 @@ xlog_sync(
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
int size;
- XFS_STATS_INC(xs_log_writes);
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/* Add for LR header */
@@ -1780,7 +1811,7 @@ xlog_sync(
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
- XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+ XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -1813,6 +1844,23 @@ xlog_sync(
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
iclog->ic_datap, size);
+#ifdef DEBUG
+ /*
+ * Intentionally corrupt the log record CRC based on the error injection
+ * frequency, if defined. This facilitates testing log recovery in the
+ * event of torn writes. Hence, set the IOABORT state to abort the log
+ * write on I/O completion and shutdown the fs. The subsequent mount
+ * detects the bad CRC and attempts to recover.
+ */
+ if (log->l_badcrc_factor &&
+ (prandom_u32() % log->l_badcrc_factor == 0)) {
+ iclog->ic_header.h_crc &= 0xAAAAAAAA;
+ iclog->ic_state |= XLOG_STATE_IOABORT;
+ xfs_warn(log->l_mp,
+ "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
+ be64_to_cpu(iclog->ic_header.h_lsn));
+ }
+#endif
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
@@ -2020,34 +2068,34 @@ xlog_print_tic_res(
"QM_DQCLUSTER",
"QM_QINOCREATE",
"QM_QUOTAOFF_END",
- "SB_UNIT",
"FSYNC_TS",
"GROWFSRT_ALLOC",
"GROWFSRT_ZERO",
"GROWFSRT_FREE",
- "SWAPEXT"
+ "SWAPEXT",
+ "CHECKPOINT",
+ "ICREATE",
+ "CREATE_TMPFILE"
};
- xfs_warn(mp,
- "xlog_write: reservation summary:\n"
- " trans type = %s (%u)\n"
- " unit res = %d bytes\n"
- " current res = %d bytes\n"
- " total reg = %u bytes (o/flow = %u bytes)\n"
- " ophdrs = %u (ophdr space = %u bytes)\n"
- " ophdr + reg = %u bytes\n"
- " num regions = %u",
- ((ticket->t_trans_type <= 0 ||
- ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+ xfs_warn(mp, "xlog_write: reservation summary:");
+ xfs_warn(mp, " trans type = %s (%u)",
+ ((ticket->t_trans_type <= 0 ||
+ ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
"bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
- ticket->t_trans_type,
- ticket->t_unit_res,
- ticket->t_curr_res,
- ticket->t_res_arr_sum, ticket->t_res_o_flow,
- ticket->t_res_num_ophdrs, ophdr_spc,
- ticket->t_res_arr_sum +
- ticket->t_res_o_flow + ophdr_spc,
- ticket->t_res_num);
+ ticket->t_trans_type);
+ xfs_warn(mp, " unit res = %d bytes",
+ ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes",
+ ticket->t_curr_res);
+ xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
+ ticket->t_res_arr_sum, ticket->t_res_o_flow);
+ xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
+ ticket->t_res_num_ophdrs, ophdr_spc);
+ xfs_warn(mp, " ophdr + reg = %u bytes",
+ ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
+ xfs_warn(mp, " num regions = %u",
+ ticket->t_res_num);
for (i = 0; i < ticket->t_res_num; i++) {
uint r_type = ticket->t_res_arr[i].r_type;
@@ -2399,11 +2447,20 @@ xlog_write(
&partial_copy_len);
xlog_verify_dest_ptr(log, ptr);
- /* copy region */
+ /*
+ * Copy region.
+ *
+ * Unmount records just log an opheader, so can have
+ * empty payloads with no data region to copy. Hence we
+ * only copy the payload if the vector says it has data
+ * to copy.
+ */
ASSERT(copy_len >= 0);
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
-
+ if (copy_len > 0) {
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ copy_len);
+ }
copy_len += start_rec_copy + sizeof(xlog_op_header_t);
record_cnt++;
data_cnt += contwr ? copy_len : 0;
@@ -2759,11 +2816,19 @@ xlog_state_do_callback(
}
} while (!ioerrors && loopdidcallbacks);
+#ifdef DEBUG
/*
- * make one last gasp attempt to see if iclogs are being left in
- * limbo..
+ * Make one last gasp attempt to see if iclogs are being left in limbo.
+ * If the above loop finds an iclog earlier than the current iclog and
+ * in one of the syncing states, the current iclog is put into
+ * DO_CALLBACK and the callbacks are deferred to the completion of the
+ * earlier iclog. Walk the iclogs in order and make sure that no iclog
+ * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
+ * states.
+ *
+ * Note that SYNCING|IOABORT is a valid state so we cannot just check
+ * for ic_state == SYNCING.
*/
-#ifdef DEBUG
if (funcdidcallbacks) {
first_iclog = iclog = log->l_iclog;
do {
@@ -2778,7 +2843,7 @@ xlog_state_do_callback(
* IOERROR - give up hope all ye who enter here
*/
if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_SYNCING ||
+ iclog->ic_state & XLOG_STATE_SYNCING ||
iclog->ic_state == XLOG_STATE_DONE_SYNC ||
iclog->ic_state == XLOG_STATE_IOERROR )
break;
@@ -2890,7 +2955,7 @@ restart:
iclog = log->l_iclog;
if (iclog->ic_state != XLOG_STATE_ACTIVE) {
- XFS_STATS_INC(xs_log_noiclogs);
+ XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
/* Wait for log writes to have flushed */
xlog_wait(&log->l_flush_wait, &log->l_icloglock);
@@ -3142,11 +3207,19 @@ xlog_state_switch_iclogs(
}
if (log->l_curr_block >= log->l_logBBsize) {
+ /*
+ * Rewind the current block before the cycle is bumped to make
+ * sure that the combined LSN never transiently moves forward
+ * when the log wraps to the next cycle. This is to support the
+ * unlocked sample of these fields from xlog_valid_lsn(). Most
+ * other cases should acquire l_icloglock.
+ */
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
- log->l_curr_block -= log->l_logBBsize;
- ASSERT(log->l_curr_block >= 0);
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
@@ -3189,7 +3262,7 @@ _xfs_log_force(
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
xlog_cil_force(log);
@@ -3274,7 +3347,7 @@ maybe_sleep:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3339,7 +3412,7 @@ _xfs_log_force_lsn(
ASSERT(lsn != 0);
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
lsn = xlog_cil_force_lsn(log, lsn);
if (lsn == NULLCOMMITLSN)
@@ -3388,7 +3461,7 @@ try_again:
(XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
@@ -3418,7 +3491,7 @@ try_again:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -4000,3 +4073,45 @@ xlog_iclogs_empty(
return 1;
}
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn)
+{
+ struct xlog *log = mp->m_log;
+ bool valid;
+
+ /*
+ * norecovery mode skips mount-time log processing and unconditionally
+ * resets the in-core LSN. We can't validate in this mode, but
+ * modifications are not allowed anyways so just return true.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return true;
+
+ /*
+ * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+ * handled by recovery and thus safe to ignore here.
+ */
+ if (lsn == NULLCOMMITLSN)
+ return true;
+
+ valid = xlog_valid_lsn(mp->m_log, lsn);
+
+ /* warn the user about what's gone wrong before verifier failure */
+ if (!valid) {
+ spin_lock(&log->l_icloglock);
+ xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+ CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+ log->l_curr_cycle, log->l_curr_block);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return valid;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index fa27aae..aa533a7 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -147,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
+int xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
@@ -180,5 +181,6 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
+bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index abc2ccb..4e76493 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -307,7 +307,13 @@ xlog_cil_insert_items(
if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- list_move_tail(&lip->li_cil, &cil->xc_cil);
+ /*
+ * Only move the item if it isn't already at the tail. This is
+ * to prevent a transient list_empty() state when reinserting
+ * an item that is already the only item in the CIL.
+ */
+ if (!list_is_last(&lip->li_cil, &cil->xc_cil))
+ list_move_tail(&lip->li_cil, &cil->xc_cil);
}
/* account for space used by new iovec headers */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1c87c8a..ed88963 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i)
#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */
#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
@@ -410,6 +411,8 @@ struct xlog {
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
void *l_iclog_bak[XLOG_MAX_ICLOGS];
+ /* log record crc error injection factor */
+ uint32_t l_badcrc_factor;
#endif
};
@@ -426,6 +429,8 @@ xlog_recover(
extern int
xlog_recover_finish(
struct xlog *log);
+extern int
+xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
@@ -558,4 +563,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
remove_wait_queue(wq, &wait);
}
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn)
+{
+ int cur_cycle;
+ int cur_block;
+ bool valid = true;
+
+ /*
+ * First, sample the current lsn without locking to avoid added
+ * contention from metadata I/O. The current cycle and block are updated
+ * (in xlog_state_switch_iclogs()) and read here in a particular order
+ * to avoid false negatives (e.g., thinking the metadata LSN is valid
+ * when it is not).
+ *
+ * The current block is always rewound before the cycle is bumped in
+ * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+ * a transiently forward state. Instead, we can see the LSN in a
+ * transiently behind state if we happen to race with a cycle wrap.
+ */
+ cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ smp_rmb();
+ cur_block = ACCESS_ONCE(log->l_curr_block);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+ /*
+ * If the metadata LSN appears invalid, it's possible the check
+ * above raced with a wrap to the next log cycle. Grab the lock
+ * to check for sure.
+ */
+ spin_lock(&log->l_icloglock);
+ cur_cycle = log->l_curr_cycle;
+ cur_block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+ valid = false;
+ }
+
+ return valid;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 480ebba..da37beb 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -61,6 +61,9 @@ xlog_recover_check_summary(
#else
#define xlog_recover_check_summary(log)
#endif
+STATIC int
+xlog_do_recovery_pass(
+ struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
/*
* This structure is used during recovery to record the buf log items which
@@ -868,6 +871,351 @@ validate_head:
}
/*
+ * Seek backwards in the log for log record headers.
+ *
+ * Given a starting log block, walk backwards until we find the provided number
+ * of records or hit the provided tail block. The return value is the number of
+ * records encountered or a negative error code. The log block and buffer
+ * pointer of the last record seen are returned in rblk and rhead respectively.
+ */
+STATIC int
+xlog_rseek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk backwards from the head block until we hit the tail or the first
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? tail_blk : 0;
+ for (i = (int) head_blk - 1; i >= end_blk; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the tail block or the log record header count,
+ * start looking again from the end of the physical log. Note that
+ * callers can pass head == tail if the tail is not yet known.
+ */
+ if (tail_blk >= head_blk && found != count) {
+ for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is the
+ * number of records encountered or a negative error code. The log block and
+ * buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk forward from the tail block until we hit the head or the last
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+ for (i = (int) tail_blk; i <= end_blk; i++) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+
+ /*
+ * If we haven't hit the head block or the log record header count,
+ * start looking again from the start of the physical log.
+ */
+ if (tail_blk > head_blk && found != count) {
+ for (i = 0; i < (int) head_blk; i++) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous record.
+ * The tail of the previous record must now be verified to ensure the torn
+ * writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ struct xlog_rec_header *thead;
+ struct xfs_buf *bp;
+ xfs_daddr_t first_bad;
+ int count;
+ int error = 0;
+ bool wrapped;
+ xfs_daddr_t tmp_head;
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return -ENOMEM;
+
+ /*
+ * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+ * a temporary head block that points after the last possible
+ * concurrently written record of the tail.
+ */
+ count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+ XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+ &wrapped);
+ if (count < 0) {
+ error = count;
+ goto out;
+ }
+
+ /*
+ * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+ * into the actual log head. tmp_head points to the start of the record
+ * so update it to the actual head block.
+ */
+ if (count < XLOG_MAX_ICLOGS + 1)
+ tmp_head = head_blk;
+
+ /*
+ * We now have a tail and temporary head block that covers at least
+ * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+ * records were completely written. Run a CRC verification pass from
+ * tail to head and return the result.
+ */
+ error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last good
+ * record in the log and the tail is updated from that record and verified.
+ */
+STATIC int
+xlog_verify_head(
+ struct xlog *log,
+ xfs_daddr_t *head_blk, /* in/out: unverified head */
+ xfs_daddr_t *tail_blk, /* out: tail block */
+ struct xfs_buf *bp,
+ xfs_daddr_t *rhead_blk, /* start blk of last record */
+ struct xlog_rec_header **rhead, /* ptr to last record */
+ bool *wrapped) /* last rec. wraps phys. log */
+{
+ struct xlog_rec_header *tmp_rhead;
+ struct xfs_buf *tmp_bp;
+ xfs_daddr_t first_bad;
+ xfs_daddr_t tmp_rhead_blk;
+ int found;
+ int error;
+ bool tmp_wrapped;
+
+ /*
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
+ */
+ found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
+ rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (!found) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ return -EIO;
+ }
+
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+
+ /*
+ * Now that we have a tail block, check the head of the log for torn
+ * writes. Search again until we hit the tail or the maximum number of
+ * log record I/Os that could have been in flight at one time. Use a
+ * temporary buffer so we don't trash the rhead/bp pointer from the
+ * call above.
+ */
+ tmp_bp = xlog_get_bp(log, 1);
+ if (!tmp_bp)
+ return -ENOMEM;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+ XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+ &tmp_rhead, &tmp_wrapped);
+ xlog_put_bp(tmp_bp);
+ if (error < 0)
+ return error;
+
+ /*
+ * Now run a CRC verification pass over the records starting at the
+ * block found above to the current head. If a CRC failure occurs, the
+ * log block of the first bad record is saved in first_bad.
+ */
+ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ if (error == -EFSBADCRC) {
+ /*
+ * We've hit a potential torn write. Reset the error and warn
+ * about it.
+ */
+ error = 0;
+ xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+ first_bad, *head_blk);
+
+ /*
+ * Get the header block and buffer pointer for the last good
+ * record before the bad record.
+ *
+ * Note that xlog_find_tail() clears the blocks at the new head
+ * (i.e., the records with invalid CRC) if the cycle number
+ * matches the the current cycle.
+ */
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
+ rhead_blk, rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (found == 0) /* XXX: right thing to do here? */
+ return -EIO;
+
+ /*
+ * Reset the head block to the starting block of the first bad
+ * log record and set the tail block based on the last good
+ * record.
+ *
+ * Bail out if the updated head/tail match as this indicates
+ * possible corruption outside of the acceptable
+ * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+ */
+ *head_blk = first_bad;
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+ if (*head_blk == *tail_blk) {
+ ASSERT(0);
+ return 0;
+ }
+
+ /*
+ * Now verify the tail based on the updated head. This is
+ * required because the torn writes trimmed from the head could
+ * have been written over the tail of a previous record. Return
+ * any errors since recovery cannot proceed if the tail is
+ * corrupt.
+ *
+ * XXX: This leaves a gap in truly robust protection from torn
+ * writes in the log. If the head is behind the tail, the tail
+ * pushes forward to create some space and then a crash occurs
+ * causing the writes into the previous record's tail region to
+ * tear, log recovery isn't able to recover.
+ *
+ * How likely is this to occur? If possible, can we do something
+ * more intelligent here? Is it safe to push the tail forward if
+ * we can determine that the tail is within the range of the
+ * torn write (e.g., the kernel can only overwrite the tail if
+ * it has actually been pushed forward)? Alternatively, could we
+ * somehow prevent this condition at runtime?
+ */
+ error = xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
+
+ return error;
+}
+
+/*
* Find the sync block number or the tail of the log.
*
* This will be the block number of the last record to have its
@@ -893,13 +1241,13 @@ xlog_find_tail(
xlog_op_header_t *op_head;
char *offset = NULL;
xfs_buf_t *bp;
- int error, i, found;
+ int error;
xfs_daddr_t umount_data_blk;
xfs_daddr_t after_umount_blk;
+ xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
int hblks;
-
- found = 0;
+ bool wrapped = false;
/*
* Find previous log record
@@ -923,48 +1271,16 @@ xlog_find_tail(
}
/*
- * Search backwards looking for log record header block
+ * Trim the head block back to skip over torn records. We can have
+ * multiple log I/Os in flight at any time, so we assume CRC failures
+ * back through the previous several records are torn writes and skip
+ * them.
*/
ASSERT(*head_blk < INT_MAX);
- for (i = (int)(*head_blk) - 1; i >= 0; i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
- if (error)
- goto done;
-
- if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
- found = 1;
- break;
- }
- }
- /*
- * If we haven't found the log record header block, start looking
- * again from the end of the physical log. XXXmiken: There should be
- * a check here to make sure we didn't search more than N blocks in
- * the previous code.
- */
- if (!found) {
- for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
- if (error)
- goto done;
-
- if (*(__be32 *)offset ==
- cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
- found = 2;
- break;
- }
- }
- }
- if (!found) {
- xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- xlog_put_bp(bp);
- ASSERT(0);
- return -EIO;
- }
-
- /* find blk_no of tail of log */
- rhead = (xlog_rec_header_t *)offset;
- *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+ error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
+ &rhead, &wrapped);
+ if (error)
+ goto done;
/*
* Reset log values according to the state of the log when we
@@ -976,10 +1292,10 @@ xlog_find_tail(
* written was complete and ended exactly on the end boundary
* of the physical log.
*/
- log->l_prev_block = i;
+ log->l_prev_block = rhead_blk;
log->l_curr_block = (int)*head_blk;
log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
- if (found == 2)
+ if (wrapped)
log->l_curr_cycle++;
atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
@@ -1014,12 +1330,13 @@ xlog_find_tail(
} else {
hblks = 1;
}
- after_umount_blk = (i + hblks + (int)
- BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
+ after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+ after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
tail_lsn = atomic64_read(&log->l_tail_lsn);
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
- umount_data_blk = (i + hblks) % log->l_logBBsize;
+ umount_data_blk = rhead_blk + hblks;
+ umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
if (error)
goto done;
@@ -1895,15 +2212,25 @@ xlog_recover_get_buf_lsn(
*/
goto recover_immediately;
case XFS_SB_MAGIC:
+ /*
+ * superblock uuids are magic. We may or may not have a
+ * sb_meta_uuid on disk, but it will be set in the in-core
+ * superblock. We set the uuid pointer for verification
+ * according to the superblock feature mask to ensure we check
+ * the relevant UUID in the superblock.
+ */
lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb))
+ uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+ else
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
break;
default:
break;
}
if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
goto recover_immediately;
return lsn;
}
@@ -2933,16 +3260,16 @@ xlog_recover_efi_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
- xfs_mount_t *mp = log->l_mp;
- xfs_efi_log_item_t *efip;
- xfs_efi_log_format_t *efi_formatp;
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
efi_formatp = item->ri_buf[0].i_addr;
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
- if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
- &(efip->efi_format)))) {
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
xfs_efi_item_free(efip);
return error;
}
@@ -2950,20 +3277,23 @@ xlog_recover_efi_pass2(
spin_lock(&log->l_ailp->xa_lock);
/*
- * xfs_trans_ail_update() drops the AIL lock.
+ * The EFI has two references. One for the EFD and one for EFI to ensure
+ * it makes it into the AIL. Insert the EFI into the AIL directly and
+ * drop the EFI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
*/
xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
+ xfs_efi_release(efip);
return 0;
}
/*
- * This routine is called when an efd format structure is found in
- * a committed transaction in the log. It's purpose is to cancel
- * the corresponding efi if it was still in the log. To do this
- * it searches the AIL for the efi with an id equal to that in the
- * efd format structure. If we find it, we remove the efi from the
- * AIL and free it.
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
*/
STATIC int
xlog_recover_efd_pass2(
@@ -2985,8 +3315,8 @@ xlog_recover_efd_pass2(
efi_id = efd_formatp->efd_efi_id;
/*
- * Search for the efi with the id in the efd format structure
- * in the AIL.
+ * Search for the EFI with the id in the EFD format structure in the
+ * AIL.
*/
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
@@ -2995,18 +3325,18 @@ xlog_recover_efd_pass2(
efip = (xfs_efi_log_item_t *)lip;
if (efip->efi_format.efi_id == efi_id) {
/*
- * xfs_trans_ail_delete() drops the
- * AIL lock.
+ * Drop the EFD reference to the EFI. This
+ * removes the EFI from the AIL and frees it.
*/
- xfs_trans_ail_delete(ailp, lip,
- SHUTDOWN_CORRUPT_INCORE);
- xfs_efi_item_free(efip);
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
spin_lock(&ailp->xa_lock);
break;
}
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
@@ -3034,6 +3364,11 @@ xlog_recover_do_icreate_pass2(
unsigned int count;
unsigned int isize;
xfs_agblock_t length;
+ int blks_per_cluster;
+ int bb_per_cluster;
+ int cancel_count;
+ int nbufs;
+ int i;
icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
if (icl->icl_type != XFS_LI_ICREATE) {
@@ -3092,22 +3427,45 @@ xlog_recover_do_icreate_pass2(
}
/*
- * Inode buffers can be freed. Do not replay the inode initialisation as
- * we could be overwriting something written after this inode buffer was
- * cancelled.
+ * The icreate transaction can cover multiple cluster buffers and these
+ * buffers could have been freed and reused. Check the individual
+ * buffers for cancellation so we don't overwrite anything written after
+ * a cancellation.
+ */
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ nbufs = length / blks_per_cluster;
+ for (i = 0, cancel_count = 0; i < nbufs; i++) {
+ xfs_daddr_t daddr;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno,
+ agbno + i * blks_per_cluster);
+ if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
+ cancel_count++;
+ }
+
+ /*
+ * We currently only use icreate for a single allocation at a time. This
+ * means we should expect either all or none of the buffers to be
+ * cancelled. Be conservative and skip replay if at least one buffer is
+ * cancelled, but warn the user that something is awry if the buffers
+ * are not consistent.
*
- * XXX: we need to iterate all buffers and only init those that are not
- * cancelled. I think that a more fine grained factoring of
- * xfs_ialloc_inode_init may be appropriate here to enable this to be
- * done easily.
+ * XXX: This must be refined to only skip cancelled clusters once we use
+ * icreate for multiple chunk allocations.
*/
- if (xlog_check_buffer_cancelled(log,
- XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ ASSERT(!cancel_count || cancel_count == nbufs);
+ if (cancel_count) {
+ if (cancel_count != nbufs)
+ xfs_warn(mp,
+ "WARNING: partial inode chunk cancellation, skipped icreate.");
+ trace_xfs_log_recover_icreate_cancel(log, icl);
return 0;
+ }
- xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
- be32_to_cpu(icl->icl_gen));
- return 0;
+ trace_xfs_log_recover_icreate_recover(log, icl);
+ return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+ length, be32_to_cpu(icl->icl_gen));
}
STATIC void
@@ -3163,6 +3521,7 @@ xlog_recover_dquot_ra_pass2(
struct xfs_disk_dquot *recddq;
struct xfs_dq_logformat *dq_f;
uint type;
+ int len;
if (mp->m_qflags == 0)
@@ -3183,8 +3542,12 @@ xlog_recover_dquot_ra_pass2(
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+ len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
+ if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
+ &xfs_dquot_buf_ra_ops);
}
STATIC void
@@ -3385,14 +3748,24 @@ xlog_recover_add_to_cont_trans(
char *ptr, *old_ptr;
int old_len;
+ /*
+ * If the transaction is empty, the header was split across this and the
+ * previous record. Copy the rest of the header.
+ */
if (list_empty(&trans->r_itemq)) {
- /* finish copying rest of trans header */
+ ASSERT(len <= sizeof(struct xfs_trans_header));
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ return -EIO;
+ }
+
xlog_recover_add_item(&trans->r_itemq);
ptr = (char *)&trans->r_theader +
- sizeof(xfs_trans_header_t) - len;
+ sizeof(struct xfs_trans_header) - len;
memcpy(ptr, dp, len);
return 0;
}
+
/* take the tail entry */
item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
@@ -3441,7 +3814,19 @@ xlog_recover_add_to_trans(
ASSERT(0);
return -EIO;
}
- if (len == sizeof(xfs_trans_header_t))
+
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * The transaction header can be arbitrarily split across op
+ * records. If we don't have the whole thing here, copy what we
+ * do have and handle the rest in the next record.
+ */
+ if (len == sizeof(struct xfs_trans_header))
xlog_recover_add_item(&trans->r_itemq);
memcpy(&trans->r_theader, dp, len);
return 0;
@@ -3744,7 +4129,7 @@ xlog_recover_process_efi(
* free the memory associated with it.
*/
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- xfs_efi_release(efip, efip->efi_format.efi_nextents);
+ xfs_efi_release(efip);
return -EIO;
}
}
@@ -3757,11 +4142,11 @@ xlog_recover_process_efi(
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &(efip->efi_format.efi_extents[i]);
- error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+ error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+ extp->ext_len);
if (error)
goto abort_error;
- xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
- extp->ext_len);
+
}
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
@@ -3793,10 +4178,10 @@ abort_error:
*/
STATIC int
xlog_recover_process_efis(
- struct xlog *log)
+ struct xlog *log)
{
- xfs_log_item_t *lip;
- xfs_efi_log_item_t *efip;
+ struct xfs_log_item *lip;
+ struct xfs_efi_log_item *efip;
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
@@ -3820,7 +4205,7 @@ xlog_recover_process_efis(
/*
* Skip EFIs that we've already processed.
*/
- efip = (xfs_efi_log_item_t *)lip;
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
lip = xfs_trans_ail_cursor_next(ailp, &cur);
continue;
@@ -3840,6 +4225,50 @@ out:
}
/*
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending EFIs so they don't pin the AIL.
+ */
+STATIC int
+xlog_recover_cancel_efis(
+ struct xlog *log)
+{
+ struct xfs_log_item *lip;
+ struct xfs_efi_log_item *efip;
+ int error = 0;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp;
+
+ ailp = log->l_ailp;
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ /*
+ * We're done when we see something other than an EFI.
+ * There should be no EFIs left in the AIL now.
+ */
+ if (lip->li_type != XFS_LI_EFI) {
+#ifdef DEBUG
+ for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+ ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
+ break;
+ }
+
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
+ spin_lock(&ailp->xa_lock);
+
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+ return error;
+}
+
+/*
* This routine performs a transaction to null out a bad inode pointer
* in an agi unlinked inode hash bucket.
*/
@@ -4011,26 +4440,69 @@ xlog_recover_process_iunlinks(
mp->m_dmevmask = mp_dmevmask;
}
+STATIC int
+xlog_unpack_data(
+ struct xlog_rec_header *rhead,
+ char *dp,
+ struct xlog *log)
+{
+ int i, j, k;
+
+ for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+ i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+ *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+ for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+ dp += BBSIZE;
+ }
+ }
+
+ return 0;
+}
+
/*
- * Upack the log buffer data and crc check it. If the check fails, issue a
- * warning if and only if the CRC in the header is non-zero. This makes the
- * check an advisory warning, and the zero CRC check will prevent failure
- * warnings from being emitted when upgrading the kernel from one that does not
- * add CRCs by default.
- *
- * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
- * corruption failure
+ * CRC check, unpack and process a log record.
*/
STATIC int
-xlog_unpack_data_crc(
+xlog_recover_process(
+ struct xlog *log,
+ struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
- struct xlog *log)
+ int pass)
{
+ int error;
__le32 crc;
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
- if (crc != rhead->h_crc) {
+
+ /*
+ * Nothing else to do if this is a CRC verification pass. Just return
+ * if this a record with a non-zero crc. Unfortunately, mkfs always
+ * sets h_crc to 0 so we must consider this valid even on v5 supers.
+ * Otherwise, return EFSBADCRC on failure so the callers up the stack
+ * know precisely what failed.
+ */
+ if (pass == XLOG_RECOVER_CRCPASS) {
+ if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc))
+ return -EFSBADCRC;
+ return 0;
+ }
+
+ /*
+ * We're in the normal recovery path. Issue a warning if and only if the
+ * CRC in the header is non-zero. This is an advisory warning and the
+ * zero CRC check prevents warnings from being emitted when upgrading
+ * the kernel from one that does not add CRCs by default.
+ */
+ if (crc != le32_to_cpu(rhead->h_crc)) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
@@ -4040,47 +4512,18 @@ xlog_unpack_data_crc(
}
/*
- * If we've detected a log record corruption, then we can't
- * recover past this point. Abort recovery if we are enforcing
- * CRC protection by punting an error back up the stack.
+ * If the filesystem is CRC enabled, this mismatch becomes a
+ * fatal log corruption failure.
*/
if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
return -EFSCORRUPTED;
}
- return 0;
-}
-
-STATIC int
-xlog_unpack_data(
- struct xlog_rec_header *rhead,
- char *dp,
- struct xlog *log)
-{
- int i, j, k;
- int error;
-
- error = xlog_unpack_data_crc(rhead, dp, log);
+ error = xlog_unpack_data(rhead, dp, log);
if (error)
return error;
- for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
- for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
- dp += BBSIZE;
- }
- }
-
- return 0;
+ return xlog_recover_process_data(log, rhash, rhead, dp, pass);
}
STATIC int
@@ -4132,18 +4575,21 @@ xlog_do_recovery_pass(
struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
- int pass)
+ int pass,
+ xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
+ xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
- int error = 0, h_size;
+ int error = 0, h_size, h_len;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
struct hlist_head rhash[XLOG_RHASH_SIZE];
ASSERT(head_blk != tail_blk);
+ rhead_blk = 0;
/*
* Read the header of the tail block and get the iclog buffer size from
@@ -4167,7 +4613,31 @@ xlog_do_recovery_pass(
error = xlog_valid_rec_header(log, rhead, tail_blk);
if (error)
goto bread_err1;
+
+ /*
+ * xfsprogs has a bug where record length is based on lsunit but
+ * h_size (iclog size) is hardcoded to 32k. Now that we
+ * unconditionally CRC verify the unmount record, this means the
+ * log buffer can be too small for the record and cause an
+ * overrun.
+ *
+ * Detect this condition here. Use lsunit for the buffer size as
+ * long as this looks like the mkfs case. Otherwise, return an
+ * error to avoid a buffer overrun.
+ */
h_size = be32_to_cpu(rhead->h_size);
+ h_len = be32_to_cpu(rhead->h_len);
+ if (h_len > h_size) {
+ if (h_len <= log->l_mp->m_logbsize &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ xfs_warn(log->l_mp,
+ "invalid iclog size (%d bytes), using lsunit (%d bytes)",
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
+ } else
+ return -EFSCORRUPTED;
+ }
+
if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
(h_size > XLOG_HEADER_CYCLE_SIZE)) {
hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
@@ -4194,7 +4664,7 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
- blk_no = tail_blk;
+ blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -4301,19 +4771,18 @@ xlog_do_recovery_pass(
goto bread_err2;
}
- error = xlog_unpack_data(rhead, offset, log);
+ error = xlog_recover_process(log, rhash, rhead, offset,
+ pass);
if (error)
goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
blk_no += bblks;
+ rhead_blk = blk_no;
}
ASSERT(blk_no >= log->l_logBBsize);
blk_no -= log->l_logBBsize;
+ rhead_blk = blk_no;
}
/* read first part of physical log */
@@ -4334,21 +4803,22 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = xlog_unpack_data(rhead, offset, log);
+ error = xlog_recover_process(log, rhash, rhead, offset, pass);
if (error)
goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
blk_no += bblks + hblks;
+ rhead_blk = blk_no;
}
bread_err2:
xlog_put_bp(dbp);
bread_err1:
xlog_put_bp(hbp);
+
+ if (error && first_bad)
+ *first_bad = rhead_blk;
+
return error;
}
@@ -4386,7 +4856,7 @@ xlog_do_log_recovery(
INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
- XLOG_RECOVER_PASS1);
+ XLOG_RECOVER_PASS1, NULL);
if (error != 0) {
kmem_free(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
@@ -4397,7 +4867,7 @@ xlog_do_log_recovery(
* When it is complete free the table of buf cancel items.
*/
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
- XLOG_RECOVER_PASS2);
+ XLOG_RECOVER_PASS2, NULL);
#ifdef DEBUG
if (!error) {
int i;
@@ -4502,9 +4972,19 @@ xlog_recover(
int error;
/* find the tail of the log */
- if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+ error = xlog_find_tail(log, &head_blk, &tail_blk);
+ if (error)
return error;
+ /*
+ * The superblock was read before the log was available and thus the LSN
+ * could not be verified. Check the superblock LSN against the current
+ * LSN now that it's known.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+ return -EINVAL;
+
if (tail_blk != head_blk) {
/* There used to be a comment here:
*
@@ -4532,11 +5012,13 @@ xlog_recover(
xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
xfs_warn(log->l_mp,
-"Superblock has unknown incompatible log features (0x%x) enabled.\n"
-"The log can not be fully and/or safely recovered by this kernel.\n"
-"Please recover the log on a kernel that supports the unknown features.",
+"Superblock has unknown incompatible log features (0x%x) enabled.",
(log->l_mp->m_sb.sb_features_log_incompat &
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ xfs_warn(log->l_mp,
+"The log can not be fully and/or safely recovered by this kernel.");
+ xfs_warn(log->l_mp,
+"Please recover the log on a kernel that supports the unknown features.");
return -EINVAL;
}
@@ -4612,6 +5094,17 @@ xlog_recover_finish(
return 0;
}
+int
+xlog_recover_cancel(
+ struct xlog *log)
+{
+ int error = 0;
+
+ if (log->l_flags & XLOG_RECOVERY_NEEDED)
+ error = xlog_recover_cancel_efis(log);
+
+ return error;
+}
#if defined(DEBUG)
/*
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index d8b6754..11792d8 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,6 +17,7 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_error.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
+ int level; \
\
va_start(args, fmt); \
\
@@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
\
__xfs_printk(kern_level, mp, &vaf); \
va_end(args); \
+ \
+ if (!kstrtoint(kern_level, 0, &level) && \
+ level <= LOGLEVEL_ERR && \
+ xfs_error_level >= XFS_ERRLEVEL_HIGH) \
+ xfs_stack_trace(); \
} \
define_xfs_printk_level(xfs_emerg, KERN_EMERG);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 461e791..bb753b3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
+void
+xfs_uuid_table_free(void)
+{
+ if (xfs_uuid_table_size == 0)
+ return;
+ kmem_free(xfs_uuid_table);
+ xfs_uuid_table = NULL;
+ xfs_uuid_table_size = 0;
+}
+
/*
* See if the UUID is unique among mounted XFS filesystems.
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -615,14 +625,14 @@ xfs_default_resblks(xfs_mount_t *mp)
*/
int
xfs_mountfs(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
- xfs_inode_t *rip;
- __uint64_t resblks;
- uint quotamount = 0;
- uint quotaflags = 0;
- int error = 0;
+ struct xfs_sb *sbp = &(mp->m_sb);
+ struct xfs_inode *rip;
+ __uint64_t resblks;
+ uint quotamount = 0;
+ uint quotaflags = 0;
+ int error = 0;
xfs_sb_mount_common(mp, sbp);
@@ -693,10 +703,15 @@ xfs_mountfs(
if (error)
goto out;
- error = xfs_uuid_mount(mp);
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_del_stats;
+
/*
* Set the minimum read and write sizes
*/
@@ -799,7 +814,9 @@ xfs_mountfs(
}
/*
- * log's mount-time initialization. Perform 1st part recovery if needed
+ * Log's mount-time initialization. The first part of recovery can place
+ * some items on the AIL, to be handled when recovery is finished or
+ * cancelled.
*/
error = xfs_log_mount(mp, mp->m_logdev_targp,
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
@@ -910,9 +927,9 @@ xfs_mountfs(
}
/*
- * Finish recovering the file system. This part needed to be
- * delayed until after the root and real-time bitmap inodes
- * were consistently read in.
+ * Finish recovering the file system. This part needed to be delayed
+ * until after the root and real-time bitmap inodes were consistently
+ * read in.
*/
error = xfs_log_mount_finish(mp);
if (error) {
@@ -955,8 +972,10 @@ xfs_mountfs(
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
- xfs_log_unmount(mp);
+ xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_wait_buftarg(mp->m_logdev_targp);
@@ -967,6 +986,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_del_stats:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
@@ -1043,6 +1064,7 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
+
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1052,6 +1074,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91..b570984 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -127,6 +127,7 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_data_workqueue;
@@ -312,6 +313,7 @@ typedef struct xfs_perag {
int pagb_count; /* pagb slots in use */
} xfs_perag_t;
+extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
@@ -336,4 +338,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ab4a606..dc62219 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -181,6 +181,11 @@ xfs_fs_map_blocks(
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * xfs_iomap_write_direct() expects to take ownership of
+ * the shared ilock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
error = xfs_iomap_write_direct(ip, offset, length,
&imap, nimaps);
if (error)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eac9549..532ab79 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -184,7 +184,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(mp, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
@@ -448,11 +448,11 @@ xfs_qm_dquot_isolate(
*/
if (dqp->q_nrefs) {
xfs_dqunlock(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
list_lru_isolate(lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -496,19 +496,19 @@ xfs_qm_dquot_isolate(
ASSERT(dqp->q_nrefs == 0);
list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
return LRU_SKIP;
out_unlock_dirty:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
xfs_dqunlock(dqp);
spin_lock(lru_lock);
return LRU_RETRY;
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
unsigned long freed;
int error;
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
return 0;
INIT_LIST_HEAD(&isol.buffers);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f4e8c06..be02a68 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -757,31 +757,29 @@ xfs_rtallocate_extent_size(
/*
* Allocate space to the bitmap or summary file, and zero it, for growfs.
*/
-STATIC int /* error */
+STATIC int
xfs_growfs_rt_alloc(
- xfs_mount_t *mp, /* file system mount point */
- xfs_extlen_t oblocks, /* old count of blocks */
- xfs_extlen_t nblocks, /* new count of blocks */
- xfs_inode_t *ip) /* inode (bitmap/summary) */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ struct xfs_inode *ip) /* inode (bitmap/summary) */
{
- xfs_fileoff_t bno; /* block number in file */
- xfs_buf_t *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
- xfs_daddr_t d; /* disk block address */
- int error; /* error return value */
- xfs_fsblock_t firstblock; /* first block allocated in xaction */
- xfs_bmap_free_t flist; /* list of freed blocks */
- xfs_fsblock_t fsbno; /* filesystem block for bno */
- xfs_bmbt_irec_t map; /* block map output */
- int nmap; /* number of block maps */
- int resblks; /* space reservation */
+ xfs_fileoff_t bno; /* block number in file */
+ struct xfs_buf *bp; /* temporary buffer for zeroing */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t firstblock;/* first block allocated in xaction */
+ struct xfs_bmap_free flist; /* list of freed blocks */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ struct xfs_bmbt_irec map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
+ struct xfs_trans *tp;
/*
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- xfs_trans_t *tp;
-
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
@@ -790,7 +788,7 @@ xfs_growfs_rt_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
resblks, 0);
if (error)
- goto error_cancel;
+ goto out_trans_cancel;
/*
* Lock the inode.
*/
@@ -808,16 +806,16 @@ xfs_growfs_rt_alloc(
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
- goto error_cancel;
+ goto out_bmap_cancel;
/*
* Free any blocks freed up in the transaction, then commit.
*/
- error = xfs_bmap_finish(&tp, &flist, &committed);
+ error = xfs_bmap_finish(&tp, &flist, NULL);
if (error)
- goto error_cancel;
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp);
if (error)
- goto error;
+ return error;
/*
* Now we need to clear the allocated blocks.
* Do this one block per transaction, to keep it simple.
@@ -832,7 +830,7 @@ xfs_growfs_rt_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
0, 0);
if (error)
- goto error_cancel;
+ goto out_trans_cancel;
/*
* Lock the bitmap inode.
*/
@@ -846,9 +844,7 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0);
if (bp == NULL) {
error = -EIO;
-error_cancel:
- xfs_trans_cancel(tp);
- goto error;
+ goto out_trans_cancel;
}
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -857,16 +853,20 @@ error_cancel:
*/
error = xfs_trans_commit(tp);
if (error)
- goto error;
+ return error;
}
/*
* Go on to the next extent, if any.
*/
oblocks = map.br_startoff + map.br_blockcount;
}
+
return 0;
-error:
+out_bmap_cancel:
+ xfs_bmap_cancel(&flist);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f224038..8686df6 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -18,20 +18,21 @@
#include "xfs.h"
#include <linux/proc_fs.h>
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
+struct xstats xfsstats;
-static int counter_val(int idx)
+static int counter_val(struct xfsstats __percpu *stats, int idx)
{
int val = 0, cpu;
for_each_possible_cpu(cpu)
- val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
return val;
}
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{
int i, j;
+ int len = 0;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -65,54 +66,59 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
};
/* Loop over all stats groups */
+
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- seq_printf(m, "%s", xstats[i].desc);
+ len += snprintf(buf + len, PATH_MAX - len, "%s",
+ xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- seq_printf(m, " %u", counter_val(j));
- seq_putc(m, '\n');
+ len += snprintf(buf + len, PATH_MAX - len, " %u",
+ counter_val(stats, j));
+ len += snprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
- xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
- xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
}
- seq_printf(m, "xpc %Lu %Lu %Lu\n",
+ len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- seq_printf(m, "debug %u\n",
+ len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
- return 0;
+
+ return len;
}
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
{
- return single_open(file, xfs_stat_proc_show, NULL);
+ int c;
+ __uint32_t vn_active;
+
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu_ptr(stats, c)->vn_active;
+ memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+ per_cpu_ptr(stats, c)->vn_active = vn_active;
+ preempt_enable();
+ }
}
-static const struct file_operations xfs_stat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xfs_stat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- 0,
- counter_val(XFSSTAT_END_XQMSTAT),
- 0,
- counter_val(XFSSTAT_END_XQMSTAT + 1));
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
return 0;
}
@@ -136,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
seq_printf(m, "qm");
for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
- seq_printf(m, " %u", counter_val(j));
+ seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
}
@@ -155,44 +161,35 @@ static const struct file_operations xqmstat_proc_fops = {
};
#endif /* CONFIG_XFS_QUOTA */
+#ifdef CONFIG_PROC_FS
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
+ return -ENOMEM;
+
+ if (!proc_symlink("fs/xfs/stat", NULL,
+ "/sys/fs/xfs/stats/stats"))
goto out;
- if (!proc_create("fs/xfs/stat", 0, NULL,
- &xfs_stat_proc_fops))
- goto out_remove_xfs_dir;
#ifdef CONFIG_XFS_QUOTA
if (!proc_create("fs/xfs/xqmstat", 0, NULL,
&xqmstat_proc_fops))
- goto out_remove_stat_file;
+ goto out;
if (!proc_create("fs/xfs/xqm", 0, NULL,
&xqm_proc_fops))
- goto out_remove_xqmstat_file;
+ goto out;
#endif
return 0;
-#ifdef CONFIG_XFS_QUOTA
- out_remove_xqmstat_file:
- remove_proc_entry("fs/xfs/xqmstat", NULL);
- out_remove_stat_file:
- remove_proc_entry("fs/xfs/stat", NULL);
-#endif
- out_remove_xfs_dir:
- remove_proc_entry("fs/xfs", NULL);
- out:
+out:
+ remove_proc_subtree("fs/xfs", NULL);
return -ENOMEM;
}
void
xfs_cleanup_procfs(void)
{
-#ifdef CONFIG_XFS_QUOTA
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-#endif
- remove_proc_entry("fs/xfs/stat", NULL);
- remove_proc_entry("fs/xfs", NULL);
+ remove_proc_subtree("fs/xfs", NULL);
}
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c8f238b..483b0ef 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -19,8 +19,6 @@
#define __XFS_STATS_H__
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
#include <linux/percpu.h>
/*
@@ -215,15 +213,29 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
+#define XFS_STATS_INC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+} while (0)
+
+#define XFS_STATS_DEC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+} while (0)
+
+#define XFS_STATS_ADD(mp, v, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+} while (0)
+
+#if defined(CONFIG_PROC_FS)
extern int xfs_init_procfs(void);
extern void xfs_cleanup_procfs(void);
@@ -231,10 +243,6 @@ extern void xfs_cleanup_procfs(void);
#else /* !CONFIG_PROC_FS */
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
static inline int xfs_init_procfs(void)
{
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1fb16562..59c9b7b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -137,7 +137,7 @@ static const match_table_t tokens = {
};
-STATIC unsigned long
+STATIC int
suffix_kstrtoint(char *s, unsigned int base, int *res)
{
int last, shift_left_factor = 0, _res;
@@ -261,16 +261,8 @@ xfs_parseargs(
mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
if (!mp->m_rtname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &iosize))
- return -EINVAL;
- iosizelog = ffs(iosize) - 1;
- } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+ } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
+ !strcmp(this_char, MNTOPT_BIOSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
@@ -511,9 +503,9 @@ xfs_showargs(
seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
if (mp->m_logname)
- seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+ seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
if (mp->m_rtname)
- seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+ seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
if (mp->m_dalign > 0)
seq_printf(m, "," MNTOPT_SUNIT "=%d",
@@ -846,17 +838,18 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
+ mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -930,7 +923,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(vn_reclaim);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -991,8 +984,8 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
xfs_inactive(ip);
}
@@ -1482,9 +1475,16 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_workqueues;
+ /* Allocate stats memory before we do operations that might use it */
+ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!mp->m_stats.xs_stats) {
+ error = -ENOMEM;
+ goto out_destroy_counters;
+ }
+
error = xfs_readsb(mp, flags);
if (error)
- goto out_destroy_counters;
+ goto out_free_stats;
error = xfs_finish_flags(mp);
if (error)
@@ -1528,6 +1528,10 @@ xfs_fs_fill_super(
}
}
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ xfs_alert(mp,
+ "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1549,9 +1553,11 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_stats:
+ free_percpu(mp->m_stats.xs_stats);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
-out_destroy_workqueues:
+ out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
@@ -1578,6 +1584,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1707,8 +1714,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+ KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
@@ -1842,19 +1849,32 @@ init_xfs_fs(void)
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;;
+ goto out_sysctl_unregister;
}
+ xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+ xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!xfsstats.xs_stats) {
+ error = -ENOMEM;
+ goto out_kset_unregister;
+ }
+
+ error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
+ "stats");
+ if (error)
+ goto out_free_stats;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_kset_unregister;
+ goto out_remove_stats_kobj;
#endif
error = xfs_qm_init();
if (error)
- goto out_remove_kobj;
+ goto out_remove_dbg_kobj;
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1863,11 +1883,15 @@ init_xfs_fs(void)
out_qm_exit:
xfs_qm_exit();
- out_remove_kobj:
+ out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+ out_remove_stats_kobj:
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+ free_percpu(xfsstats.xs_stats);
+ out_kset_unregister:
kset_unregister(xfs_kset);
out_sysctl_unregister:
xfs_sysctl_unregister();
@@ -1893,6 +1917,8 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
@@ -1900,6 +1926,7 @@ exit_xfs_fs(void)
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
+ xfs_uuid_table_free();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 4be27b0..b44284c 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- int committed;
xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
int nmaps;
@@ -240,7 +239,8 @@ xfs_symlink(
if (error)
goto out_trans_cancel;
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+ XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
/*
@@ -288,7 +288,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
/*
@@ -386,7 +386,7 @@ xfs_symlink(
xfs_trans_set_sync(tp);
}
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -421,7 +421,7 @@ out_release_inode:
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return error;
}
@@ -433,7 +433,6 @@ xfs_inactive_symlink_rmt(
struct xfs_inode *ip)
{
xfs_buf_t *bp;
- int committed;
int done;
int error;
xfs_fsblock_t first_block;
@@ -501,7 +500,7 @@ xfs_inactive_symlink_rmt(
/*
* Unmap the dead block(s) to the free_list.
*/
- error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
&first_block, &free_list, &done);
if (error)
goto error_bmap_cancel;
@@ -509,16 +508,10 @@ xfs_inactive_symlink_rmt(
/*
* Commit the first transaction. This logs the EFI and the inode.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, ip);
if (error)
goto error_bmap_cancel;
/*
- * The transaction must have been committed, since there were
- * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
- * The new tp has the extent freeing and EFDs.
- */
- ASSERT(committed);
- /*
* The first xact was committed, so add the inode to the new one.
* Mark it dirty so it will be logged and moved forward in the log as
* part of every commit.
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index a0c8067..aed74d3 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "xfs_error.h"
+#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
@@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler(
size_t *lenp,
loff_t *ppos)
{
- int c, ret, *valp = ctl->data;
- __uint32_t vn_active;
+ int ret, *valp = ctl->data;
ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
- xfs_notice(NULL, "Clearing xfsstats");
- for_each_possible_cpu(c) {
- preempt_disable();
- /* save vn_active, it's a universal truth! */
- vn_active = per_cpu(xfsstats, c).vn_active;
- memset(&per_cpu(xfsstats, c), 0,
- sizeof(struct xfsstats));
- per_cpu(xfsstats, c).vn_active = vn_active;
- preempt_enable();
- }
+ xfs_stats_clearall(xfsstats.xs_stats);
xfs_stats_clear = 0;
}
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index aa03670..641d625 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -21,11 +21,13 @@
#include "xfs_log_format.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_stats.h"
struct xfs_sysfs_attr {
struct attribute attr;
- ssize_t (*show)(char *buf, void *data);
- ssize_t (*store)(const char *buf, size_t count, void *data);
+ ssize_t (*show)(struct kobject *kobject, char *buf);
+ ssize_t (*store)(struct kobject *kobject, const char *buf,
+ size_t count);
};
static inline struct xfs_sysfs_attr *
@@ -38,6 +40,8 @@ to_attr(struct attribute *attr)
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
#define XFS_SYSFS_ATTR_RO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
@@ -51,14 +55,42 @@ struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
};
+STATIC ssize_t
+xfs_sysfs_object_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+ .show = xfs_sysfs_object_show,
+ .store = xfs_sysfs_object_store,
+};
+
#ifdef DEBUG
/* debug */
STATIC ssize_t
log_recovery_delay_store(
+ struct kobject *kobject,
const char *buf,
- size_t count,
- void *data)
+ size_t count)
{
int ret;
int val;
@@ -77,8 +109,8 @@ log_recovery_delay_store(
STATIC ssize_t
log_recovery_delay_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
}
@@ -89,52 +121,87 @@ static struct attribute *xfs_dbg_attrs[] = {
NULL,
};
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* stats */
+
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xstats, xs_kobj);
+}
+
STATIC ssize_t
-xfs_dbg_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+stats_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xstats *stats = to_xstats(kobject);
- return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+ return xfs_stats_format(stats->xs_stats, buf);
}
+XFS_SYSFS_ATTR_RO(stats);
STATIC ssize_t
-xfs_dbg_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+stats_clear_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ int ret;
+ int val;
+ struct xstats *stats = to_xstats(kobject);
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
- return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+ if (val != 1)
+ return -EINVAL;
+
+ xfs_stats_clearall(stats->xs_stats);
+ return count;
}
+XFS_SYSFS_ATTR_WO(stats_clear);
-static struct sysfs_ops xfs_dbg_ops = {
- .show = xfs_dbg_show,
- .store = xfs_dbg_store,
+static struct attribute *xfs_stats_attrs[] = {
+ ATTR_LIST(stats),
+ ATTR_LIST(stats_clear),
+ NULL,
};
-struct kobj_type xfs_dbg_ktype = {
+struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_dbg_ops,
- .default_attrs = xfs_dbg_attrs,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_stats_attrs,
};
-#endif /* DEBUG */
-
/* xlog */
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
STATIC ssize_t
log_head_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
spin_lock(&log->l_icloglock);
cycle = log->l_curr_cycle;
@@ -147,12 +214,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn);
STATIC ssize_t
log_tail_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
@@ -161,12 +228,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
reserve_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
+
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -175,65 +243,64 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head);
STATIC ssize_t
write_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
-static struct attribute *xfs_log_attrs[] = {
- ATTR_LIST(log_head_lsn),
- ATTR_LIST(log_tail_lsn),
- ATTR_LIST(reserve_grant_head),
- ATTR_LIST(write_grant_head),
- NULL,
-};
-
-static inline struct xlog *
-to_xlog(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
- return container_of(kobj, struct xlog, l_kobj);
-}
-
+#ifdef DEBUG
STATIC ssize_t
-xfs_log_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+log_badcrc_factor_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xlog *log = to_xlog(kobject);
+ int ret;
+ uint32_t val;
- return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ log->l_badcrc_factor = val;
+
+ return count;
}
STATIC ssize_t
-xfs_log_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+log_badcrc_factor_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xlog *log = to_xlog(kobject);
- return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor);
}
-static struct sysfs_ops xfs_log_ops = {
- .show = xfs_log_show,
- .store = xfs_log_store,
+XFS_SYSFS_ATTR_RW(log_badcrc_factor);
+#endif /* DEBUG */
+
+static struct attribute *xfs_log_attrs[] = {
+ ATTR_LIST(log_head_lsn),
+ ATTR_LIST(log_tail_lsn),
+ ATTR_LIST(reserve_grant_head),
+ ATTR_LIST(write_grant_head),
+#ifdef DEBUG
+ ATTR_LIST(log_badcrc_factor),
+#endif
+ NULL,
};
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_log_ops,
+ .sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 240eee3..be692e5 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -22,6 +22,7 @@
extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern struct kobj_type xfs_dbg_ktype; /* debug */
extern struct kobj_type xfs_log_ktype; /* xlog */
+extern struct kobj_type xfs_stats_ktype; /* stats */
static inline struct xfs_kobj *
to_kobj(struct kobject *kobject)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8d916d3..391d797 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,7 +687,9 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
@@ -1220,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
+DECLARE_EVENT_CLASS(xfs_readpage_class,
+ TP_PROTO(struct inode *inode, int nr_pages),
+ TP_ARGS(inode, nr_pages),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, nr_pages)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->nr_pages = nr_pages;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name) \
+DEFINE_EVENT(xfs_readpage_class, name, \
+ TP_PROTO(struct inode *inode, int nr_pages), \
+ TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(xfs_vm_readpage);
+DEFINE_READPAGE_EVENT(xfs_vm_readpages);
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int type, struct xfs_bmbt_irec *irec),
@@ -1311,6 +1339,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -2089,6 +2118,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, count)
+ __field(unsigned int, isize)
+ __field(xfs_agblock_t, length)
+ __field(unsigned int, gen)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->agno = be32_to_cpu(in_f->icl_ag);
+ __entry->agbno = be32_to_cpu(in_f->icl_agbno);
+ __entry->count = be32_to_cpu(in_f->icl_count);
+ __entry->isize = be32_to_cpu(in_f->icl_isize);
+ __entry->length = be32_to_cpu(in_f->icl_length);
+ __entry->gen = be32_to_cpu(in_f->icl_gen);
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
+ "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agbno, __entry->count, __entry->isize,
+ __entry->length, __entry->gen)
+)
+#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
+
DECLARE_EVENT_CLASS(xfs_discard_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0582a27..748b16a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -930,9 +930,9 @@ __xfs_trans_commit(
*/
if (sync) {
error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
- XFS_STATS_INC(xs_trans_sync);
+ XFS_STATS_INC(mp, xs_trans_sync);
} else {
- XFS_STATS_INC(xs_trans_async);
+ XFS_STATS_INC(mp, xs_trans_async);
}
return error;
@@ -955,7 +955,7 @@ out_unreserve:
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
- XFS_STATS_INC(xs_trans_empty);
+ XFS_STATS_INC(mp, xs_trans_empty);
return error;
}
@@ -1019,9 +1019,10 @@ xfs_trans_cancel(
* chunk we've been working on and get a new transaction to continue.
*/
int
-xfs_trans_roll(
+__xfs_trans_roll(
struct xfs_trans **tpp,
- struct xfs_inode *dp)
+ struct xfs_inode *dp,
+ int *committed)
{
struct xfs_trans *trans;
struct xfs_trans_res tres;
@@ -1052,6 +1053,7 @@ xfs_trans_roll(
if (error)
return error;
+ *committed = 1;
trans = *tpp;
/*
@@ -1074,3 +1076,12 @@ xfs_trans_roll(
xfs_trans_ijoin(trans, dp, 0);
return 0;
}
+
+int
+xfs_trans_roll(
+ struct xfs_trans **tpp,
+ struct xfs_inode *dp)
+{
+ int committed = 0;
+ return __xfs_trans_roll(tpp, dp, &committed);
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3b21b4e..4643070 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -213,7 +213,6 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
-void xfs_efi_release(struct xfs_efi_log_item *, uint);
void xfs_trans_log_efi_extent(xfs_trans_t *,
struct xfs_efi_log_item *,
xfs_fsblock_t,
@@ -221,11 +220,11 @@ void xfs_trans_log_efi_extent(xfs_trans_t *,
struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
struct xfs_efi_log_item *,
uint);
-void xfs_trans_log_efd_extent(xfs_trans_t *,
- struct xfs_efd_log_item *,
- xfs_fsblock_t,
- xfs_extlen_t);
+int xfs_trans_free_extent(struct xfs_trans *,
+ struct xfs_efd_log_item *, xfs_fsblock_t,
+ xfs_extlen_t);
int xfs_trans_commit(struct xfs_trans *);
+int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098cf4..aa67339 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -349,7 +349,7 @@ xfsaild_push(
xfs_ail_min_lsn(ailp))) {
ailp->xa_log_flush = 0;
- XFS_STATS_INC(xs_push_ail_flush);
+ XFS_STATS_INC(mp, xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
}
@@ -371,7 +371,7 @@ xfsaild_push(
goto out_done;
}
- XFS_STATS_INC(xs_push_ail);
+ XFS_STATS_INC(mp, xs_push_ail);
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
@@ -385,7 +385,7 @@ xfsaild_push(
lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
- XFS_STATS_INC(xs_push_ail_success);
+ XFS_STATS_INC(mp, xs_push_ail_success);
trace_xfs_ail_push(lip);
ailp->xa_last_pushed_lsn = lsn;
@@ -403,7 +403,7 @@ xfsaild_push(
* re-try the flushing relatively soon if most of the
* AIL is beeing flushed.
*/
- XFS_STATS_INC(xs_push_ail_flushing);
+ XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
flushing++;
@@ -411,14 +411,14 @@ xfsaild_push(
break;
case XFS_ITEM_PINNED:
- XFS_STATS_INC(xs_push_ail_pinned);
+ XFS_STATS_INC(mp, xs_push_ail_pinned);
trace_xfs_ail_pinned(lip);
stuck++;
ailp->xa_log_flush++;
break;
case XFS_ITEM_LOCKED:
- XFS_STATS_INC(xs_push_ail_locked);
+ XFS_STATS_INC(mp, xs_push_ail_locked);
trace_xfs_ail_locked(lip);
stuck++;
@@ -497,6 +497,7 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
+ set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index ce78534..9951701 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -572,12 +572,16 @@ xfs_quota_warn(
struct xfs_dquot *dqp,
int type)
{
- /* no warnings for project quotas - we just return ENOSPC later */
+ enum quota_type qtype;
+
if (dqp->dq_flags & XFS_DQ_PROJ)
- return;
- quota_send_warning(make_kqid(&init_user_ns,
- (dqp->dq_flags & XFS_DQ_USER) ?
- USRQUOTA : GRPQUOTA,
+ qtype = PRJQUOTA;
+ else if (dqp->dq_flags & XFS_DQ_USER)
+ qtype = USRQUOTA;
+ else
+ qtype = GRPQUOTA;
+
+ quota_send_warning(make_kqid(&init_user_ns, qtype,
be32_to_cpu(dqp->q_core.d_id)),
mp->m_super->s_dev, type);
}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 284397d..a96ae54 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -25,6 +25,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
/*
* This routine is called to allocate an "extent free intention"
@@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp,
}
/*
- * This routine is called to indicate that the described
- * extent is to be logged as having been freed. It should
- * be called once for each extent freed.
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
*/
-void
-xfs_trans_log_efd_extent(xfs_trans_t *tp,
- xfs_efd_log_item_t *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len)
+int
+xfs_trans_free_extent(
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len)
{
uint next_extent;
- xfs_extent_t *extp;
+ struct xfs_extent *extp;
+ int error;
+ error = xfs_free_extent(tp, start_block, ext_len);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
tp->t_flags |= XFS_TRANS_DIRTY;
efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
@@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
extp->ext_start = start_block;
extp->ext_len = ext_len;
efdp->efd_next_extent++;
+
+ return error;
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 17280cd..b97f1df 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -108,6 +108,15 @@ xfs_trans_log_inode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
+ * Record the specific change for fdatasync optimisation. This
+ * allows fdatasync to skip log forces for inodes that are only
+ * timestamp dirty. We do this before the change count so that
+ * the core being logged in this case does not impact on fdatasync
+ * behaviour.
+ */
+ ip->i_itemp->ili_fsync_fields |= flags;
+
+ /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. We don't use
* inode_inc_version() because there is no need for extra locking around
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 1b73629..49931b7 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -119,6 +119,21 @@ xfs_trans_ail_delete(
xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
}
+static inline void
+xfs_trans_ail_remove(
+ struct xfs_log_item *lip,
+ int shutdown_type)
+{
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_delete() drops the AIL lock */
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(ailp, lip, shutdown_type);
+ else
+ spin_unlock(&ailp->xa_lock);
+}
+
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
void xfs_ail_push_all_sync(struct xfs_ail *);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c0368151..110f1d7 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,15 +32,13 @@
static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int xflags)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *value, size_t size)
{
+ int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
/* Convert Linux syscall to XFS internal ATTR flags */
if (!size) {
xflags |= ATTR_KERNOVAL;
@@ -53,14 +51,35 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
return asize;
}
-static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int xflags)
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name,
+ int xflags)
{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ /*
+ * Invalidate any cached ACLs if the user has bypassed the ACL
+ * interface. We don't validate the content whatsoever so it is caller
+ * responsibility to provide data in valid format and ensure i_mode is
+ * consistent.
+ */
+ if (xflags & ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+#endif
+ }
+}
- if (strcmp(name, "") == 0)
- return -EINVAL;
+static int
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *value, size_t size, int flags)
+{
+ int xflags = handler->flags;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
/* Convert Linux syscall to XFS internal ATTR flags */
if (flags & XATTR_CREATE)
@@ -70,8 +89,12 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
if (!value)
return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- return xfs_attr_set(ip, (unsigned char *)name,
+ error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
+ if (!error)
+ xfs_forget_acl(d_inode(dentry), name, xflags);
+
+ return error;
}
static const struct xattr_handler xfs_xattr_user_handler = {
@@ -106,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
NULL
};
-static unsigned int xfs_xattr_prefix_len(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return sizeof("security");
- else if (flags & XFS_ATTR_ROOT)
- return sizeof("trusted");
- else
- return sizeof("user");
-}
-
-static const char *xfs_xattr_prefix(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return xfs_xattr_security_handler.prefix;
- else if (flags & XFS_ATTR_ROOT)
- return xfs_xattr_trusted_handler.prefix;
- else
- return xfs_xattr_user_handler.prefix;
-}
-
static int
-xfs_xattr_put_listent(
+__xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen,
- unsigned char *value)
+ char *prefix,
+ int prefix_len,
+ unsigned char *name,
+ int namelen)
{
- unsigned int prefix_len = xfs_xattr_prefix_len(flags);
char *offset;
int arraytop;
- ASSERT(context->count >= 0);
-
- /*
- * Only show root namespace entries if we are actually allowed to
- * see them.
- */
- if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
- return 0;
+ if (!context->alist)
+ goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
@@ -154,17 +149,19 @@ xfs_xattr_put_listent(
return 1;
}
offset = (char *)context->alist + context->count;
- strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+ strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
+
+compute_size:
context->count += prefix_len + namelen + 1;
return 0;
}
static int
-xfs_xattr_put_listent_sizes(
+xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
int flags,
unsigned char *name,
@@ -172,24 +169,55 @@ xfs_xattr_put_listent_sizes(
int valuelen,
unsigned char *value)
{
- context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
- return 0;
-}
+ char *prefix;
+ int prefix_len;
-static int
-list_one_attr(const char *name, const size_t len, void *data,
- size_t size, ssize_t *result)
-{
- char *p = data + *result;
+ ASSERT(context->count >= 0);
- *result += len;
- if (!size)
- return 0;
- if (*result > size)
- return -ERANGE;
+ if (flags & XFS_ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (namelen == SGI_ACL_FILE_SIZE &&
+ strncmp(name, SGI_ACL_FILE,
+ SGI_ACL_FILE_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_ACCESS,
+ strlen(XATTR_POSIX_ACL_ACCESS));
+ if (ret)
+ return ret;
+ } else if (namelen == SGI_ACL_DEFAULT_SIZE &&
+ strncmp(name, SGI_ACL_DEFAULT,
+ SGI_ACL_DEFAULT_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_DEFAULT,
+ strlen(XATTR_POSIX_ACL_DEFAULT));
+ if (ret)
+ return ret;
+ }
+#endif
- strcpy(p, name);
- return 0;
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ prefix = XATTR_TRUSTED_PREFIX;
+ prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ } else if (flags & XFS_ATTR_SECURE) {
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ } else {
+ prefix = XATTR_USER_PREFIX;
+ prefix_len = XATTR_USER_PREFIX_LEN;
+ }
+
+ return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
}
ssize_t
@@ -198,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
- int error;
/*
* First read the regular on-disk attributes.
@@ -207,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
context.dp = XFS_I(inode);
context.cursor = &cursor;
context.resynch = 1;
- context.alist = data;
+ context.alist = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
-
- if (size)
- context.put_listent = xfs_xattr_put_listent;
- else
- context.put_listent = xfs_xattr_put_listent_sizes;
+ context.put_listent = xfs_xattr_put_listent;
xfs_attr_list_int(&context);
if (context.count < 0)
return -ERANGE;
- /*
- * Then add the two synthetic ACL attributes.
- */
- if (posix_acl_access_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
- if (posix_acl_default_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
return context.count;
}
OpenPOWER on IntegriCloud