diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/9p/acl.c | 6 | ||||
-rw-r--r-- | fs/9p/acl.h | 4 | ||||
-rw-r--r-- | fs/9p/v9fs_vfs.h | 6 | ||||
-rw-r--r-- | fs/9p/vfs_file.c | 36 | ||||
-rw-r--r-- | fs/9p/vfs_inode.c | 139 | ||||
-rw-r--r-- | fs/9p/vfs_inode_dotl.c | 92 | ||||
-rw-r--r-- | fs/9p/vfs_super.c | 2 | ||||
-rw-r--r-- | fs/Kconfig | 15 | ||||
-rw-r--r-- | fs/anon_inodes.c | 2 | ||||
-rw-r--r-- | fs/autofs4/autofs_i.h | 26 | ||||
-rw-r--r-- | fs/autofs4/waitq.c | 2 | ||||
-rw-r--r-- | fs/befs/linuxvfs.c | 23 | ||||
-rw-r--r-- | fs/block_dev.c | 28 | ||||
-rw-r--r-- | fs/btrfs/Makefile | 4 | ||||
-rw-r--r-- | fs/btrfs/acl.c | 27 | ||||
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 22 | ||||
-rw-r--r-- | fs/btrfs/compression.c | 14 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 457 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 54 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.c | 2 | ||||
-rw-r--r-- | fs/btrfs/delayed-inode.h | 2 | ||||
-rw-r--r-- | fs/btrfs/dir-item.c | 39 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 116 | ||||
-rw-r--r-- | fs/btrfs/disk-io.h | 10 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 401 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 309 | ||||
-rw-r--r-- | fs/btrfs/extent_io.h | 55 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 155 | ||||
-rw-r--r-- | fs/btrfs/file-item.c | 50 | ||||
-rw-r--r-- | fs/btrfs/file.c | 76 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 193 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 259 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 34 | ||||
-rw-r--r-- | fs/btrfs/locking.c | 280 | ||||
-rw-r--r-- | fs/btrfs/locking.h | 36 | ||||
-rw-r--r-- | fs/btrfs/ref-cache.c | 68 | ||||
-rw-r--r-- | fs/btrfs/ref-cache.h | 52 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 3 | ||||
-rw-r--r-- | fs/btrfs/root-tree.c | 5 | ||||
-rw-r--r-- | fs/btrfs/struct-funcs.c | 100 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 116 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 46 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 65 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 2 | ||||
-rw-r--r-- | fs/btrfs/xattr.c | 73 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 2 | ||||
-rw-r--r-- | fs/ceph/dir.c | 116 | ||||
-rw-r--r-- | fs/ceph/export.c | 24 | ||||
-rw-r--r-- | fs/ceph/file.c | 61 | ||||
-rw-r--r-- | fs/ceph/inode.c | 48 | ||||
-rw-r--r-- | fs/ceph/ioctl.c | 15 | ||||
-rw-r--r-- | fs/ceph/ioctl.h | 1 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 58 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 3 | ||||
-rw-r--r-- | fs/ceph/snap.c | 25 | ||||
-rw-r--r-- | fs/ceph/super.c | 11 | ||||
-rw-r--r-- | fs/ceph/super.h | 20 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 8 | ||||
-rw-r--r-- | fs/cifs/cifs_debug.c | 2 | ||||
-rw-r--r-- | fs/cifs/cifs_dfs_ref.c | 5 | ||||
-rw-r--r-- | fs/cifs/cifsacl.c | 28 | ||||
-rw-r--r-- | fs/cifs/cifsencrypt.c | 126 | ||||
-rw-r--r-- | fs/cifs/cifsfs.c | 22 | ||||
-rw-r--r-- | fs/cifs/cifsfs.h | 6 | ||||
-rw-r--r-- | fs/cifs/cifsglob.h | 60 | ||||
-rw-r--r-- | fs/cifs/cifssmb.c | 6 | ||||
-rw-r--r-- | fs/cifs/connect.c | 662 | ||||
-rw-r--r-- | fs/cifs/dir.c | 9 | ||||
-rw-r--r-- | fs/cifs/dns_resolve.c | 4 | ||||
-rw-r--r-- | fs/cifs/file.c | 27 | ||||
-rw-r--r-- | fs/cifs/inode.c | 14 | ||||
-rw-r--r-- | fs/cifs/link.c | 8 | ||||
-rw-r--r-- | fs/cifs/misc.c | 11 | ||||
-rw-r--r-- | fs/cifs/readdir.c | 427 | ||||
-rw-r--r-- | fs/cifs/smbencrypt.c | 8 | ||||
-rw-r--r-- | fs/cifs/transport.c | 53 | ||||
-rw-r--r-- | fs/compat.c | 5 | ||||
-rw-r--r-- | fs/compat_ioctl.c | 1 | ||||
-rw-r--r-- | fs/dcache.c | 83 | ||||
-rw-r--r-- | fs/direct-io.c | 2 | ||||
-rw-r--r-- | fs/ecryptfs/Kconfig | 2 | ||||
-rw-r--r-- | fs/ecryptfs/ecryptfs_kernel.h | 150 | ||||
-rw-r--r-- | fs/ecryptfs/inode.c | 1 | ||||
-rw-r--r-- | fs/ecryptfs/keystore.c | 62 | ||||
-rw-r--r-- | fs/ecryptfs/main.c | 23 | ||||
-rw-r--r-- | fs/ecryptfs/read_write.c | 18 | ||||
-rw-r--r-- | fs/eventpoll.c | 2 | ||||
-rw-r--r-- | fs/exec.c | 77 | ||||
-rw-r--r-- | fs/exofs/Kbuild | 5 | ||||
-rw-r--r-- | fs/exofs/Kconfig | 4 | ||||
-rw-r--r-- | fs/exofs/exofs.h | 159 | ||||
-rw-r--r-- | fs/exofs/inode.c | 152 | ||||
-rw-r--r-- | fs/exofs/ore.c (renamed from fs/exofs/ios.c) | 370 | ||||
-rw-r--r-- | fs/exofs/pnfs.h | 45 | ||||
-rw-r--r-- | fs/exofs/super.c | 251 | ||||
-rw-r--r-- | fs/ext2/acl.c | 8 | ||||
-rw-r--r-- | fs/ext2/acl.h | 1 | ||||
-rw-r--r-- | fs/ext2/xattr.c | 10 | ||||
-rw-r--r-- | fs/ext3/acl.c | 9 | ||||
-rw-r--r-- | fs/ext3/balloc.c | 38 | ||||
-rw-r--r-- | fs/ext3/file.c | 1 | ||||
-rw-r--r-- | fs/ext3/fsync.c | 11 | ||||
-rw-r--r-- | fs/ext3/ialloc.c | 4 | ||||
-rw-r--r-- | fs/ext3/inode.c | 193 | ||||
-rw-r--r-- | fs/ext3/ioctl.c | 4 | ||||
-rw-r--r-- | fs/ext3/namei.c | 13 | ||||
-rw-r--r-- | fs/ext3/super.c | 13 | ||||
-rw-r--r-- | fs/ext3/xattr.c | 12 | ||||
-rw-r--r-- | fs/ext4/Makefile | 2 | ||||
-rw-r--r-- | fs/ext4/acl.c | 9 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 48 | ||||
-rw-r--r-- | fs/ext4/block_validity.c | 21 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 56 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 4 | ||||
-rw-r--r-- | fs/ext4/extents.c | 129 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 26 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 2 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 1487 | ||||
-rw-r--r-- | fs/ext4/inode.c | 1623 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 12 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 230 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 1 | ||||
-rw-r--r-- | fs/ext4/namei.c | 27 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 30 | ||||
-rw-r--r-- | fs/ext4/resize.c | 199 | ||||
-rw-r--r-- | fs/ext4/super.c | 89 | ||||
-rw-r--r-- | fs/ext4/truncate.h | 43 | ||||
-rw-r--r-- | fs/fat/dir.c | 2 | ||||
-rw-r--r-- | fs/fat/inode.c | 7 | ||||
-rw-r--r-- | fs/file_table.c | 2 | ||||
-rw-r--r-- | fs/fs-writeback.c | 378 | ||||
-rw-r--r-- | fs/fuse/dev.c | 16 | ||||
-rw-r--r-- | fs/fuse/file.c | 84 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 8 | ||||
-rw-r--r-- | fs/fuse/inode.c | 13 | ||||
-rw-r--r-- | fs/generic_acl.c | 13 | ||||
-rw-r--r-- | fs/gfs2/acl.c | 6 | ||||
-rw-r--r-- | fs/gfs2/main.c | 2 | ||||
-rw-r--r-- | fs/gfs2/ops_fstype.c | 4 | ||||
-rw-r--r-- | fs/hppfs/hppfs.c | 1 | ||||
-rw-r--r-- | fs/hugetlbfs/inode.c | 1 | ||||
-rw-r--r-- | fs/inode.c | 82 | ||||
-rw-r--r-- | fs/jbd/checkpoint.c | 37 | ||||
-rw-r--r-- | fs/jbd/commit.c | 57 | ||||
-rw-r--r-- | fs/jbd/journal.c | 99 | ||||
-rw-r--r-- | fs/jbd/transaction.c | 83 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 5 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 67 | ||||
-rw-r--r-- | fs/jffs2/acl.c | 4 | ||||
-rw-r--r-- | fs/jffs2/acl.h | 2 | ||||
-rw-r--r-- | fs/jffs2/fs.c | 4 | ||||
-rw-r--r-- | fs/jffs2/os-linux.h | 2 | ||||
-rw-r--r-- | fs/jfs/acl.c | 4 | ||||
-rw-r--r-- | fs/jfs/jfs_dmap.c | 5 | ||||
-rw-r--r-- | fs/jfs/jfs_txnmgr.c | 6 | ||||
-rw-r--r-- | fs/jfs/jfs_umount.c | 4 | ||||
-rw-r--r-- | fs/jfs/namei.c | 3 | ||||
-rw-r--r-- | fs/jfs/xattr.c | 4 | ||||
-rw-r--r-- | fs/lockd/clntproc.c | 9 | ||||
-rw-r--r-- | fs/namei.c | 118 | ||||
-rw-r--r-- | fs/nfs/Kconfig | 15 | ||||
-rw-r--r-- | fs/nfs/Makefile | 1 | ||||
-rw-r--r-- | fs/nfs/blocklayout/Makefile | 5 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.c | 1020 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.h | 207 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayoutdev.c | 410 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayoutdm.c | 111 | ||||
-rw-r--r-- | fs/nfs/blocklayout/extents.c | 935 | ||||
-rw-r--r-- | fs/nfs/cache_lib.h | 2 | ||||
-rw-r--r-- | fs/nfs/callback.h | 2 | ||||
-rw-r--r-- | fs/nfs/callback_proc.c | 82 | ||||
-rw-r--r-- | fs/nfs/callback_xdr.c | 24 | ||||
-rw-r--r-- | fs/nfs/client.c | 18 | ||||
-rw-r--r-- | fs/nfs/delegation.c | 16 | ||||
-rw-r--r-- | fs/nfs/dir.c | 57 | ||||
-rw-r--r-- | fs/nfs/direct.c | 2 | ||||
-rw-r--r-- | fs/nfs/internal.h | 13 | ||||
-rw-r--r-- | fs/nfs/namespace.c | 2 | ||||
-rw-r--r-- | fs/nfs/nfs3acl.c | 2 | ||||
-rw-r--r-- | fs/nfs/nfs3proc.c | 6 | ||||
-rw-r--r-- | fs/nfs/nfs4_fs.h | 7 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayout.c | 82 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayout.h | 17 | ||||
-rw-r--r-- | fs/nfs/nfs4filelayoutdev.c | 452 | ||||
-rw-r--r-- | fs/nfs/nfs4proc.c | 277 | ||||
-rw-r--r-- | fs/nfs/nfs4state.c | 9 | ||||
-rw-r--r-- | fs/nfs/nfs4xdr.c | 480 | ||||
-rw-r--r-- | fs/nfs/objlayout/objio_osd.c | 48 | ||||
-rw-r--r-- | fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3 | ||||
-rw-r--r-- | fs/nfs/pagelist.c | 69 | ||||
-rw-r--r-- | fs/nfs/pnfs.c | 307 | ||||
-rw-r--r-- | fs/nfs/pnfs.h | 102 | ||||
-rw-r--r-- | fs/nfs/pnfs_dev.c | 64 | ||||
-rw-r--r-- | fs/nfs/read.c | 166 | ||||
-rw-r--r-- | fs/nfs/unlink.c | 37 | ||||
-rw-r--r-- | fs/nfs/write.c | 159 | ||||
-rw-r--r-- | fs/notify/group.c | 2 | ||||
-rw-r--r-- | fs/notify/inode_mark.c | 2 | ||||
-rw-r--r-- | fs/notify/mark.c | 2 | ||||
-rw-r--r-- | fs/notify/notification.c | 2 | ||||
-rw-r--r-- | fs/notify/vfsmount_mark.c | 2 | ||||
-rw-r--r-- | fs/ntfs/inode.h | 2 | ||||
-rw-r--r-- | fs/ocfs2/acl.c | 4 | ||||
-rw-r--r-- | fs/omfs/dir.c | 2 | ||||
-rw-r--r-- | fs/open.c | 78 | ||||
-rw-r--r-- | fs/pipe.c | 2 | ||||
-rw-r--r-- | fs/posix_acl.c | 18 | ||||
-rw-r--r-- | fs/proc/base.c | 28 | ||||
-rw-r--r-- | fs/proc/generic.c | 3 | ||||
-rw-r--r-- | fs/proc/inode.c | 2 | ||||
-rw-r--r-- | fs/proc/meminfo.c | 2 | ||||
-rw-r--r-- | fs/proc/proc_net.c | 4 | ||||
-rw-r--r-- | fs/proc/root.c | 2 | ||||
-rw-r--r-- | fs/pstore/inode.c | 12 | ||||
-rw-r--r-- | fs/pstore/internal.h | 2 | ||||
-rw-r--r-- | fs/pstore/platform.c | 30 | ||||
-rw-r--r-- | fs/read_write.c | 12 | ||||
-rw-r--r-- | fs/reiserfs/xattr_acl.c | 10 | ||||
-rw-r--r-- | fs/stack.c | 5 | ||||
-rw-r--r-- | fs/stat.c | 4 | ||||
-rw-r--r-- | fs/ubifs/debug.h | 6 | ||||
-rw-r--r-- | fs/xfs/Makefile | 119 | ||||
-rw-r--r-- | fs/xfs/kmem.c (renamed from fs/xfs/linux-2.6/kmem.c) | 0 | ||||
-rw-r--r-- | fs/xfs/kmem.h (renamed from fs/xfs/linux-2.6/kmem.h) | 0 | ||||
-rw-r--r-- | fs/xfs/mrlock.h (renamed from fs/xfs/linux-2.6/mrlock.h) | 0 | ||||
-rw-r--r-- | fs/xfs/time.h (renamed from fs/xfs/linux-2.6/time.h) | 0 | ||||
-rw-r--r-- | fs/xfs/uuid.c (renamed from fs/xfs/support/uuid.c) | 0 | ||||
-rw-r--r-- | fs/xfs/uuid.h (renamed from fs/xfs/support/uuid.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs.h | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_acl.c (renamed from fs/xfs/linux-2.6/xfs_acl.c) | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_acl.h | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_ag.h | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_alloc.c | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_aops.c (renamed from fs/xfs/linux-2.6/xfs_aops.c) | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_aops.h (renamed from fs/xfs/linux-2.6/xfs_aops.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_attr.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_bmap.c | 10 | ||||
-rw-r--r-- | fs/xfs/xfs_btree.c | 17 | ||||
-rw-r--r-- | fs/xfs/xfs_btree.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.c (renamed from fs/xfs/linux-2.6/xfs_buf.c) | 18 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.h (renamed from fs/xfs/linux-2.6/xfs_buf.h) | 32 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 24 | ||||
-rw-r--r-- | fs/xfs/xfs_da_btree.c | 44 | ||||
-rw-r--r-- | fs/xfs/xfs_dinode.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_dir2.c | 16 | ||||
-rw-r--r-- | fs/xfs/xfs_discard.c (renamed from fs/xfs/linux-2.6/xfs_discard.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_discard.h (renamed from fs/xfs/linux-2.6/xfs_discard.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot.c (renamed from fs/xfs/quota/xfs_dquot.c) | 16 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot.h (renamed from fs/xfs/quota/xfs_dquot.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot_item.c (renamed from fs/xfs/quota/xfs_dquot_item.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_dquot_item.h (renamed from fs/xfs/quota/xfs_dquot_item.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_export.c (renamed from fs/xfs/linux-2.6/xfs_export.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_export.h (renamed from fs/xfs/linux-2.6/xfs_export.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c (renamed from fs/xfs/linux-2.6/xfs_file.c) | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_filestream.c | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_fs_subr.c (renamed from fs/xfs/linux-2.6/xfs_fs_subr.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_globals.c (renamed from fs/xfs/linux-2.6/xfs_globals.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_ialloc.c | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.c | 20 | ||||
-rw-r--r-- | fs/xfs/xfs_inode.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl.c (renamed from fs/xfs/linux-2.6/xfs_ioctl.c) | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl.h (renamed from fs/xfs/linux-2.6/xfs_ioctl.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl32.c (renamed from fs/xfs/linux-2.6/xfs_ioctl32.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_ioctl32.h (renamed from fs/xfs/linux-2.6/xfs_ioctl32.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_iops.c (renamed from fs/xfs/linux-2.6/xfs_iops.c) | 23 | ||||
-rw-r--r-- | fs/xfs/xfs_iops.h (renamed from fs/xfs/linux-2.6/xfs_iops.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_linux.h (renamed from fs/xfs/linux-2.6/xfs_linux.h) | 27 | ||||
-rw-r--r-- | fs/xfs/xfs_log.c | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_log_recover.c | 42 | ||||
-rw-r--r-- | fs/xfs/xfs_message.c (renamed from fs/xfs/linux-2.6/xfs_message.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_message.h (renamed from fs/xfs/linux-2.6/xfs_message.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_qm.c (renamed from fs/xfs/quota/xfs_qm.c) | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_qm.h (renamed from fs/xfs/quota/xfs_qm.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_qm_bhv.c (renamed from fs/xfs/quota/xfs_qm_bhv.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_qm_stats.c (renamed from fs/xfs/quota/xfs_qm_stats.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_qm_stats.h (renamed from fs/xfs/quota/xfs_qm_stats.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_qm_syscalls.c (renamed from fs/xfs/quota/xfs_qm_syscalls.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_quota_priv.h (renamed from fs/xfs/quota/xfs_quota_priv.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_quotaops.c (renamed from fs/xfs/linux-2.6/xfs_quotaops.c) | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_rename.c | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_rtalloc.c | 32 | ||||
-rw-r--r-- | fs/xfs/xfs_rtalloc.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_rw.c | 8 | ||||
-rw-r--r-- | fs/xfs/xfs_sb.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_stats.c (renamed from fs/xfs/linux-2.6/xfs_stats.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_stats.h (renamed from fs/xfs/linux-2.6/xfs_stats.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c (renamed from fs/xfs/linux-2.6/xfs_super.c) | 36 | ||||
-rw-r--r-- | fs/xfs/xfs_super.h (renamed from fs/xfs/linux-2.6/xfs_super.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_sync.c (renamed from fs/xfs/linux-2.6/xfs_sync.c) | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_sync.h (renamed from fs/xfs/linux-2.6/xfs_sync.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_sysctl.c (renamed from fs/xfs/linux-2.6/xfs_sysctl.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_sysctl.h (renamed from fs/xfs/linux-2.6/xfs_sysctl.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_trace.c (renamed from fs/xfs/linux-2.6/xfs_trace.c) | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_trace.h (renamed from fs/xfs/linux-2.6/xfs_trace.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_ail.c | 67 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_buf.c | 28 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_dquot.c (renamed from fs/xfs/quota/xfs_trans_dquot.c) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_vnode.h (renamed from fs/xfs/linux-2.6/xfs_vnode.h) | 0 | ||||
-rw-r--r-- | fs/xfs/xfs_vnodeops.c | 22 | ||||
-rw-r--r-- | fs/xfs/xfs_xattr.c (renamed from fs/xfs/linux-2.6/xfs_xattr.c) | 0 |
301 files changed, 11478 insertions, 6949 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index e9cb57f..9a1d426 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -182,11 +182,11 @@ int v9fs_set_create_acl(struct dentry *dentry, return 0; } -int v9fs_acl_mode(struct inode *dir, mode_t *modep, +int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { int retval = 0; - mode_t mode = *modep; + umode_t mode = *modep; struct posix_acl *acl = NULL; if (!S_ISLNK(mode)) { @@ -319,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; retval = posix_acl_equiv_mode(acl, &mode); if (retval < 0) goto err_out; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index ddb7ae1..5595564 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -20,7 +20,7 @@ extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl **, struct posix_acl **); -extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, +extern int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else #define v9fs_iop_get_acl NULL @@ -38,7 +38,7 @@ static inline int v9fs_set_create_acl(struct dentry *dentry, { return 0; } -static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep, +static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl) { diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 46ce357..410ffd6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode); +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode); + struct inode *inode, int mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); @@ -83,4 +83,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; return; } + +int v9fs_open_to_dotl_flags(int flags); #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3c173fc..62857a8 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) - omode = file->f_flags; + omode = v9fs_open_to_dotl_flags(file->f_flags); else omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); @@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); - flock.type = fl->fl_type; + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } flock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) flock.length = 0; @@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) /* convert posix lock to p9 tgetlock args */ memset(&glock, 0, sizeof(glock)); - glock.type = fl->fl_type; + glock.type = P9_LOCK_TYPE_UNLCK; glock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) glock.length = 0; @@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) return res; - if (glock.type != F_UNLCK) { - fl->fl_type = glock.type; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { fl->fl_start = glock.start; if (glock.length == 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = glock.proc_id; - } else - fl->fl_type = F_UNLCK; - + } return res; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8bb5507..e3c03db 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) /** * p9mode2unixmode- convert plan9 mode bits to unix mode bits * @v9ses: v9fs session information - * @mode: mode to convert + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. * */ - -static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) +static int p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; + int mode = stat->mode; - res = mode & 0777; + res = mode & S_IALLUGO; + *rdev = 0; if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) && (v9ses->nodev == 0)) res |= S_IFIFO; else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) - && (v9ses->nodev == 0)) - res |= S_IFBLK; - else + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strncpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %u %u", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c %s\n", type, + stat->extension); + }; + *rdev = MKDEV(major, minor); + } else res |= S_IFREG; if (v9fs_proto_dotu(v9ses)) { @@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMSETVTX) == P9_DMSETVTX) res |= S_ISVTX; } - return res; } @@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode) + struct inode *inode, int mode, dev_t rdev) { int err = 0; inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; - inode->i_rdev = 0; + inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &v9fs_addr_operations; @@ -335,7 +354,7 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode) +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) { int err; struct inode *inode; @@ -348,7 +367,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } - err = v9fs_init_inode(v9ses, inode, mode); + err = v9fs_init_inode(v9ses, inode, mode, rdev); if (err) { iput(inode); return ERR_PTR(err); @@ -435,11 +454,12 @@ void v9fs_evict_inode(struct inode *inode) static int v9fs_test_inode(struct inode *inode, void *data) { int umode; + dev_t rdev; struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_wstat *st = (struct p9_wstat *)data; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - umode = p9mode2unixmode(v9ses, st->mode); + umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) return 0; @@ -473,6 +493,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_wstat *st, int new) { + dev_t rdev; int retval, umode; unsigned long i_ino; struct inode *inode; @@ -496,8 +517,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, * later. */ inode->i_ino = i_ino; - umode = p9mode2unixmode(v9ses, st->mode); - retval = v9fs_init_inode(v9ses, inode, umode); + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); if (retval) goto error; @@ -532,6 +553,19 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, } /** + * v9fs_at_to_dotl_flags- convert Linux specific AT flags to + * plan 9 AT flag. + * @flags: flags to convert + */ +static int v9fs_at_to_dotl_flags(int flags) +{ + int rflags = 0; + if (flags & AT_REMOVEDIR) + rflags |= P9_DOTL_AT_REMOVEDIR; + return rflags; +} + +/** * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted @@ -558,7 +592,8 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) return retval; } if (v9fs_proto_dotl(v9ses)) - retval = p9_client_unlinkat(dfid, dentry->d_name.name, flags); + retval = p9_client_unlinkat(dfid, dentry->d_name.name, + v9fs_at_to_dotl_flags(flags)); if (retval == -EOPNOTSUPP) { /* Try the one based on path */ v9fid = v9fs_fid_clone(dentry); @@ -645,13 +680,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; - + d_instantiate(dentry, inode); return ofid; - error: if (ofid) p9_client_clunk(ofid); @@ -792,6 +825,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nameidata) { + struct dentry *res; struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; @@ -823,22 +857,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(result); } - - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { result = PTR_ERR(inode); inode = NULL; goto error; } - result = v9fs_fid_add(dentry, fid); if (result < 0) goto error_iput; - inst_out: - d_add(dentry, inode); - return NULL; - + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. + */ + res = d_materialise_unique(dentry, inode); + if (!IS_ERR(res)) + return res; + result = PTR_ERR(res); error_iput: iput(inode); error: @@ -1002,7 +1049,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return PTR_ERR(st); v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(dentry->d_inode, stat); p9stat_free(st); kfree(st); @@ -1086,6 +1133,7 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { + mode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; @@ -1121,31 +1169,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_nlink = i_nlink; } } - inode->i_mode = p9mode2unixmode(v9ses, stat->mode); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { - char type = 0; - int major = -1; - int minor = -1; - - strncpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - inode->i_mode &= ~S_IFBLK; - inode->i_mode |= S_IFCHR; - break; - case 'b': - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); - }; - inode->i_rdev = MKDEV(major, minor); - init_special_inode(inode, inode->i_mode, inode->i_rdev); - } else - inode->i_rdev = 0; - + mode = stat->mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ @@ -1411,6 +1437,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { + int umode; + dev_t rdev; loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; @@ -1419,6 +1447,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) st = p9_client_stat(fid); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -1430,6 +1464,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: p9stat_free(st); kfree(st); return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 9a26dce..aded79f 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, * later. */ inode->i_ino = i_ino; - retval = v9fs_init_inode(v9ses, inode, st->st_mode); + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); if (retval) goto error; @@ -190,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, return inode; } +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_TRUNC, P9_DOTL_TRUNC }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. + * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created @@ -206,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, int err = 0; gid_t gid; int flags; - mode_t mode; + umode_t mode; char *name = NULL; struct file *filp; struct p9_qid qid; @@ -258,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, "Failed to get acl values in creat %d\n", err); goto error; } - err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", @@ -281,10 +335,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, &dacl, &pacl); @@ -348,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct p9_fid *fid = NULL, *dfid = NULL; gid_t gid; char *name; - mode_t mode; + umode_t mode; struct inode *inode; struct p9_qid qid; struct dentry *dir_dentry; @@ -403,10 +457,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* @@ -414,7 +468,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, * inode with stat. We need to get an inode * so that we can set the acl with dentry */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -540,6 +594,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { + mode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -552,11 +607,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; inode->i_nlink = stat->st_nlink; - inode->i_mode = stat->st_mode; - inode->i_rdev = new_decode_dev(stat->st_rdev); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -657,14 +711,14 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* Not in cached mode. No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -751,7 +805,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, int err; gid_t gid; char *name; - mode_t mode; + umode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; @@ -810,17 +864,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* * Not in cached mode. No need to populate inode with stat. * socket syscall returns a fd, so we need instantiate */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, rdev); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -886,6 +940,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -897,6 +956,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: kfree(st); return 0; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index feef6cd..c70251d 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode(sb, S_IFDIR | mode); + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; @@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL select TMPFS_XATTR select GENERIC_ACL help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. + POSIX Access Control Lists (ACLs) support additional access rights + for users and groups beyond the standard owner/group/world scheme, + and this option selects support for ACLs specifically for tmpfs + filesystems. + + If you've selected TMPFS, it's possible that you'll also need + this option as there are a number of Linux distros that require + POSIX ACL support under /dev for certain features to work properly. + For example, some distros need this feature for ALSA-related /dev + files for sound to work properly. In short, if you're not sure, + say Y. To learn more about Access Control Lists, visit the POSIX ACLs for Linux website <http://acl.bestbits.at/>. - If you don't know what Access Control Lists are, say N. - config TMPFS_XATTR bool "Tmpfs extended attributes" depends on TMPFS diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 4d433d3..f11e43e 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); */ static struct inode *anon_inode_mkinode(void) { - struct inode *inode = new_inode(anon_inode_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(anon_inode_mnt->mnt_sb); if (!inode) return ERR_PTR(-ENOMEM); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 475f9c5..326dc08 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -39,27 +39,17 @@ /* #define DEBUG */ -#ifdef DEBUG -#define DPRINTK(fmt, args...) \ -do { \ - printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##args); \ -} while (0) -#else -#define DPRINTK(fmt, args...) do {} while (0) -#endif - -#define AUTOFS_WARN(fmt, args...) \ -do { \ +#define DPRINTK(fmt, ...) \ + pr_debug("pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +#define AUTOFS_WARN(fmt, ...) \ printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##args); \ -} while (0) + current->pid, __func__, ##__VA_ARGS__) -#define AUTOFS_ERROR(fmt, args...) \ -do { \ +#define AUTOFS_ERROR(fmt, ...) \ printk(KERN_ERR "pid %d: %s: " fmt "\n", \ - current->pid, __func__, ##args); \ -} while (0) + current->pid, __func__, ##__VA_ARGS__) /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 2543598..e1fbdee 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - wq->wait_queue_token, wq->name.len, wq->name.name, type); + (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt,0,sizeof pkt); /* For security reasons */ diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 54b8c28..720d885 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); link = ERR_PTR(-EIO); } else { - link[len - 1] = '\0'; + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } } } else { link = befs_ino->i_data.symlink; diff --git a/fs/block_dev.c b/fs/block_dev.c index c62fb84..95f786e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } - EXPORT_SYMBOL(I_BDEV); /* - * move the inode from it's current bdi to the a new bdi. if the inode is dirty - * we need to move it onto the dirty list of @dst so that the inode is always - * on the right list. + * Move the inode from its current bdi to a new bdi. If the inode is dirty we + * need to move it onto the dirty list of @dst so that the inode is always on + * the right list. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *old = inode->i_data.backing_dev_info; + + if (unlikely(dst == old)) /* deadlock avoidance */ + return; + bdi_lock_two(&old->wb, &dst->wb); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&old->wb.list_lock); + spin_unlock(&dst->wb.list_lock); } static sector_t max_block(struct block_device *bdev) @@ -383,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); int error; + + error = filemap_write_and_wait_range(filp->f_mapping, start, end); + if (error) + return error; /* * There is no need to serialise calls to blkdev_issue_flush with @@ -548,6 +556,7 @@ struct block_device *bdget(dev_t dev) if (inode->i_state & I_NEW) { bdev->bd_contains = NULL; + bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; @@ -1420,6 +1429,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -1433,8 +1447,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9b72dcf..40e6ac0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + +btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 65a735d..eb159aa 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -28,8 +28,6 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { int size; @@ -111,7 +109,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, int ret, size = 0; const char *name; char *value = NULL; - mode_t mode; if (acl) { ret = posix_acl_valid(acl); @@ -122,13 +119,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &mode); + ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) return ret; - inode->i_mode = mode; } ret = 0; break; @@ -222,19 +217,16 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, } if (IS_POSIXACL(dir) && acl) { - mode_t mode = inode->i_mode; - if (S_ISDIR(inode->i_mode)) { ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_DEFAULT); if (ret) goto failed; } - ret = posix_acl_create(&acl, GFP_NOFS, &mode); + ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (ret < 0) return ret; - inode->i_mode = mode; if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); @@ -282,18 +274,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = { .get = btrfs_xattr_acl_get, .set = btrfs_xattr_acl_set, }; - -#else /* CONFIG_BTRFS_FS_POSIX_ACL */ - -int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - return 0; -} - -#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 52d7eca..d9f99a1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -34,6 +34,9 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Lock for counters */ + spinlock_t lock; + /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; @@ -134,8 +137,8 @@ struct btrfs_inode { * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. */ - atomic_t outstanding_extents; - atomic_t reserved_extents; + unsigned outstanding_extents; + unsigned reserved_extents; /* * ordered_data_close is set by truncate when a file that used @@ -173,7 +176,11 @@ static inline u64 btrfs_ino(struct inode *inode) { u64 ino = BTRFS_I(inode)->location.objectid; - if (ino <= BTRFS_FIRST_FREE_OBJECTID) + /* + * !ino: btree_inode + * type == BTRFS_ROOT_ITEM_KEY: subvol dir + */ + if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->i_ino; return ino; } @@ -184,4 +191,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } +static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, + struct inode *inode) +{ + if (root == root->fs_info->tree_root || + BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) + return true; + return false; +} + #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bfe42b0..8ec5d86 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -338,6 +338,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, u64 first_byte = disk_start; struct block_device *bdev; int ret; + int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); @@ -392,8 +393,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, + start, 1); + BUG_ON(ret); + } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); BUG_ON(ret); @@ -418,8 +422,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); + if (!skip_sum) { + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + } ret = btrfs_map_bio(root, WRITE, bio, 0, 1); BUG_ON(ret); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2e66786..011cab3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) { int i; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (p->nodes[i] && p->locks[i]) - btrfs_set_lock_blocking(p->nodes[i]); + if (!p->nodes[i] || !p->locks[i]) + continue; + btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_READ_LOCK) + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + else if (p->locks[i] == BTRFS_WRITE_LOCK) + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; } } @@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) * for held */ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held) + struct extent_buffer *held, int held_rw) { int i; @@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, * really sure by forcing the path to blocking before we clear * the path blocking. */ - if (held) - btrfs_set_lock_blocking(held); + if (held) { + btrfs_set_lock_blocking_rw(held, held_rw); + if (held_rw == BTRFS_WRITE_LOCK) + held_rw = BTRFS_WRITE_LOCK_BLOCKING; + else if (held_rw == BTRFS_READ_LOCK) + held_rw = BTRFS_READ_LOCK_BLOCKING; + } btrfs_set_path_blocking(p); #endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) - btrfs_clear_lock_blocking(p->nodes[i]); + if (p->nodes[i] && p->locks[i]) { + btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); + if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) + p->locks[i] = BTRFS_WRITE_LOCK; + else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) + p->locks[i] = BTRFS_READ_LOCK; + } } #ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) - btrfs_clear_lock_blocking(held); + btrfs_clear_lock_blocking_rw(held, held_rw); #endif } @@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p) if (!p->nodes[i]) continue; if (p->locks[i]) { - btrfs_tree_unlock(p->nodes[i]); + btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); p->locks[i] = 0; } free_extent_buffer(p->nodes[i]); @@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) return eb; } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_read_lock(eb); + if (eb == root->node) + break; + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + /* cowonly root (everything not a reference counted cow subvolume), just get * put onto a simple dirty list. transaction.c walks this to make sure they * get properly updated on disk. @@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, for (i = start_slot; i < end_slot; i++) { int close = 1; - if (!parent->map_token) { - map_extent_buffer(parent, - btrfs_node_key_ptr_offset(i), - sizeof(struct btrfs_key_ptr), - &parent->map_token, &parent->kaddr, - &parent->map_start, &parent->map_len, - KM_USER1); - } btrfs_node_key(parent, &disk_key, i); if (!progress_passed && comp_keys(&disk_key, progress) < 0) continue; @@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, last_block = blocknr; continue; } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } cur = btrfs_find_tree_block(root, blocknr, blocksize); if (cur) @@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(cur); free_extent_buffer(cur); } - if (parent->map_token) { - unmap_extent_buffer(parent, parent->map_token, - KM_USER1); - parent->map_token = NULL; - } return err; } @@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb, struct btrfs_disk_key *tmp = NULL; struct btrfs_disk_key unaligned; unsigned long offset; - char *map_token = NULL; char *kaddr = NULL; unsigned long map_start = 0; unsigned long map_len = 0; @@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb, mid = (low + high) / 2; offset = p + mid * item_size; - if (!map_token || offset < map_start || + if (!kaddr || offset < map_start || (offset + sizeof(struct btrfs_disk_key)) > map_start + map_len) { - if (map_token) { - unmap_extent_buffer(eb, map_token, KM_USER0); - map_token = NULL; - } err = map_private_extent_buffer(eb, offset, sizeof(struct btrfs_disk_key), - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - @@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb, high = mid; else { *slot = mid; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 0; } } *slot = low; - if (map_token) - unmap_extent_buffer(eb, map_token, KM_USER0); return 1; } @@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(!path->locks[level]); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && + path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; - bool map = true; if (level != 1) return; @@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; - if (node->map_token || path->skip_locking) - map = false; while (1) { - if (map && !node->map_token) { - unsigned long offset = btrfs_node_key_ptr_offset(nr); - map_private_extent_buffer(node, offset, - sizeof(struct btrfs_key_ptr), - &node->map_token, - &node->kaddr, - &node->map_start, - &node->map_len, KM_USER1); - } if (direction < 0) { if (nr == 0) break; @@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (map && node->map_token) { - unmap_extent_buffer(node, node->map_token, - KM_USER1); - node->map_token = NULL; - } readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } @@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (map && node->map_token) { - unmap_extent_buffer(node, node->map_token, KM_USER1); - node->map_token = NULL; - } } /* @@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level, t = path->nodes[i]; if (i >= lowest_unlock && i > skip_level && path->locks[i]) { - btrfs_tree_unlock(t); + btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; } } @@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) continue; if (!path->locks[i]) continue; - btrfs_tree_unlock(path->nodes[i]); + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; } } @@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, * we can trust our generation number */ free_extent_buffer(tmp); + btrfs_set_path_blocking(p); + tmp = read_tree_block(root, blocknr, blocksize, gen); if (tmp && btrfs_buffer_uptodate(tmp, gen)) { *eb_ret = tmp; @@ -1540,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans, static int setup_nodes_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer *b, int level, int ins_len) + struct extent_buffer *b, int level, int ins_len, + int *write_lock_level) { int ret; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { int sret; + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + sret = reada_for_balance(root, p, level); if (sret) goto again; btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(sret > 0); if (sret) { @@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { int sret; + if (*write_lock_level < level + 1) { + *write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + sret = reada_for_balance(root, p, level); if (sret) goto again; btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); if (sret) { ret = sret; @@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int err; int level; int lowest_unlock = 1; + int root_lock; + /* everything at write_lock_level or lower must be write locked */ + int write_lock_level = 0; u8 lowest_level = 0; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); - if (ins_len < 0) + if (ins_len < 0) { lowest_unlock = 2; + /* when we are removing items, we might have to go up to level + * two as we update tree pointers Make sure we keep write + * for those levels as well + */ + write_lock_level = 2; + } else if (ins_len > 0) { + /* + * for inserting items, make sure we have a write lock on + * level 1 so we can update keys + */ + write_lock_level = 1; + } + + if (!cow) + write_lock_level = -1; + + if (cow && (p->keep_locks || p->lowest_level)) + write_lock_level = BTRFS_MAX_LEVEL; + again: + /* + * we try very hard to do read locks on the root + */ + root_lock = BTRFS_READ_LOCK; + level = 0; if (p->search_commit_root) { + /* + * the commit roots are read only + * so we always do read locks + */ b = root->commit_root; extent_buffer_get(b); + level = btrfs_header_level(b); if (!p->skip_locking) - btrfs_tree_lock(b); + btrfs_tree_read_lock(b); } else { - if (p->skip_locking) + if (p->skip_locking) { b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + level = btrfs_header_level(b); + } else { + /* we don't know the level of the root node + * until we actually have it read locked + */ + b = btrfs_read_lock_root_node(root); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + /* whoops, must trade for write lock */ + btrfs_tree_read_unlock(b); + free_extent_buffer(b); + b = btrfs_lock_root_node(root); + root_lock = BTRFS_WRITE_LOCK; + + /* the level might have changed, check again */ + level = btrfs_header_level(b); + } + } } + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; while (b) { level = btrfs_header_level(b); @@ -1644,10 +1696,6 @@ again: * setup the path here so we can release it under lock * contention with the cow code */ - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - if (cow) { /* * if we don't really need to cow this block @@ -1659,6 +1707,16 @@ again: btrfs_set_path_blocking(p); + /* + * must have write locks on this node and the + * parent + */ + if (level + 1 > write_lock_level) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); @@ -1671,10 +1729,7 @@ cow_done: BUG_ON(!cow && ins_len); p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = 1; - - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); /* * we have a lock on b and as long as we aren't changing @@ -1700,7 +1755,7 @@ cow_done: } p->slots[level] = slot; err = setup_nodes_for_search(trans, root, p, b, level, - ins_len); + ins_len, &write_lock_level); if (err == -EAGAIN) goto again; if (err) { @@ -1710,6 +1765,19 @@ cow_done: b = p->nodes[level]; slot = p->slots[level]; + /* + * slot 0 is special, if we change the key + * we have to update the parent pointer + * which means we must have a write lock + * on the parent + */ + if (slot == 0 && cow && + write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + unlock_up(p, level, lowest_unlock); if (level == lowest_level) { @@ -1728,23 +1796,42 @@ cow_done: } if (!p->skip_locking) { - btrfs_clear_path_blocking(p, NULL); - err = btrfs_try_spin_lock(b); - - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b); + level = btrfs_header_level(b); + if (level <= write_lock_level) { + err = btrfs_try_tree_write_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_WRITE_LOCK); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; } + p->nodes[level] = b; } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { + if (write_lock_level < 1) { + write_lock_level = 1; + btrfs_release_path(p); + goto again; + } + btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL); + btrfs_clear_path_blocking(p, NULL, 0); BUG_ON(err > 0); if (err) { @@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] == i) push_space += data_size; - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } - this_item_size = btrfs_item_size(left, item); if (this_item_size + sizeof(*item) + push_space > free_space) break; @@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, break; i--; } - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } if (push_items == 0) goto out_unlock; @@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, push_space = BTRFS_LEAF_DATA_SIZE(root); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } push_space -= btrfs_item_size(right, item); btrfs_set_item_offset(right, item, push_space); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } left_nritems -= push_items; btrfs_set_header_nritems(left, left_nritems); @@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < nr; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } if (!empty && push_items > 0) { if (path->slots[0] < i) @@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, push_space += this_item_size + sizeof(*item); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - if (push_items == 0) { ret = 1; goto out; @@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(left, i); - if (!left->map_token) { - map_extent_buffer(left, (unsigned long)item, - sizeof(struct btrfs_item), - &left->map_token, &left->kaddr, - &left->map_start, &left->map_len, - KM_USER1); - } ioff = btrfs_item_offset(left, item); btrfs_set_item_offset(left, item, ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); - if (left->map_token) { - unmap_extent_buffer(left, left->map_token, KM_USER1); - left->map_token = NULL; - } /* fixup right node */ if (push_items > right_nritems) { @@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(right, i); - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - push_space = push_space - btrfs_item_size(right, item); btrfs_set_item_offset(right, item, push_space); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } btrfs_mark_buffer_dirty(left); if (right_nritems) @@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(right, i); u32 ioff; - if (!right->map_token) { - map_extent_buffer(right, (unsigned long)item, - sizeof(struct btrfs_item), - &right->map_token, &right->kaddr, - &right->map_start, &right->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(right, item); btrfs_set_item_offset(right, item, ioff + rt_data_off); } - if (right->map_token) { - unmap_extent_buffer(right, right->map_token, KM_USER1); - right->map_token = NULL; - } - btrfs_set_header_nritems(l, mid); ret = 0; btrfs_item_key(right, &disk_key, 0); @@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff + size_diff); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the data */ if (from_end) { memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + @@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - data_size); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the data */ memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end - data_size, btrfs_leaf_data(leaf) + @@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - total_data); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans, * item0..itemN ... dataN.offset..dataN.size .. data0.size */ /* first correct the data pointers */ - WARN_ON(leaf->map_token); for (i = slot; i < nritems; i++) { u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } - ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff - total_data); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), btrfs_item_nr_offset(slot), @@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(leaf, i); - if (!leaf->map_token) { - map_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - } ioff = btrfs_item_offset(leaf, item); btrfs_set_item_offset(leaf, item, ioff + dsize); } - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), btrfs_item_nr_offset(slot + nr), sizeof(struct btrfs_item) * @@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, WARN_ON(!path->keep_locks); again: - cur = btrfs_lock_root_node(root); + cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); WARN_ON(path->nodes[level]); path->nodes[level] = cur; - path->locks[level] = 1; + path->locks[level] = BTRFS_READ_LOCK; if (btrfs_header_generation(cur) < min_trans) { ret = 1; @@ -4098,12 +4049,12 @@ find_next_key: cur = read_node_slot(root, cur, slot); BUG_ON(!cur); - btrfs_tree_lock(cur); + btrfs_tree_read_lock(cur); - path->locks[level - 1] = 1; + path->locks[level - 1] = BTRFS_READ_LOCK; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); } out: if (ret == 0) @@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) u32 nritems; int ret; int old_spinning = path->leave_spinning; - int force_blocking = 0; + int next_rw_lock = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - /* - * we take the blocks in an order that upsets lockdep. Using - * blocking mode is the only way around it. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - force_blocking = 1; -#endif - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); again: level = 1; next = NULL; + next_rw_lock = 0; btrfs_release_path(path); path->keep_locks = 1; - - if (!force_blocking) - path->leave_spinning = 1; + path->leave_spinning = 1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; @@ -4281,11 +4223,12 @@ again: } if (next) { - btrfs_tree_unlock(next); + btrfs_tree_unlock_rw(next, next_rw_lock); free_extent_buffer(next); } next = c; + next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, slot, &key); if (ret == -EAGAIN) @@ -4297,15 +4240,14 @@ again: } if (!path->skip_locking) { - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } break; } @@ -4314,14 +4256,13 @@ again: level--; c = path->nodes[level]; if (path->locks[level]) - btrfs_tree_unlock(c); + btrfs_tree_unlock_rw(c, path->locks[level]); free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = 1; - + path->locks[level] = next_rw_lock; if (!level) break; @@ -4336,16 +4277,14 @@ again: } if (!path->skip_locking) { - btrfs_assert_tree_locked(path->nodes[level]); - ret = btrfs_try_spin_lock(next); + ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_lock(next); - if (!force_blocking) - btrfs_clear_path_blocking(path, next); + btrfs_tree_read_lock(next); + btrfs_clear_path_blocking(path, next, + BTRFS_READ_LOCK); } - if (force_blocking) - btrfs_set_lock_blocking(next); + next_rw_lock = BTRFS_READ_LOCK; } } ret = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe9287b..03912c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -755,6 +755,8 @@ struct btrfs_space_info { chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + unsigned int flush:1; /* set if we are trying to make space */ + unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -764,7 +766,7 @@ struct btrfs_space_info { struct list_head block_groups[BTRFS_NR_RAID_TYPES]; spinlock_t lock; struct rw_semaphore groups_sem; - atomic_t caching_threads; + wait_queue_head_t wait; }; struct btrfs_block_rsv { @@ -824,6 +826,7 @@ struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; + struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; atomic_t count; @@ -1032,6 +1035,8 @@ struct btrfs_fs_info { struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; + struct btrfs_workers caching_workers; + /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -1410,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \ { \ - type *p = kmap_atomic(eb->first_page, KM_USER0); \ + type *p = page_address(eb->first_page); \ u##bits res = le##bits##_to_cpu(p->member); \ - kunmap_atomic(p, KM_USER0); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = kmap_atomic(eb->first_page, KM_USER0); \ + type *p = page_address(eb->first_page); \ p->member = cpu_to_le##bits(val); \ - kunmap_atomic(p, KM_USER0); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ @@ -2128,7 +2131,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, - int num_items) + unsigned num_items) { return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items; @@ -2222,9 +2225,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2330,7 +2330,7 @@ struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held); + struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2365,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); +void btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2404,8 +2404,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); /* dir-item.c */ @@ -2521,6 +2521,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag #define PageChecked PageFsMisc #endif +/* This forces readahead on a given range of bytes in an inode */ +static inline void btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, unsigned long req_size) +{ + page_cache_sync_readahead(mapping, ra, file, offset, req_size); +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -2549,9 +2557,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -unsigned long btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, pgoff_t last_index); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -2646,12 +2651,21 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); -#else -#define btrfs_get_acl NULL -#endif int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); +#else +#define btrfs_get_acl NULL +static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) +{ + return 0; +} +static inline int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} +#endif /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 98c68e6..b52c672 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, } /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL); + btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ ret = setup_items_for_insert(trans, root, path, keys, data_size, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 8d27af4..7083d08 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -25,7 +25,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/wait.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "ctree.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 685f259..31d84e7 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); - /* - * FIXME: at some point we should handle xattr's that are larger than - * what we can fit in our leaf. We set location to NULL b/c we arent - * pointing at anything else, that will change if we store the xattr - * data in a separate inode. - */ - BUG_ON(IS_ERR(dir_item)); + if (IS_ERR(dir_item)) + return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; @@ -203,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; key.objectid = dir; btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); @@ -214,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); - if (ret > 0) { - if (path->slots[0] == 0) - return NULL; - path->slots[0]--; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != dir || - btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || - found_key.offset != key.offset) + if (ret > 0) return NULL; return btrfs_match_dir_item_name(root, path, name, name_len); @@ -320,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_key key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; key.objectid = dir; btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); @@ -329,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); - if (ret > 0) { - if (path->slots[0] == 0) - return NULL; - path->slots[0]--; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != dir || - btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || - found_key.offset != key.offset) + if (ret > 0) return NULL; return btrfs_match_dir_item_name(root, path, name, name_len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b231ae1..07b3ac6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -100,38 +100,83 @@ struct async_submit_bio { struct btrfs_work work; }; -/* These are used to set the lockdep class on the extent buffer locks. - * The class is set by the readpage_end_io_hook after the buffer has - * passed csum validation but before the pages are unlocked. +/* + * Lockdep class keys for extent_buffer->lock's in this root. For a given + * eb, the lockdep key is determined by the btrfs_root it belongs to and + * the level the eb occupies in the tree. + * + * Different roots are used for different purposes and may nest inside each + * other and they require separate keysets. As lockdep keys should be + * static, assign keysets according to the purpose of the root as indicated + * by btrfs_root->objectid. This ensures that all special purpose roots + * have separate keysets. * - * The lockdep class is also set by btrfs_init_new_buffer on freshly - * allocated blocks. + * Lock-nesting across peer nodes is always done with the immediate parent + * node locked thus preventing deadlock. As lockdep doesn't know this, use + * subclass to avoid triggering lockdep warning in such cases. * - * The class is based on the level in the tree block, which allows lockdep - * to know that lower nodes nest inside the locks of higher nodes. + * The key is set by the readpage_end_io_hook after the buffer has passed + * csum validation but before the pages are unlocked. It is also set by + * btrfs_init_new_buffer on freshly allocated blocks. * - * We also add a check to make sure the highest level of the tree is - * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this - * code needs update as well. + * We also add a check to make sure the highest level of the tree is the + * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code + * needs update as well. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # if BTRFS_MAX_LEVEL != 8 # error # endif -static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; -static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { - /* leaf */ - "btrfs-extent-00", - "btrfs-extent-01", - "btrfs-extent-02", - "btrfs-extent-03", - "btrfs-extent-04", - "btrfs-extent-05", - "btrfs-extent-06", - "btrfs-extent-07", - /* highest possible level */ - "btrfs-extent-08", + +static struct btrfs_lockdep_keyset { + u64 id; /* root objectid */ + const char *name_stem; /* lock name stem */ + char names[BTRFS_MAX_LEVEL + 1][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; +} btrfs_lockdep_keysets[] = { + { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, + { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, + { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, + { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, + { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, + { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, + { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, + { .id = 0, .name_stem = "tree" }, }; + +void __init btrfs_init_lockdep(void) +{ + int i, j; + + /* initialize lockdep class names */ + for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { + struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; + + for (j = 0; j < ARRAY_SIZE(ks->names); j++) + snprintf(ks->names[j], sizeof(ks->names[j]), + "btrfs-%s-%02d", ks->name_stem, j); + } +} + +void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, + int level) +{ + struct btrfs_lockdep_keyset *ks; + + BUG_ON(level >= ARRAY_SIZE(ks->keys)); + + /* find the matching keyset, id 0 is the default entry */ + for (ks = btrfs_lockdep_keysets; ks->id; ks++) + if (ks->id == objectid) + break; + + lockdep_set_class_and_name(&eb->lock, + &ks->keys[level], ks->names[level]); +} + #endif /* @@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; - char *map_token = NULL; char *kaddr; unsigned long map_start; unsigned long map_len; @@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, len = buf->len - offset; while (len > 0) { err = map_private_extent_buffer(buf, offset, 32, - &map_token, &kaddr, - &map_start, &map_len, KM_USER0); + &kaddr, &map_start, &map_len); if (err) return 1; cur_len = min(len, map_len - (offset - map_start)); @@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, crc, cur_len); len -= cur_len; offset += cur_len; - unmap_extent_buffer(buf, map_token, KM_USER0); } if (csum_size > sizeof(inline_result)) { result = kzalloc(csum_size * sizeof(char), GFP_NOFS); @@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) -{ - lockdep_set_class_and_name(&eb->lock, - &btrfs_eb_class[level], - btrfs_eb_name[level]); -} -#endif - static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); - btrfs_set_buffer_lockdep_class(eb, found_level); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), + eb, found_level); ret = csum_tree_block(root, eb, 1); if (ret) { @@ -1598,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } - fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -1802,6 +1836,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->thread_pool_size), &fs_info->generic_worker); + btrfs_init_workers(&fs_info->caching_workers, "cache", + 2, &fs_info->generic_worker); + /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices @@ -1855,6 +1892,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_write_workers, 1); btrfs_start_workers(&fs_info->endio_freespace_worker, 1); btrfs_start_workers(&fs_info->delayed_workers, 1); + btrfs_start_workers(&fs_info->caching_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2112,6 +2150,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: kfree(fs_info->delayed_root); fail_iput: @@ -2577,6 +2616,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0b610a..bec3ea4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +void btrfs_init_lockdep(void); +void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level); #else -static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, - int level) +static inline void btrfs_init_lockdep(void) +{ } +static inline void btrfs_set_buffer_lockdep_class(u64 objectid, + struct extent_buffer *eb, int level) { } #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456..f5be06a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static int caching_kthread(void *data) +static noinline void caching_thread(struct btrfs_work *work) { - struct btrfs_block_group_cache *block_group = data; - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key; @@ -334,9 +334,14 @@ static int caching_kthread(void *data) u32 nritems; int ret = 0; + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + goto out; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -433,13 +438,11 @@ err: free_excluded_extents(extent_root, block_group); mutex_unlock(&caching_ctl->mutex); +out: wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); - atomic_dec(&block_group->space_info->caching_threads); btrfs_put_block_group(block_group); - - return 0; } static int cache_block_group(struct btrfs_block_group_cache *cache, @@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; - struct task_struct *tsk; int ret = 0; smp_mb(); @@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->progress = cache->key.objectid; /* one for caching kthread, one for caching block group list */ atomic_set(&caching_ctl->count, 2); + caching_ctl->work.func = caching_thread; spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); - atomic_inc(&cache->space_info->caching_threads); btrfs_get_block_group(cache); - tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", - cache->key.objectid); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - printk(KERN_ERR "error running thread %d\n", ret); - BUG(); - } + btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -667,7 +663,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) struct btrfs_path *path; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + key.objectid = start; key.offset = len; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); @@ -1784,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < multi->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, stripe->length); @@ -1791,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; } kfree(multi); } - if (discarded_bytes && ret == -EOPNOTSUPP) - ret = 0; if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2932,9 +2938,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; + found->flush = 0; + init_waitqueue_head(&found->wait); *space_info = found; list_add_rcu(&found->list, &info->space_info); - atomic_set(&found->caching_threads, 0); return 0; } @@ -3275,6 +3282,9 @@ again: } ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret < 0 && ret != -ENOSPC) + goto out; + spin_lock(&space_info->lock); if (ret) space_info->full = 1; @@ -3284,6 +3294,7 @@ again: space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); +out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; } @@ -3314,6 +3325,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; + smp_mb(); + if (root->fs_info->delalloc_bytes == 0) { + if (trans) + return 0; + btrfs_wait_ordered_extents(root, 0, 0); + return 0; + } + max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { @@ -3356,6 +3375,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } + if (reclaimed >= to_reclaim && !trans) + btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } @@ -3380,15 +3401,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; - bool reserved = false; bool committed = false; + bool flushing = false; again: - ret = -ENOSPC; - if (reserved) - num_bytes = 0; - + ret = 0; spin_lock(&space_info->lock); + /* + * We only want to wait if somebody other than us is flushing and we are + * actually alloed to flush. + */ + while (flush && !flushing && space_info->flush) { + spin_unlock(&space_info->lock); + /* + * If we have a trans handle we can't wait because the flusher + * may have to commit the transaction, which would mean we would + * deadlock since we are waiting for the flusher to finish, but + * hold the current transaction open. + */ + if (trans) + return -EAGAIN; + ret = wait_event_interruptible(space_info->wait, + !space_info->flush); + /* Must have been interrupted, return */ + if (ret) + return -EINTR; + + spin_lock(&space_info->lock); + } + + ret = -ENOSPC; unused = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -3403,8 +3445,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - if (!reserved) - space_info->bytes_reserved += orig_bytes; + space_info->bytes_reserved += orig_bytes; ret = 0; } else { /* @@ -3429,17 +3470,14 @@ again: * to reclaim space we can actually use it instead of somebody else * stealing it from us. */ - if (ret && !reserved) { - space_info->bytes_reserved += orig_bytes; - reserved = true; + if (ret && flush) { + flushing = true; + space_info->flush = 1; } spin_unlock(&space_info->lock); - if (!ret) - return 0; - - if (!flush) + if (!ret || !flush) goto out; /* @@ -3447,11 +3485,11 @@ again: * metadata until after the IO is completed. */ ret = shrink_delalloc(trans, root, num_bytes, 1); - if (ret > 0) - return 0; - else if (ret < 0) + if (ret < 0) goto out; + ret = 0; + /* * So if we were overcommitted it's possible that somebody else flushed * out enough space and we simply didn't have enough space to reclaim, @@ -3462,11 +3500,11 @@ again: goto again; } - spin_lock(&space_info->lock); /* * Not enough space to be reclaimed, don't bother committing the * transaction. */ + spin_lock(&space_info->lock); if (space_info->bytes_pinned < orig_bytes) ret = -ENOSPC; spin_unlock(&space_info->lock); @@ -3474,10 +3512,13 @@ again: goto out; ret = -EAGAIN; - if (trans || committed) + if (trans) goto out; ret = -ENOSPC; + if (committed) + goto out; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto out; @@ -3489,12 +3530,12 @@ again: } out: - if (reserved) { + if (flushing) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= orig_bytes; + space_info->flush = 0; + wake_up_all(&space_info->wait); spin_unlock(&space_info->lock); } - return ret; } @@ -3704,7 +3745,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (commit_trans) { if (trans) return -EAGAIN; - trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3874,26 +3914,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, return 0; } -int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int num_items) -{ - u64 num_bytes; - int ret; - - if (num_items == 0 || root->fs_info->chunk_root == root) - return 0; - - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes); - if (!ret) { - trans->bytes_reserved += num_bytes; - trans->block_rsv = &root->fs_info->trans_block_rsv; - } - return ret; -} - void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -3944,6 +3964,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +static unsigned drop_outstanding_extent(struct inode *inode) +{ + unsigned dropped_extents = 0; + + spin_lock(&BTRFS_I(inode)->lock); + BUG_ON(!BTRFS_I(inode)->outstanding_extents); + BTRFS_I(inode)->outstanding_extents--; + + /* + * If we have more or the same amount of outsanding extents than we have + * reserved then we need to leave the reserved extents count alone. + */ + if (BTRFS_I(inode)->outstanding_extents >= + BTRFS_I(inode)->reserved_extents) + goto out; + + dropped_extents = BTRFS_I(inode)->reserved_extents - + BTRFS_I(inode)->outstanding_extents; + BTRFS_I(inode)->reserved_extents -= dropped_extents; +out: + spin_unlock(&BTRFS_I(inode)->lock); + return dropped_extents; +} + static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) { return num_bytes >>= 3; @@ -3953,9 +3997,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; - u64 to_reserve; - int nr_extents; - int reserved_extents; + u64 to_reserve = 0; + unsigned nr_extents = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) @@ -3963,66 +4006,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; - reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + + if (BTRFS_I(inode)->outstanding_extents > + BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->outstanding_extents - + BTRFS_I(inode)->reserved_extents; + BTRFS_I(inode)->reserved_extents += nr_extents; - if (nr_extents > reserved_extents) { - nr_extents -= reserved_extents; to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); - } else { - nr_extents = 0; - to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); - if (ret) + if (ret) { + unsigned dropped; + /* + * We don't need the return value since our reservation failed, + * we just need to clean up our counter. + */ + dropped = drop_outstanding_extent(inode); + WARN_ON(dropped > 1); return ret; - - atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + } block_rsv_add_bytes(block_rsv, to_reserve, 1); - if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve, 0); - return 0; } void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 to_free; - int nr_extents; - int reserved_extents; + u64 to_free = 0; + unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); - atomic_dec(&BTRFS_I(inode)->outstanding_extents); - WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); - - reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); - do { - int old, new; - - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); - if (nr_extents >= reserved_extents) { - nr_extents = 0; - break; - } - old = reserved_extents; - nr_extents = reserved_extents - nr_extents; - new = reserved_extents - nr_extents; - old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, - reserved_extents, new); - if (likely(old == reserved_extents)) - break; - reserved_extents = old; - } while (1); + dropped = drop_outstanding_extent(inode); to_free = calc_csum_metadata_size(inode, num_bytes); - if (nr_extents > 0) - to_free += btrfs_calc_trans_metadata_size(root, nr_extents); + if (dropped > 0) + to_free += btrfs_calc_trans_metadata_size(root, dropped); btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); @@ -4444,7 +4470,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, printk(KERN_ERR "umm, got %d back from search" ", was looking for %llu\n", ret, (unsigned long long)bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); + if (ret > 0) + btrfs_print_leaf(extent_root, + path->nodes[0]); } BUG_ON(ret); extent_slot = path->slots[0]; @@ -4990,14 +5018,10 @@ have_block_group: } /* - * We only want to start kthread caching if we are at - * the point where we will wait for caching to make - * progress, or if our ideal search is over and we've - * found somebody to start caching. + * The caching workers are limited to 2 threads, so we + * can queue as much work as we care to. */ - if (loop > LOOP_CACHING_NOWAIT || - (loop > LOOP_FIND_IDEAL && - atomic_read(&space_info->caching_threads) < 2)) { + if (loop > LOOP_FIND_IDEAL) { ret = cache_block_group(block_group, trans, orig_root, 0); BUG_ON(ret); @@ -5065,7 +5089,9 @@ have_block_group: * group is does point to and try again */ if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group) { + last_ptr->block_group != block_group && + index <= + get_block_group_index(last_ptr->block_group)) { btrfs_put_block_group(block_group); block_group = last_ptr->block_group; @@ -5219,8 +5245,7 @@ loop: if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; - if (!ideal_cache_percent && - atomic_read(&space_info->caching_threads)) + if (!ideal_cache_percent) goto search; /* @@ -5494,7 +5519,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, @@ -5623,7 +5649,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); - btrfs_set_buffer_lockdep_class(buf, level); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); @@ -5910,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, return 1; if (path->locks[level] && !wc->keep_locks) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -5934,7 +5960,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * keep the tree lock */ if (path->locks[level] && level > 0) { - btrfs_tree_unlock(eb); + btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; @@ -6047,7 +6073,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, BUG_ON(level != btrfs_header_level(next)); path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -6118,7 +6144,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(level == 0); btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -6127,8 +6153,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { - btrfs_tree_unlock(eb); - path->locks[level] = 0; + btrfs_tree_unlock_rw(eb, path->locks[level]); return 1; } } @@ -6150,7 +6175,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } clean_tree_block(trans, root, eb); } @@ -6229,7 +6254,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, return 0; if (path->locks[level]) { - btrfs_tree_unlock(path->nodes[level]); + btrfs_tree_unlock_rw(path->nodes[level], + path->locks[level]); path->locks[level] = 0; } free_extent_buffer(path->nodes[level]); @@ -6251,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) +void btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6265,10 +6291,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + err = -ENOMEM; + goto out; + } trans = btrfs_start_transaction(tree_root, 0); BUG_ON(IS_ERR(trans)); @@ -6281,7 +6314,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->nodes[level] = btrfs_lock_root_node(root); btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -6296,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out; + goto out_free; } WARN_ON(ret > 0); @@ -6403,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); kfree(root); } -out: +out_free: btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); - return err; +out: + if (err) + btrfs_std_error(root->fs_info, err); + return; } /* @@ -6449,7 +6485,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = 1; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -6524,30 +6560,48 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache) +static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; + u64 min_allocable_bytes; int ret = -ENOSPC; - if (cache->ro) - return 0; + + /* + * We need some metadata space and system metadata space for + * allocating chunks in some corner cases until we force to set + * it to be readonly. + */ + if ((sinfo->flags & + (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && + !force) + min_allocable_bytes = 1 * 1024 * 1024; + else + min_allocable_bytes = 0; spin_lock(&sinfo->lock); spin_lock(&cache->lock); + + if (cache->ro) { + ret = 0; + goto out; + } + num_bytes = cache->key.offset - cache->reserved - cache->pinned - cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { + cache->reserved_pinned + num_bytes + min_allocable_bytes <= + sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } - +out: spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -6571,7 +6625,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, CHUNK_ALLOC_FORCE); - ret = set_block_group_ro(cache); + ret = set_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -6579,7 +6633,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache); + ret = set_block_group_ro(cache, 0); out: btrfs_end_transaction(trans, root); return ret; @@ -6680,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + u64 min_free; + u64 dev_min = 1; + u64 dev_nr = 0; + int index; int full = 0; int ret = 0; @@ -6689,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (!block_group) return -1; + min_free = btrfs_block_group_used(&block_group->item); + /* no bytes used, we're good */ - if (!btrfs_block_group_used(&block_group->item)) + if (!min_free) goto out; space_info = block_group->space_info; @@ -6706,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * all of the extents from this block group. If we can, we're good */ if ((space_info->total_bytes != block_group->key.offset) && - (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - btrfs_block_group_used(&block_group->item) < - space_info->total_bytes)) { + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + min_free < space_info->total_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -6726,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (full) goto out; + /* + * index: + * 0: raid10 + * 1: raid1 + * 2: dup + * 3: raid0 + * 4: single + */ + index = get_block_group_index(block_group); + if (index == 0) { + dev_min = 4; + /* Divide by 2 */ + min_free >>= 1; + } else if (index == 1) { + dev_min = 2; + } else if (index == 2) { + /* Multiply by 2 */ + min_free <<= 1; + } else if (index == 3) { + dev_min = fs_devices->rw_devices; + do_div(min_free, dev_min); + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 min_free = btrfs_block_group_used(&block_group->item); u64 dev_offset; /* @@ -6739,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = find_free_dev_extent(NULL, device, min_free, &dev_offset, NULL); if (!ret) + dev_nr++; + + if (dev_nr >= dev_min) break; + ret = -1; } } @@ -7016,7 +7101,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -7030,9 +7115,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) * mirrored block groups. */ list_for_each_entry(cache, &space_info->block_groups[3], list) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[4], list) - set_block_group_ro(cache); + set_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -7162,11 +7247,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } inode = lookup_free_space_inode(root, block_group, path); if (!IS_ERR(inode)) { - btrfs_orphan_add(trans, inode); + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11..d418164 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -254,14 +254,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, * * This should be called with the tree lock held. */ -static int merge_state(struct extent_io_tree *tree, - struct extent_state *state) +static void merge_state(struct extent_io_tree *tree, + struct extent_state *state) { struct extent_state *other; struct rb_node *other_node; if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) - return 0; + return; other_node = rb_prev(&state->rb_node); if (other_node) { @@ -281,26 +281,19 @@ static int merge_state(struct extent_io_tree *tree, if (other->start == state->end + 1 && other->state == state->state) { merge_cb(tree, state, other); - other->start = state->start; - state->tree = NULL; - rb_erase(&state->rb_node, &tree->state); - free_extent_state(state); - state = NULL; + state->end = other->end; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); } } - - return 0; } -static int set_state_cb(struct extent_io_tree *tree, +static void set_state_cb(struct extent_io_tree *tree, struct extent_state *state, int *bits) { - if (tree->ops && tree->ops->set_bit_hook) { - return tree->ops->set_bit_hook(tree->mapping->host, - state, bits); - } - - return 0; + if (tree->ops && tree->ops->set_bit_hook) + tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, @@ -310,6 +303,9 @@ static void clear_state_cb(struct extent_io_tree *tree, tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, int *bits); + /* * insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. @@ -325,8 +321,6 @@ static int insert_state(struct extent_io_tree *tree, int *bits) { struct rb_node *node; - int bits_to_set = *bits & ~EXTENT_CTLBITS; - int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -336,13 +330,9 @@ static int insert_state(struct extent_io_tree *tree, } state->start = start; state->end = end; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; - if (bits_to_set & EXTENT_DIRTY) - tree->dirty_bytes += end - start + 1; - state->state |= bits_to_set; + set_state_bits(tree, state, bits); + node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -351,7 +341,6 @@ static int insert_state(struct extent_io_tree *tree, "%llu %llu\n", (unsigned long long)found->start, (unsigned long long)found->end, (unsigned long long)start, (unsigned long long)end); - free_extent_state(state); return -EEXIST; } state->tree = tree; @@ -359,13 +348,11 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, +static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, u64 split) { if (tree->ops && tree->ops->split_extent_hook) - return tree->ops->split_extent_hook(tree->mapping->host, - orig, split); - return 0; + tree->ops->split_extent_hook(tree->mapping->host, orig, split); } /* @@ -500,7 +487,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start == start) { + if (cached && cached->tree && cached->start <= start && + cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -660,34 +648,25 @@ again: if (start > end) break; - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + cond_resched_lock(&tree->lock); } out: spin_unlock(&tree->lock); return 0; } -static int set_state_bits(struct extent_io_tree *tree, +static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int *bits) { - int ret; int bits_to_set = *bits & ~EXTENT_CTLBITS; - ret = set_state_cb(tree, state, bits); - if (ret) - return ret; + set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } state->state |= bits_to_set; - - return 0; } static void cache_state(struct extent_state *state, @@ -742,7 +721,8 @@ again: spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->start == start && state->tree) { + if (state->start <= start && state->end > start && + state->tree) { node = &state->rb_node; goto hit_next; } @@ -779,17 +759,15 @@ hit_next: goto out; } - err = set_state_bits(tree, state, &bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); - next_node = rb_next(node); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; + next_node = rb_next(&state->rb_node); if (next_node && start < end && prealloc && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); @@ -830,9 +808,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, &bits); - if (err) - goto out; + set_state_bits(tree, state, &bits); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -862,7 +838,6 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - atomic_inc(&prealloc->refs); err = insert_state(tree, prealloc, start, this_end, &bits); BUG_ON(err == -EEXIST); @@ -872,7 +847,6 @@ hit_next: goto out; } cache_state(prealloc, cached_state); - free_extent_state(prealloc); prealloc = NULL; start = this_end + 1; goto search_again; @@ -895,11 +869,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, &bits); - if (err) { - prealloc = NULL; - goto out; - } + set_state_bits(tree, prealloc, &bits); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1061,46 +1031,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) return 0; } -/* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. - * - * If nothing was found, 1 is returned, < 0 on error - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return ret; -} - /* find the first state struct with 'bits' set after 'start', and * return it. tree->lock must be held. NULL will returned if * nothing was found after 'start' @@ -1133,6 +1063,30 @@ out: } /* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, bits); + if (state) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + } + spin_unlock(&tree->lock); + return ret; +} + +/* * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * @@ -1564,7 +1518,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start == start) + if (cached && cached->tree && cached->start <= start && + cached->end > start) node = &cached->rb_node; else node = tree_search(tree, start); @@ -2432,6 +2387,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2442,11 +2398,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, end = wbc->range_end >> PAGE_CACHE_SHIFT; scanned = 1; } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { unsigned i; scanned = 1; @@ -2541,7 +2502,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, struct writeback_control *wbc) { int ret; - struct address_space *mapping = page->mapping; struct extent_page_data epd = { .bio = NULL, .tree = tree, @@ -2549,18 +2509,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; - struct writeback_control wbc_writepages = { - .sync_mode = wbc->sync_mode, - .older_than_this = NULL, - .nr_to_write = 64, - .range_start = page_offset(page) + PAGE_CACHE_SIZE, - .range_end = (loff_t)-1, - }; ret = __extent_writepage(page, wbc, &epd); - extent_write_cache_pages(tree, mapping, &wbc_writepages, - __extent_writepage, &epd, flush_write_bio); flush_epd_write_bio(&epd); return ret; } @@ -2584,7 +2535,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, }; struct writeback_control wbc_writepages = { .sync_mode = mode, - .older_than_this = NULL, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, @@ -3022,8 +2972,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return NULL; eb->start = start; eb->len = len; - spin_lock_init(&eb->lock); - init_waitqueue_head(&eb->lock_wq); + rwlock_init(&eb->lock); + atomic_set(&eb->write_locks, 0); + atomic_set(&eb->read_locks, 0); + atomic_set(&eb->blocking_readers, 0); + atomic_set(&eb->blocking_writers, 0); + atomic_set(&eb->spinning_readers, 0); + atomic_set(&eb->spinning_writers, 0); + init_waitqueue_head(&eb->write_lock_wq); + init_waitqueue_head(&eb->read_lock_wq); #if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); @@ -3119,7 +3076,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, i = 0; } for (; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); + p = find_or_create_page(mapping, index, GFP_NOFS); if (!p) { WARN_ON(1); goto free_eb; @@ -3266,6 +3223,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, return was_dirty; } +static int __eb_straddles_pages(u64 start, u64 len) +{ + if (len < PAGE_CACHE_SIZE) + return 1; + if (start & (PAGE_CACHE_SIZE - 1)) + return 1; + if ((start + len) & (PAGE_CACHE_SIZE - 1)) + return 1; + return 0; +} + +static int eb_straddles_pages(struct extent_buffer *eb) +{ + return __eb_straddles_pages(eb->start, eb->len); +} + int clear_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state **cached_state) @@ -3277,8 +3250,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); + if (eb_straddles_pages(eb)) { + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3296,8 +3271,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); - set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - NULL, GFP_NOFS); + if (eb_straddles_pages(eb)) { + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + NULL, GFP_NOFS); + } for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3320,9 +3297,12 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; + if (__eb_straddles_pages(start, end - start + 1)) { + ret = test_range_bit(tree, start, end, + EXTENT_UPTODATE, 1, NULL); + if (ret) + return 1; + } while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); @@ -3350,10 +3330,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; - ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, cached_state); - if (ret) - return ret; + if (eb_straddles_pages(eb)) { + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, cached_state); + if (ret) + return ret; + } num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { @@ -3386,9 +3368,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; - if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL)) { - return 0; + if (eb_straddles_pages(eb)) { + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1, NULL)) { + return 0; + } } if (start) { @@ -3492,9 +3476,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, page = extent_buffer_page(eb, i); cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(dst, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER1); dst += cur; len -= cur; @@ -3504,9 +3487,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km) + unsigned long *map_len) { size_t offset = start & (PAGE_CACHE_SIZE - 1); char *kaddr; @@ -3536,42 +3519,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, } p = extent_buffer_page(eb, i); - kaddr = kmap_atomic(p, km); - *token = kaddr; + kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; return 0; } -int map_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, - char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km) -{ - int err; - int save = 0; - if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, km); - eb->map_token = NULL; - save = 1; - } - err = map_private_extent_buffer(eb, start, min_len, token, map, - map_start, map_len, km); - if (!err && save) { - eb->map_token = *token; - eb->kaddr = *map; - eb->map_start = *map_start; - eb->map_len = *map_len; - } - return err; -} - -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) -{ - kunmap_atomic(token, km); -} - int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) @@ -3595,9 +3548,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); ret = memcmp(ptr, kaddr + offset, cur); - kunmap_atomic(kaddr, KM_USER0); if (ret) break; @@ -3630,9 +3582,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER1); + kaddr = page_address(page); memcpy(kaddr + offset, src, cur); - kunmap_atomic(kaddr, KM_USER1); src += cur; len -= cur; @@ -3661,9 +3612,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); memset(kaddr + offset, c, cur); - kunmap_atomic(kaddr, KM_USER0); len -= cur; offset = 0; @@ -3694,9 +3644,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = page_address(page); read_extent_buffer(src, kaddr + offset, src_offset, cur); - kunmap_atomic(kaddr, KM_USER0); src_offset += cur; len -= cur; @@ -3709,20 +3658,17 @@ static void move_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); if (dst_page == src_page) { memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); } else { - char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *src_kaddr = page_address(src_page); char *p = dst_kaddr + dst_off + len; char *s = src_kaddr + src_off + len; while (len--) *--p = *--s; - - kunmap_atomic(src_kaddr, KM_USER1); } - kunmap_atomic(dst_kaddr, KM_USER0); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -3735,20 +3681,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) { - char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *dst_kaddr = page_address(dst_page); char *src_kaddr; if (dst_page != src_page) { - src_kaddr = kmap_atomic(src_page, KM_USER1); + src_kaddr = page_address(src_page); } else { src_kaddr = dst_kaddr; BUG_ON(areas_overlap(src_off, dst_off, len)); } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); - kunmap_atomic(dst_kaddr, KM_USER0); - if (dst_page != src_page) - kunmap_atomic(src_kaddr, KM_USER1); } void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a11a92e..7b2f0c3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -76,15 +76,15 @@ struct extent_io_ops { struct extent_state *state); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); - int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); - int (*merge_extent_hook)(struct inode *inode, - struct extent_state *new, - struct extent_state *other); - int (*split_extent_hook)(struct inode *inode, - struct extent_state *orig, u64 split); + void (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); + void (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + void (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); int (*write_cache_pages_lock_hook)(struct page *page); }; @@ -108,8 +108,6 @@ struct extent_state { wait_queue_head_t wq; atomic_t refs; unsigned long state; - u64 split_start; - u64 split_end; /* for use by the FS */ u64 private; @@ -120,8 +118,6 @@ struct extent_state { struct extent_buffer { u64 start; unsigned long len; - char *map_token; - char *kaddr; unsigned long map_start; unsigned long map_len; struct page *first_page; @@ -130,14 +126,26 @@ struct extent_buffer { struct rcu_head rcu_head; atomic_t refs; - /* the spinlock is used to protect most operations */ - spinlock_t lock; + /* count of read lock holders on the extent buffer */ + atomic_t write_locks; + atomic_t read_locks; + atomic_t blocking_writers; + atomic_t blocking_readers; + atomic_t spinning_readers; + atomic_t spinning_writers; + + /* protects write locks */ + rwlock_t lock; - /* - * when we keep the lock held while blocking, waiters go onto - * the wq + /* readers use lock_wq while they wait for the write + * lock holders to unlock */ - wait_queue_head_t lock_wq; + wait_queue_head_t write_lock_wq; + + /* writers use read_lock_wq while they wait for readers + * to unlock + */ + wait_queue_head_t read_lock_wq; }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -279,15 +287,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, int extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb, struct extent_state *cached_state); -int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, - unsigned long *map_start, - unsigned long *map_len, int km); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **token, char **map, + unsigned long min_len, char **map, unsigned long *map_start, - unsigned long *map_len, int km); -void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); + unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2d04103..7c97b33 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) { - int ret = 0; struct extent_map *merge = NULL; struct rb_node *rb; - struct extent_map *em; - - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); - - WARN_ON(!em || em->start != start); - - if (!em) - goto out; - - clear_bit(EXTENT_FLAG_PINNED, &em->flags); if (em->start != 0) { rb = rb_prev(&em->rb_node); @@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) merge->in_tree = 0; free_extent_map(merge); } +} + +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(!em || em->start != start); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + try_merge_map(tree, em); free_extent_map(em); out: @@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { int ret = 0; - struct extent_map *merge = NULL; struct rb_node *rb; struct extent_map *exist; @@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree, goto out; } atomic_inc(&em->refs); - if (em->start != 0) { - rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { - em->start = merge->start; - em->len += merge->len; - em->block_len += merge->block_len; - em->block_start = merge->block_start; - merge->in_tree = 0; - rb_erase(&merge->rb_node, &tree->map); - free_extent_map(merge); - } - } - rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { - em->len += merge->len; - em->block_len += merge->len; - rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; - free_extent_map(merge); - } + + try_merge_map(tree, em); out: return ret; } @@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len) return start + len; } -/** - * lookup_extent_mapping - lookup extent_map - * @tree: tree to lookup in - * @start: byte offset to start the search - * @len: length of the lookup range - * - * Find and return the first extent_map struct in @tree that intersects the - * [start, len] range. There may be additional objects in the tree that - * intersect, so check the object returned carefully to make sure that no - * additional lookups are needed. - */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; @@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 end = range_end(start, len); rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node && prev) { - em = rb_entry(prev, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - } - if (!rb_node && next) { - em = rb_entry(next, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - } if (!rb_node) { - em = NULL; - goto out; - } - if (IS_ERR(rb_node)) { - em = ERR_CAST(rb_node); - goto out; + if (prev) + rb_node = prev; + else if (next) + rb_node = next; + else + return NULL; } + em = rb_entry(rb_node, struct extent_map, rb_node); - if (end > em->start && start < extent_map_end(em)) - goto found; - em = NULL; - goto out; + if (strict && !(end > em->start && start < extent_map_end(em))) + return NULL; -found: atomic_inc(&em->refs); -out: return em; } /** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + return __lookup_extent_mapping(tree, start, len, 1); +} + +/** * search_extent_mapping - find a nearby extent map * @tree: tree to lookup in * @start: byte offset to start the search @@ -365,38 +341,7 @@ out: struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len) { - struct extent_map *em; - struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; - - rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node && prev) { - em = rb_entry(prev, struct extent_map, rb_node); - goto found; - } - if (!rb_node && next) { - em = rb_entry(next, struct extent_map, rb_node); - goto found; - } - if (!rb_node) { - em = NULL; - goto out; - } - if (IS_ERR(rb_node)) { - em = ERR_CAST(rb_node); - goto out; - } - em = rb_entry(rb_node, struct extent_map, rb_node); - goto found; - - em = NULL; - goto out; - -found: - atomic_inc(&em->refs); -out: - return em; + return __lookup_extent_mapping(tree, start, len, 0); } /** diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90d4ee5..a1cb782 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -177,6 +177,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, WARN_ON(bio->bi_vcnt <= 0); + /* + * the free space stuff is only read when it hasn't been + * updated in the current transaction. So, we can safely + * read from the commit root and sidestep a nasty deadlock + * between reading the free space cache and updating the csum tree. + */ + if (btrfs_is_free_space_inode(root, inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } + disk_bytenr = (u64)bio->bi_sector << 9; if (dio) offset = logical_offset; @@ -282,7 +293,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; if (search_commit) { path->skip_locking = 1; @@ -664,15 +676,13 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - char *eb_map; - char *eb_token; - unsigned long map_len; - unsigned long map_start; u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + sector_sum = sums->sums; again: next_offset = (u64)-1; @@ -814,30 +824,9 @@ found: item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + btrfs_item_size_nr(leaf, path->slots[0])); - eb_token = NULL; next_sector: - if (!eb_token || - (unsigned long)item + csum_size >= map_start + map_len) { - int err; - - if (eb_token) - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - err = map_private_extent_buffer(leaf, (unsigned long)item, - csum_size, - &eb_token, &eb_map, - &map_start, &map_len, KM_USER1); - if (err) - eb_token = NULL; - } - if (eb_token) { - memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), - §or_sum->sum, csum_size); - } else { - write_extent_buffer(leaf, §or_sum->sum, - (unsigned long)item, csum_size); - } + write_extent_buffer(leaf, §or_sum->sum, (unsigned long)item, csum_size); total_bytes += root->sectorsize; sector_sum++; @@ -850,10 +839,7 @@ next_sector: goto next_sector; } } - if (eb_token) { - unmap_extent_buffer(leaf, eb_token, KM_USER1); - eb_token = NULL; - } + btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 59cbdb1..3c3abff 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -74,7 +74,7 @@ struct inode_defrag { * If an existing record is found the defrag item you * pass in is freed */ -static int __btrfs_add_inode_defrag(struct inode *inode, +static void __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode, BTRFS_I(inode)->in_defrag = 1; rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return 0; + return; exists: kfree(defrag); - return 0; + return; } @@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, { struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; - int ret = 0; u64 transid; if (!btrfs_test_opt(root, AUTO_DEFRAG)) @@ -150,9 +149,11 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->defrag_inodes_lock); if (!BTRFS_I(inode)->in_defrag) - ret = __btrfs_add_inode_defrag(inode, defrag); + __btrfs_add_inode_defrag(inode, defrag); + else + kfree(defrag); spin_unlock(&root->fs_info->defrag_inodes_lock); - return ret; + return 0; } /* @@ -855,7 +856,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; again: recow = 0; split = start; @@ -1059,7 +1061,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos) static noinline int prepare_pages(struct btrfs_root *root, struct file *file, struct page **pages, size_t num_pages, loff_t pos, unsigned long first_index, - unsigned long last_index, size_t write_bytes) + size_t write_bytes) { struct extent_state *cached_state = NULL; int i; @@ -1073,15 +1075,10 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, start_pos = pos & ~((u64)root->sectorsize - 1); last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; - if (start_pos > inode->i_size) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); - if (err) - return err; - } - again: for (i = 0; i < num_pages; i++) { - pages[i] = grab_cache_page(inode->i_mapping, index + i); + pages[i] = find_or_create_page(inode->i_mapping, index + i, + GFP_NOFS); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1158,7 +1155,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; unsigned long first_index; - unsigned long last_index; size_t num_written = 0; int nrptrs; int ret = 0; @@ -1171,7 +1167,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, return -ENOMEM; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT; while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); @@ -1205,8 +1200,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * contents of pages from loop to loop */ ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, last_index, - write_bytes); + pos, first_index, write_bytes); if (ret) { btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); @@ -1238,9 +1232,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * managed to copy. */ if (num_pages > dirty_pages) { - if (copied > 0) - atomic_inc( - &BTRFS_I(inode)->outstanding_extents); + if (copied > 0) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } btrfs_delalloc_release_space(inode, (num_pages - dirty_pages) << PAGE_CACHE_SHIFT); @@ -1336,6 +1332,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; loff_t *ppos = &iocb->ki_pos; + u64 start_pos; ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; @@ -1384,6 +1381,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, file_update_time(file); BTRFS_I(inode)->sequence++; + start_pos = round_down(pos, root->sectorsize); + if (start_pos > i_size_read(inode)) { + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, pos, ppos, count, ocount); @@ -1638,11 +1644,15 @@ static long btrfs_fallocate(struct file *file, int mode, cur_offset = alloc_start; while (1) { + u64 actual_end; + em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); BUG_ON(IS_ERR_OR_NULL(em)); last_byte = min(extent_map_end(em), alloc_end); + actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { @@ -1655,6 +1665,16 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } + } else if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size. + */ + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); } free_extent_map(em); @@ -1804,10 +1824,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) } } - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) - return -EINVAL; - if (offset > inode->i_sb->s_maxbytes) - return -EINVAL; + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { + ret = -EINVAL; + goto out; + } + if (offset > inode->i_sb->s_maxbytes) { + ret = -EINVAL; + goto out; + } /* Special lock needed here? */ if (offset != file->f_pos) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bf0d615..41ac927 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + printk(KERN_INFO "Old style space inode found, converting.\n"); + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + block_group->disk_cache_state = BTRFS_DC_CLEAR; + } + if (!btrfs_fs_closing(root->fs_info)) { block_group->inode = igrab(inode); block_group->iref = 1; @@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + BTRFS_INODE_PREALLOC); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -184,9 +190,11 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { + struct btrfs_block_rsv *rsv; loff_t oldsize; int ret = 0; + rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, @@ -204,6 +212,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); + + trans->block_rsv = rsv; if (ret) { WARN_ON(1); return ret; @@ -239,17 +249,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct page *page; - u32 *checksums = NULL, *crc; - char *disk_crcs = NULL; struct btrfs_key key; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - u32 cur_crc = ~(u32)0; pgoff_t index = 0; - unsigned long first_page_offset; - int num_checksums; int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -292,16 +297,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) goto out; - /* Setup everything for doing checksumming */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); - if (!checksums) - goto out; - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - disk_crcs = kzalloc(first_page_offset, GFP_NOFS); - if (!disk_crcs) - goto out; - ret = readahead_cache(inode); if (ret) goto out; @@ -311,18 +306,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space *e; void *addr; unsigned long offset = 0; - unsigned long start_offset = 0; int need_loop = 0; if (!num_entries && !num_bitmaps) break; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) goto free_cache; @@ -342,8 +331,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (index == 0) { u64 *gen; - memcpy(disk_crcs, addr, first_page_offset); - gen = addr + (sizeof(u32) * num_checksums); + /* + * We put a bogus crc in the front of the first page in + * case old kernels try to mount a fs with the new + * format to make sure they discard the cache. + */ + addr += sizeof(u64); + offset += sizeof(u64); + + gen = addr; if (*gen != BTRFS_I(inode)->generation) { printk(KERN_ERR "btrfs: space cache generation" " (%llu) does not match inode (%llu)\n", @@ -355,24 +351,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } - crc = (u32 *)disk_crcs; + addr += sizeof(u64); + offset += sizeof(u64); } - entry = addr + start_offset; - - /* First lets check our crc before we do anything fun */ - cur_crc = ~(u32)0; - cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, - PAGE_CACHE_SIZE - start_offset); - btrfs_csum_final(cur_crc, (char *)&cur_crc); - if (cur_crc != *crc) { - printk(KERN_ERR "btrfs: crc mismatch for page %lu\n", - index); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - crc++; + entry = addr; while (1) { if (!num_entries) @@ -470,8 +452,6 @@ next: ret = 1; out: - kfree(checksums); - kfree(disk_crcs); return ret; free_cache: __btrfs_remove_free_space_cache(ctl); @@ -569,8 +549,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_key key; u64 start, end, len; u64 bytes = 0; - u32 *crc, *checksums; - unsigned long first_page_offset; + u32 crc = ~(u32)0; int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; @@ -590,34 +569,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); - /* make sure we don't overflow that first page */ - if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { - /* this is really the same as running out of space, where we also return 0 */ - printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); - ret = 0; - goto out_update; - } - - /* We need a checksum per page. */ - crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); - if (!crc) - return -1; - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) { - kfree(crc); + if (!pages) return -1; - } /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -640,7 +598,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * know and don't freak out. */ while (index < num_pages) { - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); if (!page) { int i; @@ -648,7 +606,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_page(pages[i]); page_cache_release(pages[i]); } - goto out_free; + goto out; } pages[index] = page; index++; @@ -668,17 +626,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; - void *addr; + void *addr, *orig; unsigned long offset = 0; - unsigned long start_offset = 0; next_page = false; - if (index == 0) { - start_offset = first_page_offset; - offset = start_offset; - } - if (index >= num_pages) { out_of_space = true; break; @@ -686,10 +638,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, page = pages[index]; - addr = kmap(page); - entry = addr + start_offset; + orig = addr = kmap(page); + if (index == 0) { + u64 *gen; - memset(addr, 0, PAGE_CACHE_SIZE); + /* + * We're going to put in a bogus crc for this page to + * make sure that old kernels who aren't aware of this + * format will be sure to discard the cache. + */ + addr += sizeof(u64); + offset += sizeof(u64); + + gen = addr; + *gen = trans->transid; + addr += sizeof(u64); + offset += sizeof(u64); + } + entry = addr; + + memset(addr, 0, PAGE_CACHE_SIZE - offset); while (node && !next_page) { struct btrfs_free_space *e; @@ -752,13 +720,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, next_page = true; entry++; } - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr + start_offset, *crc, - PAGE_CACHE_SIZE - start_offset); - kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; + /* Generate bogus crc value */ + if (index == 0) { + u32 *tmp; + crc = btrfs_csum_data(root, orig + sizeof(u64), crc, + PAGE_CACHE_SIZE - sizeof(u64)); + btrfs_csum_final(crc, (char *)&crc); + crc++; + tmp = orig; + *tmp = crc; + } + + kunmap(page); bytes += PAGE_CACHE_SIZE; @@ -779,11 +753,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, addr = kmap(page); memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - *crc = ~(u32)0; - *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); kunmap(page); - btrfs_csum_final(*crc, (char *)crc); - crc++; bytes += PAGE_CACHE_SIZE; list_del_init(&entry->list); @@ -796,7 +766,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, i_size_read(inode) - 1, &cached_state, GFP_NOFS); ret = 0; - goto out_free; + goto out; } /* Zero out the rest of the pages just to make sure */ @@ -811,20 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, index++; } - /* Write the checksums and trans id to the first page */ - { - void *addr; - u64 *gen; - - page = pages[0]; - - addr = kmap(page); - memcpy(addr, checksums, sizeof(u32) * num_pages); - gen = addr + (sizeof(u32) * num_pages); - *gen = trans->transid; - kunmap(page); - } - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, bytes, &cached_state); btrfs_drop_pages(pages, num_pages); @@ -833,7 +789,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (ret) { ret = 0; - goto out_free; + goto out; } BTRFS_I(inode)->generation = trans->transid; @@ -850,7 +806,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); - goto out_free; + goto out; } leaf = path->nodes[0]; if (ret > 0) { @@ -866,7 +822,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); btrfs_release_path(path); - goto out_free; + goto out; } } header = btrfs_item_ptr(leaf, path->slots[0], @@ -879,11 +835,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = 1; -out_free: - kfree(checksums); +out: kfree(pages); - -out_update: if (ret != 1) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; @@ -1219,9 +1172,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); } -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) +static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + u64 offset, u64 bytes) { unsigned long start, count; @@ -1232,6 +1185,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; +} + +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + __bitmap_clear_bits(ctl, info, offset, bytes); ctl->free_space -= bytes; } @@ -2035,7 +1995,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, return 0; ret = search_start; - bitmap_clear_bits(ctl, entry, ret, bytes); + __bitmap_clear_bits(ctl, entry, ret, bytes); return ret; } @@ -2090,7 +2050,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, continue; } } else { - ret = entry->offset; entry->offset += bytes; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e91b097..4d14de6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, return alloc_hint; } -static inline bool is_free_space_inode(struct btrfs_root *root, - struct inode *inode) -{ - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; - return false; -} - /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; - BUG_ON(is_free_space_inode(root, inode)); + BUG_ON(btrfs_is_free_space_inode(root, inode)); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1070,9 +1061,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; - nolock = is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(root, inode); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -1291,15 +1283,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } -static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) +static void btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) { /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) - return 0; + return; - atomic_inc(&BTRFS_I(inode)->outstanding_extents); - return 0; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); } /* @@ -1308,16 +1301,17 @@ static int btrfs_split_extent_hook(struct inode *inode, * extents, such as when we are doing sequential writes, so we can properly * account for the metadata space we'll need. */ -static int btrfs_merge_extent_hook(struct inode *inode, - struct extent_state *new, - struct extent_state *other) +static void btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) { /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) - return 0; + return; - atomic_dec(&BTRFS_I(inode)->outstanding_extents); - return 0; + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); } /* @@ -1325,8 +1319,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static int btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) +static void btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@ -1337,12 +1331,15 @@ static int btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(root, inode); - if (*bits & EXTENT_FIRST_DELALLOC) + if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - else - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; @@ -1353,14 +1350,13 @@ static int btrfs_set_bit_hook(struct inode *inode, } spin_unlock(&root->fs_info->delalloc_lock); } - return 0; } /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) +static void btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -1370,12 +1366,15 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - bool do_list = !is_free_space_inode(root, inode); + bool do_list = !btrfs_is_free_space_inode(root, inode); - if (*bits & EXTENT_FIRST_DELALLOC) + if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - else if (!(*bits & EXTENT_DO_ACCOUNTING)) - atomic_dec(&BTRFS_I(inode)->outstanding_extents); + } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + } if (*bits & EXTENT_DO_ACCOUNTING) btrfs_delalloc_release_metadata(inode, len); @@ -1394,7 +1393,6 @@ static int btrfs_clear_bit_hook(struct inode *inode, } spin_unlock(&root->fs_info->delalloc_lock); } - return 0; } /* @@ -1477,7 +1475,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - if (is_free_space_inode(root, inode)) + if (btrfs_is_free_space_inode(root, inode)) ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); else ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); @@ -1644,7 +1642,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, int ret; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; @@ -1726,7 +1725,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) return 0; BUG_ON(!ordered_extent); - nolock = is_free_space_inode(root, inode); + nolock = btrfs_is_free_space_inode(root, inode); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); @@ -1787,7 +1786,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) &ordered_extent->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { + if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); } @@ -2214,7 +2213,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (!root->orphan_block_rsv) { block_rsv = btrfs_alloc_block_rsv(root); - BUG_ON(!block_rsv); + if (!block_rsv) + return -ENOMEM; } spin_lock(&root->orphan_lock); @@ -2516,7 +2516,9 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + goto make_bad; + path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); @@ -2531,13 +2533,6 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - if (!leaf->map_token) - map_private_extent_buffer(leaf, (unsigned long)inode_item, - sizeof(struct btrfs_inode_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); inode->i_uid = btrfs_inode_uid(leaf, inode_item); @@ -2575,11 +2570,6 @@ cache_acl: if (!maybe_acls) cache_no_acl(inode); - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } - btrfs_free_path(path); switch (inode->i_mode & S_IFMT) { @@ -2624,13 +2614,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - if (!leaf->map_token) - map_private_extent_buffer(leaf, (unsigned long)item, - sizeof(struct btrfs_inode_item), - &leaf->map_token, &leaf->kaddr, - &leaf->map_start, &leaf->map_len, - KM_USER1); - btrfs_set_inode_uid(leaf, item, inode->i_uid); btrfs_set_inode_gid(leaf, item, inode->i_gid); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); @@ -2659,11 +2642,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); btrfs_set_inode_block_group(leaf, item, 0); - - if (leaf->map_token) { - unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); - leaf->map_token = NULL; - } } /* @@ -2684,7 +2662,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!is_free_space_inode(root, inode) + if (!btrfs_is_free_space_inode(root, inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) @@ -3021,13 +2999,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); - BUG_ON(ret); + if (ret) + goto out; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); + if (ret) + goto out; } +out: nr = trans->blocks_used; __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); @@ -3170,6 +3151,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = -1; + if (root->ref_cows || root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); @@ -3182,10 +3168,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (min_type == 0 && root == BTRFS_I(inode)->root) btrfs_kill_delayed_inode_items(inode); - path = btrfs_alloc_path(); - BUG_ON(!path); - path->reada = -1; - key.objectid = ino; key.offset = (u64)-1; key.type = (u8)-1; @@ -3398,7 +3380,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; @@ -3528,15 +3510,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, &hint_byte, 1); - if (err) + if (err) { + btrfs_end_transaction(trans, root); break; + } err = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); - if (err) + if (err) { + btrfs_end_transaction(trans, root); break; + } btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); @@ -3634,7 +3620,7 @@ void btrfs_evict_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - is_free_space_inode(root, inode))) + btrfs_is_free_space_inode(root, inode))) goto no_delete; if (is_bad_inode(inode)) { @@ -3713,7 +3699,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, int ret = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, namelen, 0); @@ -3978,10 +3965,16 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - inode_tree_add(inode); - unlock_new_inode(inode); - if (new) - *new = 1; + if (!is_bad_inode(inode)) { + inode_tree_add(inode); + unlock_new_inode(inode); + if (new) + *new = 1; + } else { + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(-ESTALE); + } } return inode; @@ -4016,12 +4009,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *sub_root = root; struct btrfs_key location; int index; - int ret; + int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location); + if (unlikely(d_need_lookup(dentry))) { + memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); + kfree(dentry->d_fsdata); + dentry->d_fsdata = NULL; + d_clear_need_lookup(dentry); + } else { + ret = btrfs_inode_by_name(dir, dentry, &location); + } if (ret < 0) return ERR_PTR(ret); @@ -4076,6 +4076,12 @@ static int btrfs_dentry_delete(const struct dentry *dentry) return 0; } +static void btrfs_dentry_release(struct dentry *dentry) +{ + if (dentry->d_fsdata) + kfree(dentry->d_fsdata); +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -4098,6 +4104,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_path *path; struct list_head ins_list; struct list_head del_list; + struct qstr q; int ret; struct extent_buffer *leaf; int slot; @@ -4187,6 +4194,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + struct dentry *tmp; if (verify_dir_item(root, leaf, di)) break; @@ -4207,6 +4215,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); + q.name = name_ptr; + q.len = name_len; + q.hash = full_name_hash(q.name, q.len); + tmp = d_lookup(filp->f_dentry, &q); + if (!tmp) { + struct btrfs_key *newkey; + + newkey = kzalloc(sizeof(struct btrfs_key), + GFP_NOFS); + if (!newkey) + goto no_dentry; + tmp = d_alloc(filp->f_dentry, &q); + if (!tmp) { + kfree(newkey); + dput(tmp); + goto no_dentry; + } + memcpy(newkey, &location, + sizeof(struct btrfs_key)); + tmp->d_fsdata = newkey; + tmp->d_flags |= DCACHE_NEED_LOOKUP; + d_rehash(tmp); + dput(tmp); + } else { + dput(tmp); + } +no_dentry: /* is this a reference to our own snapshot? If so * skip it */ @@ -4271,7 +4306,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (BTRFS_I(inode)->dummy_inode) return 0; - if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -4432,7 +4467,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, int owner; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return ERR_PTR(-ENOMEM); inode = new_inode(root->fs_info->sb); if (!inode) { @@ -4467,7 +4503,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); - if (mode & S_IFDIR) + if (S_ISDIR(mode)) owner = 0; else owner = 1; @@ -4512,7 +4548,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); - if ((mode & S_IFREG)) { + if (S_ISREG(mode)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW) || @@ -5787,7 +5823,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret) + if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) btrfs_update_inode(trans, root, inode); ret = 0; out_unlock: @@ -6692,19 +6728,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, return 0; } -/* helper function for file defrag and space balancing. This - * forces readahead on a given range of bytes in an inode - */ -unsigned long btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, pgoff_t last_index) -{ - pgoff_t req_size = last_index - offset + 1; - - page_cache_sync_readahead(mapping, ra, file, offset, req_size); - return offset + req_size; -} - struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; @@ -6728,8 +6751,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; - atomic_set(&ei->outstanding_extents, 0); - atomic_set(&ei->reserved_extents, 0); + spin_lock_init(&ei->lock); + ei->outstanding_extents = 0; + ei->reserved_extents = 0; ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; @@ -6767,8 +6791,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); - WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); - WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); + WARN_ON(BTRFS_I(inode)->outstanding_extents); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@ -6823,7 +6847,7 @@ int btrfs_drop_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0 && - !is_free_space_inode(root, inode)) + !btrfs_is_free_space_inode(root, inode)) return 1; else return generic_drop_inode(inode); @@ -7186,7 +7210,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + drop_inode = 1; + goto out_unlock; + } key.objectid = btrfs_ino(inode); key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); @@ -7326,11 +7354,15 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; + umode_t mode = inode->i_mode; - if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) - return -EROFS; - if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) - return -EACCES; + if (mask & MAY_WRITE && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + if (btrfs_root_readonly(root)) + return -EROFS; + if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) + return -EACCES; + } return generic_permission(inode, mask); } @@ -7452,4 +7484,5 @@ static const struct inode_operations btrfs_symlink_inode_operations = { const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, + .d_release = btrfs_dentry_release, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6225433..3351b1b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -859,8 +859,8 @@ again: /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; - page = grab_cache_page(inode->i_mapping, - start_index + i); + page = find_or_create_page(inode->i_mapping, + start_index + i, GFP_NOFS); if (!page) break; @@ -930,7 +930,9 @@ again: GFP_NOFS); if (i_done != num_pages) { - atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, (num_pages - i_done) << PAGE_CACHE_SHIFT); } @@ -1747,11 +1749,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; - } if (ptr < name) goto out; - memcpy(name, ptr, total_len); + memmove(name, ptr, total_len); name[total_len]='\0'; ret = 0; out: @@ -2219,6 +2220,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, !IS_ALIGNED(destoff, bs)) goto out_unlock; + if (destoff > inode->i_size) { + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + goto out_unlock; + } + /* do any pending delalloc/csum calc on src, one way or another, and lock file content */ while (1) { @@ -2235,6 +2242,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_wait_ordered_range(src, off, len); } + /* truncate page cache pages from target inode range */ + truncate_inode_pages_range(&inode->i_data, off, + ALIGN(off + len, PAGE_CACHE_SIZE) - 1); + /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -2320,14 +2331,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + /* + * a | --- range to clone ---| b + * | ------------- extent ------------- | + */ + + /* substract range b */ + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + /* substract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datal > off + len) - datal = off + len - key.offset; - ret = btrfs_drop_extents(trans, inode, new_key.offset, new_key.offset + datal, diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66fa43d..d77b67c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,185 +24,197 @@ #include "extent_io.h" #include "locking.h" -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock(&eb->lock); -} +void btrfs_assert_tree_read_locked(struct extent_buffer *eb); /* - * Setting a lock to blocking will drop the spinlock and set the - * flag that forces other procs who want the lock to wait. After - * this you can safely schedule with the lock held. + * if we currently have a spinning reader or writer lock + * (indicated by the rw flag) this will bump the count + * of blocking holders and drop the spinlock. */ -void btrfs_set_lock_blocking(struct extent_buffer *eb) +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - spin_unlock(&eb->lock); + if (rw == BTRFS_WRITE_LOCK) { + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); + } + } else if (rw == BTRFS_READ_LOCK) { + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); } - /* exit with the spin lock released and the bit set */ + return; } /* - * clearing the blocking flag will take the spinlock again. - * After this you can't safely schedule + * if we currently have a blocking lock, take the spinlock + * and drop our blocking count */ -void btrfs_clear_lock_blocking(struct extent_buffer *eb) +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { - spin_nested(eb); - clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); - smp_mb__after_clear_bit(); + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + if (atomic_dec_and_test(&eb->blocking_writers)) + wake_up(&eb->write_lock_wq); + } else if (rw == BTRFS_READ_LOCK_BLOCKING) { + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + if (atomic_dec_and_test(&eb->blocking_readers)) + wake_up(&eb->read_lock_wq); } - /* exit with the spin lock held */ + return; } /* - * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for very long, and often they don't block - * at all. For a dbench 50 run, if we don't spin on the blocking bit - * at all, the context switch rate can jump up to 400,000/sec or more. - * - * So, we're still stuck with this crummy spin on the blocking bit, - * at least until the most common causes of the short blocks - * can be dealt with. + * take a spinning read lock. This will wait for any blocking + * writers */ -static int btrfs_spin_on_block(struct extent_buffer *eb) +void btrfs_tree_read_lock(struct extent_buffer *eb) { - int i; - - for (i = 0; i < 512; i++) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - if (need_resched()) - break; - cpu_relax(); +again: + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); } /* - * This is somewhat different from trylock. It will take the - * spinlock but if it finds the lock is set to blocking, it will - * return without the lock held. - * - * returns 1 if it was able to take the lock and zero otherwise - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers */ -int btrfs_try_spin_lock(struct extent_buffer *eb) +int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - int i; + if (atomic_read(&eb->blocking_writers)) + return 0; - if (btrfs_spin_on_block(eb)) { - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; } - /* spin for a bit on the BLOCKING flag */ - for (i = 0; i < 2; i++) { - cpu_relax(); - if (!btrfs_spin_on_block(eb)) - break; - - spin_nested(eb); - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 1; - spin_unlock(&eb->lock); - } - return 0; + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; } /* - * the autoremove wake function will return 0 if it tried to wake up - * a process that was already awake, which means that process won't - * count as an exclusive wakeup. The waitq code will continue waking - * procs until it finds one that was actually sleeping. - * - * For btrfs, this isn't quite what we want. We want a single proc - * to be notified that the lock is ready for taking. If that proc - * already happen to be awake, great, it will loop around and try for - * the lock. - * - * So, btrfs_wake_function always returns 1, even when the proc that we - * tried to wake up was already awake. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers or readers */ -static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, - int sync, void *key) +int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - autoremove_wake_function(wait, mode, sync, key); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) + return 0; + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers) || + atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->write_locks); + atomic_inc(&eb->spinning_writers); return 1; } /* - * returns with the extent buffer spinlocked. - * - * This will spin and/or wait as required to take the lock, and then - * return with the spinlock held. - * - * After this call, scheduling is not safe without first calling - * btrfs_set_lock_blocking() + * drop a spinning read lock + */ +void btrfs_tree_read_unlock(struct extent_buffer *eb) +{ + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + atomic_dec(&eb->read_locks); + read_unlock(&eb->lock); +} + +/* + * drop a blocking read lock + */ +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) +{ + btrfs_assert_tree_read_locked(eb); + WARN_ON(atomic_read(&eb->blocking_readers) == 0); + if (atomic_dec_and_test(&eb->blocking_readers)) + wake_up(&eb->read_lock_wq); + atomic_dec(&eb->read_locks); +} + +/* + * take a spinning write lock. This will wait for both + * blocking readers or writers */ int btrfs_tree_lock(struct extent_buffer *eb) { - DEFINE_WAIT(wait); - wait.func = btrfs_wake_function; - - if (!btrfs_spin_on_block(eb)) - goto sleep; - - while(1) { - spin_nested(eb); - - /* nobody is blocking, exit with the spinlock held */ - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - return 0; - - /* - * we have the spinlock, but the real owner is blocking. - * wait for them - */ - spin_unlock(&eb->lock); - - /* - * spin for a bit, and if the blocking flag goes away, - * loop around - */ - cpu_relax(); - if (btrfs_spin_on_block(eb)) - continue; -sleep: - prepare_to_wait_exclusive(&eb->lock_wq, &wait, - TASK_UNINTERRUPTIBLE); - - if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - schedule(); - - finish_wait(&eb->lock_wq, &wait); +again: + wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); + wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + write_lock(&eb->lock); + if (atomic_read(&eb->blocking_readers)) { + write_unlock(&eb->lock); + wait_event(eb->read_lock_wq, + atomic_read(&eb->blocking_readers) == 0); + goto again; } + if (atomic_read(&eb->blocking_writers)) { + write_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); + goto again; + } + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + atomic_inc(&eb->write_locks); return 0; } +/* + * drop a spinning or a blocking write lock. + */ int btrfs_tree_unlock(struct extent_buffer *eb) { - /* - * if we were a blocking owner, we don't have the spinlock held - * just clear the bit and look for waiters - */ - if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - smp_mb__after_clear_bit(); - else - spin_unlock(&eb->lock); - - if (waitqueue_active(&eb->lock_wq)) - wake_up(&eb->lock_wq); + int blockers = atomic_read(&eb->blocking_writers); + + BUG_ON(blockers > 1); + + btrfs_assert_tree_locked(eb); + atomic_dec(&eb->write_locks); + + if (blockers) { + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_dec(&eb->blocking_writers); + smp_wmb(); + wake_up(&eb->write_lock_wq); + } else { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + write_unlock(&eb->lock); + } return 0; } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) - assert_spin_locked(&eb->lock); + BUG_ON(!atomic_read(&eb->write_locks)); +} + +void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +{ + BUG_ON(!atomic_read(&eb->read_locks)); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5c33a56..17247dd 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -19,11 +19,43 @@ #ifndef __BTRFS_LOCKING_ #define __BTRFS_LOCKING_ +#define BTRFS_WRITE_LOCK 1 +#define BTRFS_READ_LOCK 2 +#define BTRFS_WRITE_LOCK_BLOCKING 3 +#define BTRFS_READ_LOCK_BLOCKING 4 + int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); -void btrfs_set_lock_blocking(struct extent_buffer *eb); -void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_unlock(struct extent_buffer *eb); +void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); +void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_read_lock(struct extent_buffer *eb); +int btrfs_try_tree_write_lock(struct extent_buffer *eb); + +static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) +{ + if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + btrfs_tree_unlock(eb); + else if (rw == BTRFS_READ_LOCK_BLOCKING) + btrfs_tree_read_unlock_blocking(eb); + else if (rw == BTRFS_READ_LOCK) + btrfs_tree_read_unlock(eb); + else + BUG(); +} + +static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); +} + +static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); +} #endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c deleted file mode 100644 index 82d569c..0000000 --- a/fs/btrfs/ref-cache.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sort.h> -#include "ctree.h" -#include "ref-cache.h" -#include "transaction.h" - -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_leaf_ref *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } - - entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct btrfs_leaf_ref *entry; - - while (n) { - entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); - WARN_ON(!entry->in_tree); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h deleted file mode 100644 index 24f7001..0000000 --- a/fs/btrfs/ref-cache.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ -#ifndef __REFCACHE__ -#define __REFCACHE__ - -struct btrfs_extent_info { - /* bytenr and num_bytes find the extent in the extent allocation tree */ - u64 bytenr; - u64 num_bytes; - - /* objectid and offset find the back reference for the file */ - u64 objectid; - u64 offset; -}; - -struct btrfs_leaf_ref { - struct rb_node rb_node; - struct btrfs_leaf_ref_tree *tree; - int in_tree; - atomic_t usage; - - u64 root_gen; - u64 bytenr; - u64 owner; - u64 generation; - int nritems; - - struct list_head list; - struct btrfs_extent_info extents[]; -}; - -static inline size_t btrfs_leaf_ref_size(int nr_extents) -{ - return sizeof(struct btrfs_leaf_ref) + - sizeof(struct btrfs_extent_info) * nr_extents; -} -#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5e0a3dc..59bb176 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode, page_cache_sync_readahead(inode->i_mapping, ra, NULL, index, last_index + 1 - index); - page = grab_cache_page(inode->i_mapping, index); + page = find_or_create_page(inode->i_mapping, index, + GFP_NOFS); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ebe4544..f409990 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,13 +71,12 @@ out: return ret; } -int btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node) +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) { btrfs_set_root_bytenr(item, node->start); btrfs_set_root_level(item, btrfs_header_level(node)); btrfs_set_root_generation(item, btrfs_header_generation(node)); - return 0; } /* diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c0f7eca..bc1f6ad 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - return le##bits##_to_cpu(p->member); \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - u##bits res; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits leres; \ - read_eb_member(eb, s, type, member, &leres); \ - return le##bits##_to_cpu(leres); \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - res = le##bits##_to_cpu(p->member); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - return res; \ - } \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_private_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + return res; \ } \ void btrfs_set_##name(struct extent_buffer *eb, \ type *s, u##bits val) \ @@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \ unsigned long part_offset = (unsigned long)s; \ unsigned long offset = part_offset + offsetof(type, member); \ type *p; \ - /* ugly, but we want the fast path here */ \ - if (eb->map_token && offset >= eb->map_start && \ - offset + sizeof(((type *)0)->member) <= eb->map_start + \ - eb->map_len) { \ - p = (type *)(eb->kaddr + part_offset - eb->map_start); \ - p->member = cpu_to_le##bits(val); \ - return; \ - } \ - { \ - int err; \ - char *map_token; \ - char *kaddr; \ - int unmap_on_exit = (eb->map_token == NULL); \ - unsigned long map_start; \ - unsigned long map_len; \ - err = map_extent_buffer(eb, offset, \ - sizeof(((type *)0)->member), \ - &map_token, &kaddr, \ - &map_start, &map_len, KM_USER1); \ - if (err) { \ - __le##bits val2; \ - val2 = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val2); \ - return; \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - p->member = cpu_to_le##bits(val); \ - if (unmap_on_exit) \ - unmap_extent_buffer(eb, map_token, KM_USER1); \ - } \ + int err; \ + char *kaddr; \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_private_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &kaddr, &map_start, &map_len); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ } #include "ctree.h" @@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); - if (eb->map_token && ptr >= eb->map_start && - ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { - memcpy(disk_key, eb->kaddr + ptr - eb->map_start, - sizeof(*disk_key)); - return; - } else if (eb->map_token) { - unmap_extent_buffer(eb, eb->map_token, KM_USER1); - eb->map_token = NULL; - } read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 51dcec8..e24b796 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -216,17 +216,11 @@ static void wait_current_trans(struct btrfs_root *root) spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { - DEFINE_WAIT(wait); atomic_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); - while (1) { - prepare_to_wait(&root->fs_info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (!cur_trans->blocked) - break; - schedule(); - } - finish_wait(&root->fs_info->transaction_wait, &wait); + + wait_event(root->fs_info->transaction_wait, + !cur_trans->blocked); put_transaction(cur_trans); } else { spin_unlock(&root->fs_info->trans_lock); @@ -260,7 +254,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; + u64 num_bytes = 0; int ret; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) @@ -274,6 +268,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->block_rsv = NULL; goto got_it; } + + /* + * Do the reservation before we join the transaction so we can do all + * the appropriate flushing if need be. + */ + if (num_items > 0 && root != root->fs_info->chunk_root) { + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(NULL, root, + &root->fs_info->trans_block_rsv, + num_bytes); + if (ret) + return ERR_PTR(ret); + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -310,24 +317,9 @@ again: goto again; } - if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items); - if (ret == -EAGAIN && !retries) { - retries++; - btrfs_commit_transaction(h, root); - goto again; - } else if (ret == -EAGAIN) { - /* - * We have already retried and got EAGAIN, so really we - * don't have space, so set ret to -ENOSPC. - */ - ret = -ENOSPC; - } - - if (ret < 0) { - btrfs_end_transaction(h, root); - return ERR_PTR(ret); - } + if (num_bytes) { + h->block_rsv = &root->fs_info->trans_block_rsv; + h->bytes_reserved = num_bytes; } got_it: @@ -359,19 +351,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root } /* wait for a transaction commit to be fully complete */ -static noinline int wait_for_commit(struct btrfs_root *root, +static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { - DEFINE_WAIT(wait); - while (!commit->commit_done) { - prepare_to_wait(&commit->commit_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (commit->commit_done) - break; - schedule(); - } - finish_wait(&commit->commit_wait, &wait); - return 0; + wait_event(commit->commit_wait, commit->commit_done); } int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) @@ -499,10 +482,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && cur_trans->blocked && !cur_trans->in_commit) { - if (throttle) + if (throttle) { + /* + * We may race with somebody else here so end up having + * to call end_transaction on ourselves again, so inc + * our use_count. + */ + trans->use_count++; return btrfs_commit_transaction(trans, root); - else + } else { wake_up_process(info->transaction_kthread); + } } WARN_ON(cur_trans != info->running_transaction); @@ -894,6 +884,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; + struct btrfs_block_rsv *rsv; struct inode *parent_inode; struct dentry *parent; struct dentry *dentry; @@ -905,6 +896,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 objectid; u64 root_flags; + rsv = trans->block_rsv; + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { pending->error = -ENOMEM; @@ -1012,6 +1005,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); + trans->block_rsv = rsv; btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return 0; } @@ -1080,22 +1074,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) static void wait_current_trans_commit_start(struct btrfs_root *root, struct btrfs_transaction *trans) { - DEFINE_WAIT(wait); - - if (trans->in_commit) - return; - - while (1) { - prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (trans->in_commit) { - finish_wait(&root->fs_info->transaction_blocked_wait, - &wait); - break; - } - schedule(); - finish_wait(&root->fs_info->transaction_blocked_wait, &wait); - } + wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); } /* @@ -1105,24 +1084,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root, static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_transaction *trans) { - DEFINE_WAIT(wait); - - if (trans->commit_done || (trans->in_commit && !trans->blocked)) - return; - - while (1) { - prepare_to_wait(&root->fs_info->transaction_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (trans->commit_done || - (trans->in_commit && !trans->blocked)) { - finish_wait(&root->fs_info->transaction_wait, - &wait); - break; - } - schedule(); - finish_wait(&root->fs_info->transaction_wait, - &wait); - } + wait_event(root->fs_info->transaction_wait, + trans->commit_done || (trans->in_commit && !trans->blocked)); } /* @@ -1229,8 +1192,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, atomic_inc(&cur_trans->use_count); btrfs_end_transaction(trans, root); - ret = wait_for_commit(root, cur_trans); - BUG_ON(ret); + wait_for_commit(root, cur_trans); put_transaction(cur_trans); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f..786639f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir; - int ret; struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *dir; struct inode *inode; - char *name; - int namelen; unsigned long ref_ptr; unsigned long ref_end; + char *name; + int namelen; + int ret; int search_done = 0; /* @@ -909,6 +910,25 @@ again: } btrfs_release_path(path); + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + insert: /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, @@ -1617,7 +1637,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { @@ -1723,15 +1744,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return -ENOMEM; if (*level == 1) { - wc->process_func(root, next, wc, ptr_gen); + ret = wc->process_func(root, next, wc, ptr_gen); + if (ret) + return ret; path->slots[*level]++; if (wc->free) { btrfs_read_buffer(next, ptr_gen); btrfs_tree_lock(next); - clean_tree_block(trans, root, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1788,16 +1811,19 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level + 1]; root_owner = btrfs_header_owner(parent); - wc->process_func(root, path->nodes[*level], wc, + ret = wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level])); + if (ret) + return ret; + if (wc->free) { struct extent_buffer *next; next = path->nodes[*level]; btrfs_tree_lock(next); - clean_tree_block(trans, root, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, root, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1864,8 +1890,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[orig_level]; btrfs_tree_lock(next); - clean_tree_block(trans, log, next); btrfs_set_lock_blocking(next); + clean_tree_block(trans, log, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19450bc..f2a4cc7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; + int sync_pending = 0; struct blk_plug plug; /* @@ -229,6 +230,22 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); + /* + * if we're doing the sync list, record that our + * plug has some sync requests on it + * + * If we're doing the regular list and there are + * sync requests sitting around, unplug before + * we add more + */ + if (pending_bios == &device->pending_sync_bios) { + sync_pending = 1; + } else if (sync_pending) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } + submit_bio(cur->bi_rw, cur); num_run++; batch_run++; @@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->rw_devices--; } + if (device->can_discard) + fs_devices->num_can_discard--; + new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); @@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; + new_device->can_discard = 0; list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { + struct request_queue *q; struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; @@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, seeding = 0; } + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; @@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, max_hole_start = search_start; max_hole_size = 0; + hole_size = 0; if (search_start >= search_end) { ret = -ENOSPC; @@ -917,7 +946,14 @@ next: cond_resched(); } - hole_size = search_end- search_start; + /* + * At this point, search_start should be the end of + * allocated dev extents, and when shrinking the device, + * search_end may be smaller than search_start. + */ + if (search_end > search_start) + hole_size = search_end - search_start; + if (hole_size > max_hole_size) { max_hole_start = search_start; max_hole_size = hole_size; @@ -1037,7 +1073,8 @@ static noinline int find_next_chunk(struct btrfs_root *root, struct btrfs_key found_key; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; key.objectid = objectid; key.offset = (u64)-1; @@ -1542,6 +1579,7 @@ error: int btrfs_init_new_device(struct btrfs_root *root, char *device_path) { + struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -1611,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) lock_chunks(root); + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; device->writeable = 1; device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); @@ -1646,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_devices++; root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; if (!blk_queue_nonrot(bdev_get_queue(bdev))) @@ -2061,8 +2104,10 @@ int btrfs_balance(struct btrfs_root *dev_root) /* step two, relocate all the chunks */ path = btrfs_alloc_path(); - BUG_ON(!path); - + if (!path) { + ret = -ENOMEM; + goto error; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -2410,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, total_avail = device->total_bytes - device->bytes_used; else total_avail = 0; - /* avail is off by max(alloc_start, 1MB), but that is the same - * for all devices, so it doesn't hurt the sorting later on - */ + + /* If there is no space on this device, skip it. */ + if (total_avail == 0) + continue; ret = find_free_dev_extent(trans, device, max_stripe_size * dev_stripes, @@ -2661,7 +2707,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, ret = find_next_chunk(fs_info->chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - BUG_ON(ret); + if (ret) + return ret; alloc_profile = BTRFS_BLOCK_GROUP_METADATA | (fs_info->metadata_alloc_profile & @@ -3595,7 +3642,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(sb, 0); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61..6d866db 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -48,6 +48,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; int missing; + int can_discard; spinlock_t io_lock; @@ -104,6 +105,7 @@ struct btrfs_fs_devices { u64 rw_devices; u64 missing_devices; u64 total_rw_bytes; + u64 num_can_discard; struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5366fe4..69565e5 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -102,48 +102,71 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - /* first lets see if we already have this xattr */ - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - strlen(name), -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - /* ok we already have this xattr, lets remove it */ - if (di) { - /* if we want create only exit */ - if (flags & XATTR_CREATE) { - ret = -EEXIST; + if (flags & XATTR_REPLACE) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, + name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + ret = -ENODATA; goto out; } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - BUG_ON(ret); + if (ret) + goto out; btrfs_release_path(path); - /* if we don't have a value then we are removing the xattr */ + /* + * remove the attribute + */ if (!value) goto out; - } else { + } + +again: + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EEXIST) { + if (flags & XATTR_CREATE) + goto out; + /* + * We can't use the path we already have since we won't have the + * proper locking for a delete, so release the path and + * re-lookup to delete the thing. + */ btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (!di) { + /* Shouldn't happen but just in case... */ + btrfs_release_path(path); + goto again; + } - if (flags & XATTR_REPLACE) { - /* we couldn't find the attr to replace */ - ret = -ENODATA; + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) goto out; + + /* + * We have a value to set, so go back and try to insert it now. + */ + if (value) { + btrfs_release_path(path); + goto again; } } - - /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - BUG_ON(ret); out: btrfs_free_path(path); return ret; } +/* + * @value: "" makes the attribute to empty, NULL removes it + */ int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 0dba691..fb962ef 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -102,7 +102,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry->d_parent->d_inode), + ceph_ino(req->r_old_dentry_dir), req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 1065ac7..382abc9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,14 +40,6 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -58,16 +50,42 @@ int ceph_init_dentry(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; } + + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + di->dentry = dentry; di->lease_session = NULL; - dentry->d_fsdata = di; dentry->d_time = jiffies; + /* avoid reordering d_fsdata setup so that the check above is safe */ + smp_mb(); + dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); out_unlock: spin_unlock(&dentry->d_lock); return 0; } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent) { + inode = dentry->d_parent->d_inode; + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} /* @@ -133,7 +151,7 @@ more: d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); @@ -234,7 +252,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); - if (fi->at_end) + if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ @@ -403,7 +421,7 @@ more: dout("readdir next frag is %x\n", frag); goto more; } - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries @@ -435,7 +453,7 @@ static void reset_readdir(struct ceph_file_info *fi) dput(fi->dentry); fi->dentry = NULL; } - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) @@ -463,7 +481,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } retval = offset; @@ -488,21 +506,13 @@ out: } /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. - * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory. */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, - struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; + struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && @@ -516,7 +526,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_add(dentry, inode); err = 0; } + return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ if (err == -ENOENT) { /* no trace? */ err = 0; @@ -610,6 +636,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); @@ -789,6 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -887,6 +915,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -1002,36 +1031,38 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) */ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { + int valid = 0; struct inode *dir; if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; - dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, ceph_dentry(dentry)->offset); + dir = ceph_get_dentry_parent_inode(dentry); + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - goto out_touch; + valid = 1; + } else if (dentry->d_inode && + ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + valid = 1; + } else if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) { + valid = 1; } - if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) - goto out_touch; - - if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) - goto out_touch; - dout("d_revalidate %p invalid\n", dentry); - d_drop(dentry); - return 0; -out_touch: - ceph_dentry_lru_touch(dentry); - return 1; + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (valid) + ceph_dentry_lru_touch(dentry); + else + d_drop(dentry); + iput(dir); + return valid; } /* @@ -1228,9 +1259,8 @@ void ceph_dentry_lru_del(struct dentry *dn) * Return name hash for a given dentry. This is dependent on * the parent directory's hash function. */ -unsigned ceph_dentry_hash(struct dentry *dn) +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { - struct inode *dir = dn->d_parent->d_inode; struct ceph_inode_info *dci = ceph_inode(dir); switch (dci->i_dir_layout.dl_dir_hash) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f67b687..9fbcdeca 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -46,7 +46,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent = dentry->d_parent; + struct dentry *parent; struct inode *inode = dentry->d_inode; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; @@ -55,26 +55,33 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; + spin_lock(&dentry->d_lock); + parent = dget(dentry->d_parent); + spin_unlock(&dentry->d_lock); + if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent); + cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, + dentry); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { if (connectable) { *max_len = connected_handle_length; - return 255; + type = 255; + } else { + dout("encode_fh %p\n", dentry); + fh->ino = ceph_ino(dentry->d_inode); + *max_len = handle_length; + type = 1; } - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); - *max_len = handle_length; - type = 1; } else { *max_len = handle_length; - return 255; + type = 255; } + dput(parent); return type; } @@ -123,7 +130,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return dentry; } err = ceph_init_dentry(dentry); - if (err < 0) { iput(inode); return ERR_PTR(err); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0d0eae0..ce549d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -194,7 +194,10 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; + if (flags & (O_CREAT|O_TRUNC)) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); @@ -222,9 +225,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct file *file; struct ceph_mds_request *req; + struct dentry *ret; int err; int flags = nd->intent.open.flags; @@ -242,16 +245,24 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out; + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + if (err) + goto out; + file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); + if (IS_ERR(file)) + err = PTR_ERR(file); +out: + ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("ceph_lookup_open result=%p\n", ret); + return ret; } int ceph_release(struct inode *inode, struct file *file) @@ -643,7 +654,8 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); else @@ -712,7 +724,7 @@ retry_snap: want = CEPH_CAP_FILE_BUFFER; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) - goto out; + goto out_put; dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -720,12 +732,23 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * buffered write; drop Fw early to avoid slow + * revocation if we get stuck on balance_dirty_pages + */ + int dirty; + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + ceph_put_cap_refs(ci, got); + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { @@ -733,7 +756,12 @@ retry_snap: if (err < 0) ret = err; } + + if (dirty) + __mark_inode_dirty(inode, dirty); + goto out; } + if (ret >= 0) { int dirty; spin_lock(&inode->i_lock); @@ -743,12 +771,13 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out: +out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); +out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index dfb2831..095799b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -560,7 +560,8 @@ static int fill_inode(struct inode *inode, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int i; - int issued, implemented; + int issued = 0, implemented; + int updating_inode = 0; struct timespec mtime, atime, ctime; u32 nsplits; struct ceph_buffer *xattr_blob = NULL; @@ -599,7 +600,8 @@ static int fill_inode(struct inode *inode, if (le64_to_cpu(info->version) > 0 && (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; - + + updating_inode = 1; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); @@ -707,17 +709,6 @@ static int fill_inode(struct inode *inode, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ceph_decode_timespec(&ci->i_rctime, &info->rctime); - - /* set dir completion flag? */ - if (ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - dout(" marking %p complete (empty)\n", inode); - /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ - ci->i_max_offset = 2; - } break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", @@ -774,6 +765,19 @@ no_change: __ceph_get_fmode(ci, cap_fmode); } + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + updating_inode && /* didn't jump to no_change */ + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + dout(" marking %p complete (empty)\n", inode); + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ + ci->i_max_offset = 2; + } + /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); @@ -805,14 +809,14 @@ static void update_dentry_lease(struct dentry *dentry, return; spin_lock(&dentry->d_lock); - dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", - dentry, le16_to_cpu(lease->mask), duration, ttl); + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = dentry->d_parent->d_inode; di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; - if (lease->mask == 0) + if (duration == 0) goto out_unlock; if (di->lease_gen == session->s_cap_gen && @@ -839,11 +843,13 @@ out_unlock: /* * Set dentry's directory position based on the current dir's max, and * order it in d_subdirs, so that dcache_readdir behaves. + * + * Always called under directory's i_mutex. */ static void ceph_set_dentry_offset(struct dentry *dn) { struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; + struct inode *inode = dir->d_inode; struct ceph_dentry_info *di; BUG_ON(!inode); @@ -1022,9 +1028,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* do we have a dn lease? */ have_lease = have_dir_cap || - (le16_to_cpu(rinfo->dlease->mask) & - CEPH_LOCK_DN); - + le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) dout("fill_trace no dentry lease or dir cap\n"); @@ -1560,7 +1564,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1743,7 +1747,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index ef0b5f4..3b256b5 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32(l.preferred_osd); + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } @@ -231,6 +233,14 @@ static long ceph_ioctl_lazyio(struct file *file) return 0; } +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -249,6 +259,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_LAZYIO: return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); } return -ENOTTY; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 52e8fd7..0c5167e 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -40,5 +40,6 @@ struct ceph_ioctl_dataloc { struct ceph_ioctl_dataloc) #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) #endif diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0c1d917..86c59e1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -483,22 +483,26 @@ void ceph_mdsc_release_request(struct kref *kref) destroy_reply_info(&req->r_reply_info); } if (req->r_inode) { - ceph_put_cap_refs(ceph_inode(req->r_inode), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_target_inode) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) { - ceph_put_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); dput(req->r_old_dentry); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); @@ -617,6 +621,12 @@ static void __unregister_request(struct ceph_mds_client *mdsc, */ struct dentry *get_nonsnap_parent(struct dentry *dentry) { + /* + * we don't need to worry about protecting the d_parent access + * here because we never renaming inside the snapped namespace + * except to resplice to another snapdir, and either the old or new + * result is a valid result. + */ while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; @@ -652,7 +662,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - struct inode *dir = req->r_dentry->d_parent->d_inode; + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent = req->r_dentry->d_parent; + struct inode *dir = parent->d_inode; if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ @@ -660,8 +672,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ - struct dentry *dn = - get_nonsnap_parent(req->r_dentry->d_parent); + struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); } else if (req->r_dentry->d_inode) { @@ -670,7 +681,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = ceph_dentry_hash(req->r_dentry); + hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; } } @@ -1584,7 +1595,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); - } else if (rpath) { + } else if (rpath || rino) { *ino = rino; *ppath = rpath; *pathlen = strlen(rpath); @@ -1931,9 +1942,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_old_dentry) - ceph_get_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); /* issue */ mutex_lock(&mdsc->mutex); @@ -2714,7 +2724,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; - int mask; struct qstr dname; int release = 0; @@ -2725,7 +2734,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - mask = le16_to_cpu(h->mask); seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); @@ -2737,8 +2745,8 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode, + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); @@ -2828,7 +2836,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2850,7 +2857,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, * Pass @inode always, @dentry is optional. */ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry, int mask) + struct dentry *dentry) { struct ceph_dentry_info *di; struct ceph_mds_session *session; @@ -2858,7 +2865,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask == 0); /* is dentry lease valid? */ spin_lock(&dentry->d_lock); @@ -2868,8 +2874,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, di->lease_gen != di->lease_session->s_cap_gen || !time_before(jiffies, dentry->d_time)) { dout("lease_release inode %p dentry %p -- " - "no lease on %d\n", - inode, dentry, mask); + "no lease\n", + inode, dentry); spin_unlock(&dentry->d_lock); return; } @@ -2880,8 +2886,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); - dout("lease_release inode %p dentry %p mask %d to mds%d\n", - inode, dentry, mask, session->s_mds); + dout("lease_release inode %p dentry %p to mds%d\n", + inode, dentry, session->s_mds); ceph_mdsc_lease_send_msg(session, inode, dentry, CEPH_MDS_LEASE_RELEASE, seq); ceph_put_mds_session(session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7d8a0d6..4bb2399 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -171,6 +171,7 @@ struct ceph_mds_request { struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; @@ -333,7 +334,7 @@ extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dn, int mask); + struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 54b14de..e264371 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -449,6 +449,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -456,13 +465,19 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || - (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { + } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; - dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, - capsnap, snapc); + /* + * if we are a sync write, we may need to go to the snaprealm + * to get the current snapc. + */ + if (!snapc) + snapc = ci->i_snap_realm->cached_context; + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", + inode, capsnap, snapc, ceph_cap_string(dirty)); ihold(inode); atomic_set(&capsnap->nref, 1); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f2f77fd..88bacaf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -73,8 +73,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> - (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_files = le64_to_cpu(st.num_objects); @@ -780,6 +779,10 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; + else + fsc->backing_dev_info.ra_pages = + default_backing_dev_info.ra_pages; + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", atomic_long_inc_return(&bdi_seq)); if (!err) @@ -810,8 +813,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, fsc = create_fs_client(fsopt, opt); if (IS_ERR(fsc)) { res = ERR_CAST(fsc); - kfree(fsopt); - kfree(opt); + destroy_mount_options(fsopt); + ceph_destroy_options(opt); goto out_final; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 30446b1..a23eed5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -543,13 +543,16 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, /* * we keep buffered readdir results attached to file->private_data */ +#define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 + struct ceph_file_info { - int fmode; /* initialized on open */ + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; - int at_end; /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ @@ -789,6 +792,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); @@ -796,7 +801,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); -extern unsigned ceph_dentry_hash(struct dentry *dn); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, @@ -819,14 +825,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, int p_locks, int f_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ - if (dentry && dentry->d_parent) - return dentry->d_parent->d_inode; - - return NULL; -} - /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f42d730..96c6739 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 2fe3cf1..6d40656 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_STATS2 seq_printf(m, " In Send: %d In MaxReq Wait: %d", - atomic_read(&server->inSend), + atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #endif diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8d8f28c..6873bb6 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -141,10 +141,11 @@ char *cifs_compose_mount_options(const char *sb_mountdata, rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc < 0) { - cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", - __func__, *devname, rc); + cFYI(1, "%s: Failed to resolve server part of %s to IP: %d", + __func__, *devname, rc); goto compose_mount_options_err; } + /* md_len = strlen(...) + 12 for 'sep+prefixpath=' * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 21de1d6..d0f59fa 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -991,24 +991,6 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, - struct cifs_ntsd *pnntsd, u32 acllen) -{ - int xid, rc; - struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - - xid = GetXid(); - rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen); - FreeXid(xid); - cifs_put_tlink(tlink); - - cFYI(DBG2, "SetCIFSACL rc = %d", rc); - return rc; -} - static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, struct cifs_ntsd *pnntsd, u32 acllen) { @@ -1047,18 +1029,10 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifsFileInfo *open_file; - int rc; cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode); - open_file = find_readable_file(CIFS_I(inode), true); - if (!open_file) - return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); - - rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); - cifsFileInfo_put(open_file); - return rc; + return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5a0ee7f..e76bfeb 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -52,19 +52,29 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); + if (rc) { + cERROR(1, "%s: Could not update with payload\n", __func__); + return rc; + } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); - return 0; + return rc; } /* must be called with server->srv_mutex held */ @@ -77,9 +87,15 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; - if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || + server->tcpStatus == CifsNeedNegotiate) return rc; + if (!server->session_estab) { + strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); + return rc; + } + cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; @@ -112,12 +128,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, rc = crypto_shash_init(&server->secmech.sdescmd5->shash); if (rc) { - cERROR(1, "%s: Oould not init md5\n", __func__); + cERROR(1, "%s: Could not init md5\n", __func__); return rc; } - crypto_shash_update(&server->secmech.sdescmd5->shash, + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, server->session_key.response, server->session_key.len); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) @@ -131,14 +151,24 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base + 4, iov[i].iov_len - 4); - } else + } else { + rc = crypto_shash_update(&server->secmech.sdescmd5->shash, iov[i].iov_base, iov[i].iov_len); + } + if (rc) { + cERROR(1, "%s: Could not update with payload\n", + __func__); + return rc; + } } rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -154,8 +184,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, if ((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; - if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) + if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || + server->tcpStatus == CifsNeedNegotiate) + return rc; + + if (!server->session_estab) { + strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); return rc; + } cifs_pdu->Signature.Sequence.SequenceNumber = cpu_to_le32(server->sequence_number); @@ -463,8 +499,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NT Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -478,13 +518,18 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (user == NULL) { cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); rc = -ENOMEM; - goto calc_exit_2; + return rc; } len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp); UniStrupr(user); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)user, 2 * len); + kfree(user); + if (rc) { + cERROR(1, "%s: Could not update with user\n", __func__); + return rc; + } /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -494,13 +539,19 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (domain == NULL) { cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)domain, 2 * len); kfree(domain); + if (rc) { + cERROR(1, "%s: Could not update with domain\n", + __func__); + return rc; + } } else if (ses->serverName) { len = strlen(ses->serverName); @@ -508,21 +559,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, if (server == NULL) { cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); rc = -ENOMEM; - goto calc_exit_1; + return rc; } len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, nls_cp); + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, (char *)server, 2 * len); kfree(server); + if (rc) { + cERROR(1, "%s: Could not update with server\n", + __func__); + return rc; + } } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ntlmv2_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); -calc_exit_1: - kfree(user); -calc_exit_2: return rc; } @@ -537,8 +593,12 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) return -1; } - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + return rc; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -552,11 +612,17 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) else memcpy(ses->auth_key.response + offset, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + offset, ses->auth_key.len - offset); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + return rc; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); return rc; } @@ -626,8 +692,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } /* now calculate the session key for NTLMv2 */ - crypto_shash_setkey(ses->server->secmech.hmacmd5, + rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { @@ -635,12 +705,18 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) goto setup_ntlmv2_rsp_ret; } - crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response + CIFS_SESS_KEY_SIZE, CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + cERROR(1, "%s: Could not update with response\n", __func__); + goto setup_ntlmv2_rsp_ret; + } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, ses->auth_key.response); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); setup_ntlmv2_rsp_ret: kfree(tiblob); @@ -668,8 +744,12 @@ calc_seckey(struct cifs_ses *ses) desc.tfm = tfm_arc4; - crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + if (rc) { + cERROR(1, "%s: Could not set response as a key", __func__); + return rc; + } sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); @@ -688,7 +768,7 @@ calc_seckey(struct cifs_ses *ses) crypto_free_blkcipher(tfm_arc4); - return 0; + return rc; } void diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8655174..f93eb94 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -86,24 +86,6 @@ extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; -void -cifs_sb_active(struct super_block *sb) -{ - struct cifs_sb_info *server = CIFS_SB(sb); - - if (atomic_inc_return(&server->active) == 1) - atomic_inc(&sb->s_active); -} - -void -cifs_sb_deactive(struct super_block *sb) -{ - struct cifs_sb_info *server = CIFS_SB(sb); - - if (atomic_dec_and_test(&server->active)) - deactivate_super(sb); -} - static int cifs_read_super(struct super_block *sb) { @@ -581,6 +563,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) mutex_unlock(&dir->i_mutex); dput(dentry); dentry = child; + if (!dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index fbd050c..95da802 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,10 +41,6 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; -/* Functions related to super block operations */ -extern void cifs_sb_active(struct super_block *sb); -extern void cifs_sb_deactive(struct super_block *sb); - /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); @@ -129,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.74" +#define CIFS_VERSION "1.75" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6255fa8..95dad9d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -291,7 +291,7 @@ struct TCP_Server_Info { struct fscache_cookie *fscache; /* client index cache cookie */ #endif #ifdef CONFIG_CIFS_STATS2 - atomic_t inSend; /* requests trying to send */ + atomic_t in_send; /* requests trying to send */ atomic_t num_waiters; /* blocked waiting to get in sendrecv */ #endif }; @@ -501,7 +501,7 @@ struct cifs_search_info { char *ntwrk_buf_start; char *srch_entries_start; char *last_entry; - char *presume_name; + const char *presume_name; unsigned int resume_name_len; bool endOfSearch:1; bool emptyDir:1; @@ -672,12 +672,54 @@ struct mid_q_entry { bool multiEnd:1; /* both received */ }; -struct oplock_q_entry { - struct list_head qhead; - struct inode *pinode; - struct cifs_tcon *tcon; - __u16 netfid; -}; +/* Make code in transport.c a little cleaner by moving + update of optional stats into function below */ +#ifdef CONFIG_CIFS_STATS2 + +static inline void cifs_in_send_inc(struct TCP_Server_Info *server) +{ + atomic_inc(&server->in_send); +} + +static inline void cifs_in_send_dec(struct TCP_Server_Info *server) +{ + atomic_dec(&server->in_send); +} + +static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) +{ + atomic_inc(&server->num_waiters); +} + +static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) +{ + atomic_dec(&server->num_waiters); +} + +static inline void cifs_save_when_sent(struct mid_q_entry *mid) +{ + mid->when_sent = jiffies; +} +#else +static inline void cifs_in_send_inc(struct TCP_Server_Info *server) +{ +} +static inline void cifs_in_send_dec(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) +{ +} + +static inline void cifs_save_when_sent(struct mid_q_entry *mid) +{ +} +#endif /* for pending dnotify requests */ struct dir_notify_req { @@ -942,8 +984,6 @@ GLOBAL_EXTERN spinlock_t siduidlock; GLOBAL_EXTERN spinlock_t sidgidlock; void cifs_oplock_break(struct work_struct *work); -void cifs_oplock_break_get(struct cifsFileInfo *cfile); -void cifs_oplock_break_put(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1a9fe7f..aac37d9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -107,7 +107,7 @@ static void mark_open_files_invalid(struct cifs_tcon *pTcon) static int cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) { - int rc = 0; + int rc; struct cifs_ses *ses; struct TCP_Server_Info *server; struct nls_table *nls_codepage; @@ -5720,6 +5720,7 @@ CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, char *temp_ptr; char *end_of_smb; __u16 params, byte_count, data_offset; + unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; cFYI(1, "In Query All EAs path %s", searchName); QAllEAsRetry: @@ -5837,7 +5838,8 @@ QAllEAsRetry: } if (ea_name) { - if (strncmp(ea_name, temp_ptr, name_len) == 0) { + if (ea_name_len == name_len && + strncmp(ea_name, temp_ptr, name_len) == 0) { temp_ptr += name_len + 1; rc = value_len; if (buf_size == 0) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e66297b..633c246 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -319,25 +319,328 @@ requeue_echo: queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); } +static bool +allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, + bool is_large_buf) +{ + char *bbuf = *bigbuf, *sbuf = *smallbuf; + + if (bbuf == NULL) { + bbuf = (char *)cifs_buf_get(); + if (!bbuf) { + cERROR(1, "No memory for large SMB response"); + msleep(3000); + /* retry will check if exiting */ + return false; + } + } else if (is_large_buf) { + /* we are reusing a dirty large buf, clear its start */ + memset(bbuf, 0, size); + } + + if (sbuf == NULL) { + sbuf = (char *)cifs_small_buf_get(); + if (!sbuf) { + cERROR(1, "No memory for SMB response"); + msleep(1000); + /* retry will check if exiting */ + return false; + } + /* beginning of smb buffer is cleared in our buf_get */ + } else { + /* if existing small buf clear beginning */ + memset(sbuf, 0, size); + } + + *bigbuf = bbuf; + *smallbuf = sbuf; + + return true; +} + +static int +read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, + struct kvec *iov, unsigned int to_read, + unsigned int *ptotal_read, bool is_header_read) +{ + int length, rc = 0; + unsigned int total_read; + char *buf = iov->iov_base; + + for (total_read = 0; total_read < to_read; total_read += length) { + length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, + to_read - total_read, 0); + if (server->tcpStatus == CifsExiting) { + /* then will exit */ + rc = 2; + break; + } else if (server->tcpStatus == CifsNeedReconnect) { + cifs_reconnect(server); + /* Reconnect wakes up rspns q */ + /* Now we will reread sock */ + rc = 1; + break; + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { + /* + * Minimum sleep to prevent looping, allowing socket + * to clear and app threads to set tcpStatus + * CifsNeedReconnect if server hung. + */ + usleep_range(1000, 2000); + length = 0; + if (!is_header_read) + continue; + /* Special handling for header read */ + if (total_read) { + iov->iov_base = (to_read - total_read) + + buf; + iov->iov_len = to_read - total_read; + smb_msg->msg_control = NULL; + smb_msg->msg_controllen = 0; + rc = 3; + } else + rc = 1; + break; + } else if (length <= 0) { + cERROR(1, "Received no data, expecting %d", + to_read - total_read); + cifs_reconnect(server); + rc = 1; + break; + } + } + + *ptotal_read = total_read; + return rc; +} + +static bool +check_rfc1002_header(struct TCP_Server_Info *server, char *buf) +{ + char temp = *buf; + unsigned int pdu_length = be32_to_cpu( + ((struct smb_hdr *)buf)->smb_buf_length); + + /* + * The first byte big endian of the length field, + * is actually not part of the length but the type + * with the most common, zero, as regular data. + */ + if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { + return false; + } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { + cFYI(1, "Good RFC 1002 session rsp"); + return false; + } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { + /* + * We get this from Windows 98 instead of an error on + * SMB negprot response. + */ + cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", + pdu_length); + /* give server a second to clean up */ + msleep(1000); + /* + * Always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame). + */ + cifs_set_port((struct sockaddr *) + &server->dstaddr, CIFS_PORT); + cifs_reconnect(server); + wake_up(&server->response_q); + return false; + } else if (temp != (char) 0) { + cERROR(1, "Unknown RFC 1002 frame"); + cifs_dump_mem(" Received Data: ", buf, 4); + cifs_reconnect(server); + return false; + } + + /* else we have an SMB response */ + if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || + (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { + cERROR(1, "Invalid size SMB length %d pdu_length %d", + 4, pdu_length+4); + cifs_reconnect(server); + wake_up(&server->response_q); + return false; + } + + return true; +} + +static struct mid_q_entry * +find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, + int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) +{ + struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; + + spin_lock(&GlobalMid_Lock); + list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { + if (mid->mid != buf->Mid || + mid->midState != MID_REQUEST_SUBMITTED || + mid->command != buf->Command) + continue; + + if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { + /* We have a multipart transact2 resp */ + *is_multi_rsp = true; + if (mid->resp_buf) { + /* merge response - fix up 1st*/ + *length = coalesce_t2(buf, mid->resp_buf); + if (*length > 0) { + *length = 0; + mid->multiRsp = true; + break; + } + /* All parts received or packet is malformed. */ + mid->multiEnd = true; + goto multi_t2_fnd; + } + if (!is_large_buf) { + /*FIXME: switch to already allocated largebuf?*/ + cERROR(1, "1st trans2 resp needs bigbuf"); + } else { + /* Have first buffer */ + mid->resp_buf = buf; + mid->largeBuf = true; + *bigbuf = NULL; + } + break; + } + mid->resp_buf = buf; + mid->largeBuf = is_large_buf; +multi_t2_fnd: + if (*length == 0) + mid->midState = MID_RESPONSE_RECEIVED; + else + mid->midState = MID_RESPONSE_MALFORMED; +#ifdef CONFIG_CIFS_STATS2 + mid->when_received = jiffies; +#endif + list_del_init(&mid->qhead); + ret = mid; + break; + } + spin_unlock(&GlobalMid_Lock); + + return ret; +} + +static void clean_demultiplex_info(struct TCP_Server_Info *server) +{ + int length; + + /* take it off the list, if it's not already */ + spin_lock(&cifs_tcp_ses_lock); + list_del_init(&server->tcp_ses_list); + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&GlobalMid_Lock); + server->tcpStatus = CifsExiting; + spin_unlock(&GlobalMid_Lock); + wake_up_all(&server->response_q); + + /* + * Check if we have blocked requests that need to free. Note that + * cifs_max_pending is normally 50, but can be set at module install + * time to as little as two. + */ + spin_lock(&GlobalMid_Lock); + if (atomic_read(&server->inFlight) >= cifs_max_pending) + atomic_set(&server->inFlight, cifs_max_pending - 1); + /* + * We do not want to set the max_pending too low or we could end up + * with the counter going negative. + */ + spin_unlock(&GlobalMid_Lock); + /* + * Although there should not be any requests blocked on this queue it + * can not hurt to be paranoid and try to wake up requests that may + * haven been blocked when more than 50 at time were on the wire to the + * same server - they now will see the session is in exit state and get + * out of SendReceive. + */ + wake_up_all(&server->request_q); + /* give those requests time to exit */ + msleep(125); + + if (server->ssocket) { + sock_release(server->ssocket); + server->ssocket = NULL; + } + + if (!list_empty(&server->pending_mid_q)) { + struct list_head dispose_list; + struct mid_q_entry *mid_entry; + struct list_head *tmp, *tmp2; + + INIT_LIST_HEAD(&dispose_list); + spin_lock(&GlobalMid_Lock); + list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Clearing mid 0x%x", mid_entry->mid); + mid_entry->midState = MID_SHUTDOWN; + list_move(&mid_entry->qhead, &dispose_list); + } + spin_unlock(&GlobalMid_Lock); + + /* now walk dispose list and issue callbacks */ + list_for_each_safe(tmp, tmp2, &dispose_list) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + cFYI(1, "Callback mid 0x%x", mid_entry->mid); + list_del_init(&mid_entry->qhead); + mid_entry->callback(mid_entry); + } + /* 1/8th of sec is more than enough time for them to exit */ + msleep(125); + } + + if (!list_empty(&server->pending_mid_q)) { + /* + * mpx threads have not exited yet give them at least the smb + * send timeout time for long ops. + * + * Due to delays on oplock break requests, we need to wait at + * least 45 seconds before giving up on a request getting a + * response and going ahead and killing cifsd. + */ + cFYI(1, "Wait for exit from demultiplex thread"); + msleep(46000); + /* + * If threads still have not exited they are probably never + * coming home not much else we can do but free the memory. + */ + } + + kfree(server->hostname); + kfree(server); + + length = atomic_dec_return(&tcpSesAllocCount); + if (length > 0) + mempool_resize(cifs_req_poolp, length + cifs_min_rcv, + GFP_KERNEL); +} + static int cifs_demultiplex_thread(void *p) { int length; struct TCP_Server_Info *server = p; unsigned int pdu_length, total_read; + char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; struct smb_hdr *smb_buffer = NULL; - struct smb_hdr *bigbuf = NULL; - struct smb_hdr *smallbuf = NULL; struct msghdr smb_msg; struct kvec iov; - struct socket *csocket = server->ssocket; - struct list_head *tmp, *tmp2; struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; - char temp; bool isLargeBuf = false; - bool isMultiRsp; - int reconnect; + bool isMultiRsp = false; + int rc; current->flags |= PF_MEMALLOC; cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); @@ -351,35 +654,16 @@ cifs_demultiplex_thread(void *p) while (server->tcpStatus != CifsExiting) { if (try_to_freeze()) continue; - if (bigbuf == NULL) { - bigbuf = cifs_buf_get(); - if (!bigbuf) { - cERROR(1, "No memory for large SMB response"); - msleep(3000); - /* retry will check if exiting */ - continue; - } - } else if (isLargeBuf) { - /* we are reusing a dirty large buf, clear its start */ - memset(bigbuf, 0, sizeof(struct smb_hdr)); - } - if (smallbuf == NULL) { - smallbuf = cifs_small_buf_get(); - if (!smallbuf) { - cERROR(1, "No memory for SMB response"); - msleep(1000); - /* retry will check if exiting */ - continue; - } - /* beginning of smb buffer is cleared in our buf_get */ - } else /* if existing small buf clear beginning */ - memset(smallbuf, 0, sizeof(struct smb_hdr)); + if (!allocate_buffers(&bigbuf, &smallbuf, + sizeof(struct smb_hdr), isLargeBuf)) + continue; isLargeBuf = false; isMultiRsp = false; - smb_buffer = smallbuf; - iov.iov_base = smb_buffer; + smb_buffer = (struct smb_hdr *)smallbuf; + buf = smallbuf; + iov.iov_base = buf; iov.iov_len = 4; smb_msg.msg_control = NULL; smb_msg.msg_controllen = 0; @@ -393,158 +677,50 @@ incomplete_rcv: "Reconnecting...", server->hostname, (echo_retries * SMB_ECHO_INTERVAL / HZ)); cifs_reconnect(server); - csocket = server->ssocket; wake_up(&server->response_q); continue; } - length = - kernel_recvmsg(csocket, &smb_msg, - &iov, 1, pdu_length, 0 /* BB other flags? */); - - if (server->tcpStatus == CifsExiting) { + rc = read_from_socket(server, &smb_msg, &iov, pdu_length, + &total_read, true /* header read */); + if (rc == 3) + goto incomplete_rcv; + else if (rc == 2) break; - } else if (server->tcpStatus == CifsNeedReconnect) { - cFYI(1, "Reconnect after server stopped responding"); - cifs_reconnect(server); - cFYI(1, "call to reconnect done"); - csocket = server->ssocket; - continue; - } else if (length == -ERESTARTSYS || - length == -EAGAIN || - length == -EINTR) { - msleep(1); /* minimum sleep to prevent looping - allowing socket to clear and app threads to set - tcpStatus CifsNeedReconnect if server hung */ - if (pdu_length < 4) { - iov.iov_base = (4 - pdu_length) + - (char *)smb_buffer; - iov.iov_len = pdu_length; - smb_msg.msg_control = NULL; - smb_msg.msg_controllen = 0; - goto incomplete_rcv; - } else - continue; - } else if (length <= 0) { - cFYI(1, "Reconnect after unexpected peek error %d", - length); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); + else if (rc == 1) continue; - } else if (length < pdu_length) { - cFYI(1, "requested %d bytes but only got %d bytes", - pdu_length, length); - pdu_length -= length; - msleep(1); - goto incomplete_rcv; - } - - /* The right amount was read from socket - 4 bytes */ - /* so we can now interpret the length field */ - /* the first byte big endian of the length field, - is actually not part of the length but the type - with the most common, zero, as regular data */ - temp = *((char *) smb_buffer); + /* + * The right amount was read from socket - 4 bytes, + * so we can now interpret the length field. + */ - /* Note that FC 1001 length is big endian on the wire, - but we convert it here so it is always manipulated - as host byte order */ + /* + * Note that RFC 1001 length is big endian on the wire, + * but we convert it here so it is always manipulated + * as host byte order. + */ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); cFYI(1, "rfc1002 length 0x%x", pdu_length+4); - - if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { - continue; - } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { - cFYI(1, "Good RFC 1002 session rsp"); - continue; - } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { - /* we get this from Windows 98 instead of - an error on SMB negprot response */ - cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", - pdu_length); - /* give server a second to clean up */ - msleep(1000); - /* always try 445 first on reconnect since we get NACK - * on some if we ever connected to port 139 (the NACK - * is since we do not begin with RFC1001 session - * initialize frame) - */ - cifs_set_port((struct sockaddr *) - &server->dstaddr, CIFS_PORT); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; - } else if (temp != (char) 0) { - cERROR(1, "Unknown RFC 1002 frame"); - cifs_dump_mem(" Received Data: ", (char *)smb_buffer, - length); - cifs_reconnect(server); - csocket = server->ssocket; + if (!check_rfc1002_header(server, buf)) continue; - } - - /* else we have an SMB response */ - if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || - (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { - cERROR(1, "Invalid size SMB length %d pdu_length %d", - length, pdu_length+4); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; - } /* else length ok */ - reconnect = 0; - if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { isLargeBuf = true; memcpy(bigbuf, smallbuf, 4); - smb_buffer = bigbuf; + smb_buffer = (struct smb_hdr *)bigbuf; + buf = bigbuf; } - length = 0; - iov.iov_base = 4 + (char *)smb_buffer; + + iov.iov_base = 4 + buf; iov.iov_len = pdu_length; - for (total_read = 0; total_read < pdu_length; - total_read += length) { - length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, - pdu_length - total_read, 0); - if (server->tcpStatus == CifsExiting) { - /* then will exit */ - reconnect = 2; - break; - } else if (server->tcpStatus == CifsNeedReconnect) { - cifs_reconnect(server); - csocket = server->ssocket; - /* Reconnect wakes up rspns q */ - /* Now we will reread sock */ - reconnect = 1; - break; - } else if (length == -ERESTARTSYS || - length == -EAGAIN || - length == -EINTR) { - msleep(1); /* minimum sleep to prevent looping, - allowing socket to clear and app - threads to set tcpStatus - CifsNeedReconnect if server hung*/ - length = 0; - continue; - } else if (length <= 0) { - cERROR(1, "Received no data, expecting %d", - pdu_length - total_read); - cifs_reconnect(server); - csocket = server->ssocket; - reconnect = 1; - break; - } - } - if (reconnect == 2) + rc = read_from_socket(server, &smb_msg, &iov, pdu_length, + &total_read, false); + if (rc == 2) break; - else if (reconnect == 1) + else if (rc == 1) continue; total_read += 4; /* account for rfc1002 hdr */ @@ -562,75 +738,13 @@ incomplete_rcv: */ length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); if (length != 0) - cifs_dump_mem("Bad SMB: ", smb_buffer, - min_t(unsigned int, total_read, 48)); + cifs_dump_mem("Bad SMB: ", buf, + min_t(unsigned int, total_read, 48)); - mid_entry = NULL; server->lstrp = jiffies; - spin_lock(&GlobalMid_Lock); - list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - - if (mid_entry->mid != smb_buffer->Mid || - mid_entry->midState != MID_REQUEST_SUBMITTED || - mid_entry->command != smb_buffer->Command) { - mid_entry = NULL; - continue; - } - - if (length == 0 && - check2ndT2(smb_buffer, server->maxBuf) > 0) { - /* We have a multipart transact2 resp */ - isMultiRsp = true; - if (mid_entry->resp_buf) { - /* merge response - fix up 1st*/ - length = coalesce_t2(smb_buffer, - mid_entry->resp_buf); - if (length > 0) { - length = 0; - mid_entry->multiRsp = true; - break; - } else { - /* all parts received or - * packet is malformed - */ - mid_entry->multiEnd = true; - goto multi_t2_fnd; - } - } else { - if (!isLargeBuf) { - /* - * FIXME: switch to already - * allocated largebuf? - */ - cERROR(1, "1st trans2 resp " - "needs bigbuf"); - } else { - /* Have first buffer */ - mid_entry->resp_buf = - smb_buffer; - mid_entry->largeBuf = true; - bigbuf = NULL; - } - } - break; - } - mid_entry->resp_buf = smb_buffer; - mid_entry->largeBuf = isLargeBuf; -multi_t2_fnd: - if (length == 0) - mid_entry->midState = MID_RESPONSE_RECEIVED; - else - mid_entry->midState = MID_RESPONSE_MALFORMED; -#ifdef CONFIG_CIFS_STATS2 - mid_entry->when_received = jiffies; -#endif - list_del_init(&mid_entry->qhead); - break; - } - spin_unlock(&GlobalMid_Lock); - + mid_entry = find_cifs_mid(server, smb_buffer, &length, + isLargeBuf, &isMultiRsp, &bigbuf); if (mid_entry != NULL) { mid_entry->callback(mid_entry); /* Was previous buf put in mpx struct for multi-rsp? */ @@ -648,7 +762,7 @@ multi_t2_fnd: !isMultiRsp) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); - cifs_dump_mem("Received Data is: ", (char *)smb_buffer, + cifs_dump_mem("Received Data is: ", buf, sizeof(struct smb_hdr)); #ifdef CONFIG_CIFS_DEBUG2 cifs_dump_detail(smb_buffer); @@ -658,88 +772,13 @@ multi_t2_fnd: } } /* end while !EXITING */ - /* take it off the list, if it's not already */ - spin_lock(&cifs_tcp_ses_lock); - list_del_init(&server->tcp_ses_list); - spin_unlock(&cifs_tcp_ses_lock); - - spin_lock(&GlobalMid_Lock); - server->tcpStatus = CifsExiting; - spin_unlock(&GlobalMid_Lock); - wake_up_all(&server->response_q); - - /* check if we have blocked requests that need to free */ - /* Note that cifs_max_pending is normally 50, but - can be set at module install time to as little as two */ - spin_lock(&GlobalMid_Lock); - if (atomic_read(&server->inFlight) >= cifs_max_pending) - atomic_set(&server->inFlight, cifs_max_pending - 1); - /* We do not want to set the max_pending too low or we - could end up with the counter going negative */ - spin_unlock(&GlobalMid_Lock); - /* Although there should not be any requests blocked on - this queue it can not hurt to be paranoid and try to wake up requests - that may haven been blocked when more than 50 at time were on the wire - to the same server - they now will see the session is in exit state - and get out of SendReceive. */ - wake_up_all(&server->request_q); - /* give those requests time to exit */ - msleep(125); - - if (server->ssocket) { - sock_release(csocket); - server->ssocket = NULL; - } /* buffer usually freed in free_mid - need to free it here on exit */ cifs_buf_release(bigbuf); if (smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(smallbuf); - if (!list_empty(&server->pending_mid_q)) { - struct list_head dispose_list; - - INIT_LIST_HEAD(&dispose_list); - spin_lock(&GlobalMid_Lock); - list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Clearing mid 0x%x", mid_entry->mid); - mid_entry->midState = MID_SHUTDOWN; - list_move(&mid_entry->qhead, &dispose_list); - } - spin_unlock(&GlobalMid_Lock); - - /* now walk dispose list and issue callbacks */ - list_for_each_safe(tmp, tmp2, &dispose_list) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); - cFYI(1, "Callback mid 0x%x", mid_entry->mid); - list_del_init(&mid_entry->qhead); - mid_entry->callback(mid_entry); - } - /* 1/8th of sec is more than enough time for them to exit */ - msleep(125); - } - - if (!list_empty(&server->pending_mid_q)) { - /* mpx threads have not exited yet give them - at least the smb send timeout time for long ops */ - /* due to delays on oplock break requests, we need - to wait at least 45 seconds before giving up - on a request getting a response and going ahead - and killing cifsd */ - cFYI(1, "Wait for exit from demultiplex thread"); - msleep(46000); - /* if threads still have not exited they are probably never - coming home not much else we can do but free the memory */ - } - - kfree(server->hostname); task_to_wake = xchg(&server->tsk, NULL); - kfree(server); - - length = atomic_dec_return(&tcpSesAllocCount); - if (length > 0) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + clean_demultiplex_info(server); /* if server->tsk was NULL then wait for a signal before exiting */ if (!task_to_wake) { @@ -2839,7 +2878,8 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); - kfree(volume_info->UNCip); + if (volume_info->UNCip != volume_info->UNC + 2) + kfree(volume_info->UNCip); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); @@ -3193,15 +3233,9 @@ mount_fail_check: else cifs_put_tcp_session(srvTcp); bdi_destroy(&cifs_sb->bdi); - goto out; } - /* volume_info->password is freed above when existing session found - (in which case it is not needed anymore) but when new sesion is created - the password ptr is put in the new session structure (in which case the - password will be freed at unmount time) */ out: - /* zero out password before freeing */ FreeXid(xid); return rc; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 499f27f..72d448b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -57,11 +57,6 @@ build_path_from_dentry(struct dentry *direntry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); unsigned seq; - if (direntry == NULL) - return NULL; /* not much we can do if dentry is freed and - we need to reopen the file after it was closed implicitly - when the server crashed */ - dirsep = CIFS_DIR_SEP(cifs_sb); if (tcon->Flags & SMB_SHARE_IS_IN_DFS) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); @@ -110,8 +105,8 @@ cifs_bp_rename_retry: } rcu_read_unlock(); if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { - cERROR(1, "did not end path lookup where expected namelen is %d", - namelen); + cFYI(1, "did not end path lookup where expected. namelen=%d " + "dfsplen=%d", namelen, dfsplen); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 548f062..1d2d91d 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -79,8 +79,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Perform the upcall */ rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); if (rc < 0) - cERROR(1, "%s: unable to resolve: %*.*s", - __func__, len, len, hostname); + cFYI(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); else cFYI(1, "%s: resolved: %*.*s to %s", __func__, len, len, hostname, *ip_addr); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 378acdaf..9f41a10 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -314,6 +314,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) } spin_unlock(&cifs_file_list_lock); + cancel_work_sync(&cifs_file->oplock_break); + if (!tcon->need_reconnect && !cifs_file->invalidHandle) { int xid, rc; @@ -2418,31 +2420,6 @@ void cifs_oplock_break(struct work_struct *work) cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } - - /* - * We might have kicked in before is_valid_oplock_break() - * finished grabbing reference for us. Make sure it's done by - * waiting for cifs_file_list_lock. - */ - spin_lock(&cifs_file_list_lock); - spin_unlock(&cifs_file_list_lock); - - cifs_oplock_break_put(cfile); -} - -/* must be called while holding cifs_file_list_lock */ -void cifs_oplock_break_get(struct cifsFileInfo *cfile) -{ - cifs_sb_active(cfile->dentry->d_sb); - cifsFileInfo_get(cfile); -} - -void cifs_oplock_break_put(struct cifsFileInfo *cfile) -{ - struct super_block *sb = cfile->dentry->d_sb; - - cifsFileInfo_put(cfile); - cifs_sb_deactive(sb); } const struct address_space_operations cifs_addr_ops = { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b018c8..a7b2dcd 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, if (full_path == NULL) return full_path; - if (dfsplen) { + if (dfsplen) strncpy(full_path, tcon->treeName, dfsplen); - /* switch slash direction in prepath depending on whether - * windows or posix style path names - */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { - int i; - for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; - } - } - } strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); full_path[dfsplen + pplen] = 0; /* add trailing null */ return full_path; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 556b1a0..db3f18c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -74,8 +74,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) cERROR(1, "%s: Could not init md5 shash\n", __func__); goto symlink_hash_err; } - crypto_shash_update(&sdescmd5->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update iwth link_str\n", __func__); + goto symlink_hash_err; + } rc = crypto_shash_final(&sdescmd5->shash, md5_hash); + if (rc) + cERROR(1, "%s: Could not generate md5 hash\n", __func__); symlink_hash_err: crypto_free_shash(md5); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 03a1f491..7c16933 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -585,15 +585,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cifs_set_oplock_level(pCifsInode, pSMB->OplockLevel ? OPLOCK_READ : 0); - /* - * cifs_oplock_break_put() can't be called - * from here. Get reference after queueing - * succeeded. cifs_oplock_break() will - * synchronize using cifs_file_list_lock. - */ - if (queue_work(system_nrt_wq, - &netfile->oplock_break)) - cifs_oplock_break_get(netfile); + queue_work(system_nrt_wq, + &netfile->oplock_break); netfile->oplock_break_cancelled = false; spin_unlock(&cifs_file_list_lock); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 965a3af..5de03ec 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -4,6 +4,7 @@ * Directory search handling * * Copyright (C) International Business Machines Corp., 2004, 2008 + * Copyright (C) Red Hat, Inc., 2011 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -290,10 +291,10 @@ error_exit: } /* return length of unicode string in bytes */ -static int cifs_unicode_bytelen(char *str) +static int cifs_unicode_bytelen(const char *str) { int len; - __le16 *ustr = (__le16 *)str; + const __le16 *ustr = (const __le16 *)str; for (len = 0; len <= PATH_MAX; len++) { if (ustr[len] == 0) @@ -334,78 +335,128 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) } +struct cifs_dirent { + const char *name; + size_t namelen; + u32 resume_key; + u64 ino; +}; + +static void cifs_fill_dirent_unix(struct cifs_dirent *de, + const FILE_UNIX_INFO *info, bool is_unicode) +{ + de->name = &info->FileName[0]; + if (is_unicode) + de->namelen = cifs_unicode_bytelen(de->name); + else + de->namelen = strnlen(de->name, PATH_MAX); + de->resume_key = info->ResumeKey; + de->ino = le64_to_cpu(info->basic.UniqueId); +} + +static void cifs_fill_dirent_dir(struct cifs_dirent *de, + const FILE_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_full(struct cifs_dirent *de, + const FILE_FULL_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_search(struct cifs_dirent *de, + const SEARCH_ID_FULL_DIR_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; + de->ino = le64_to_cpu(info->UniqueId); +} + +static void cifs_fill_dirent_both(struct cifs_dirent *de, + const FILE_BOTH_DIRECTORY_INFO *info) +{ + de->name = &info->FileName[0]; + de->namelen = le32_to_cpu(info->FileNameLength); + de->resume_key = info->FileIndex; +} + +static void cifs_fill_dirent_std(struct cifs_dirent *de, + const FIND_FILE_STANDARD_INFO *info) +{ + de->name = &info->FileName[0]; + /* one byte length, no endianess conversion */ + de->namelen = info->FileNameLength; + de->resume_key = info->ResumeKey; +} + +static int cifs_fill_dirent(struct cifs_dirent *de, const void *info, + u16 level, bool is_unicode) +{ + memset(de, 0, sizeof(*de)); + + switch (level) { + case SMB_FIND_FILE_UNIX: + cifs_fill_dirent_unix(de, info, is_unicode); + break; + case SMB_FIND_FILE_DIRECTORY_INFO: + cifs_fill_dirent_dir(de, info); + break; + case SMB_FIND_FILE_FULL_DIRECTORY_INFO: + cifs_fill_dirent_full(de, info); + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + cifs_fill_dirent_search(de, info); + break; + case SMB_FIND_FILE_BOTH_DIRECTORY_INFO: + cifs_fill_dirent_both(de, info); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_fill_dirent_std(de, info); + break; + default: + cFYI(1, "Unknown findfirst level %d", level); + return -EINVAL; + } + + return 0; +} + #define UNICODE_DOT cpu_to_le16(0x2e) /* return 0 if no match and 1 for . (current directory) and 2 for .. (parent) */ -static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) +static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode) { int rc = 0; - char *filename = NULL; - int len = 0; - - if (cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - filename = &pFindData->FileName[0]; - if (cfile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, 5); - } - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == - SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", - cfile->srch_inf.info_level); - } - if (filename) { - if (cfile->srch_inf.unicode) { - __le16 *ufilename = (__le16 *)filename; - if (len == 2) { - /* check for . */ - if (ufilename[0] == UNICODE_DOT) - rc = 1; - } else if (len == 4) { - /* check for .. */ - if ((ufilename[0] == UNICODE_DOT) - && (ufilename[1] == UNICODE_DOT)) - rc = 2; - } - } else /* ASCII */ { - if (len == 1) { - if (filename[0] == '.') - rc = 1; - } else if (len == 2) { - if ((filename[0] == '.') && (filename[1] == '.')) - rc = 2; - } + if (!de->name) + return 0; + + if (is_unicode) { + __le16 *ufilename = (__le16 *)de->name; + if (de->namelen == 2) { + /* check for . */ + if (ufilename[0] == UNICODE_DOT) + rc = 1; + } else if (de->namelen == 4) { + /* check for .. */ + if (ufilename[0] == UNICODE_DOT && + ufilename[1] == UNICODE_DOT) + rc = 2; + } + } else /* ASCII */ { + if (de->namelen == 1) { + if (de->name[0] == '.') + rc = 1; + } else if (de->namelen == 2) { + if (de->name[0] == '.' && de->name[1] == '.') + rc = 2; } } @@ -427,66 +478,18 @@ static int is_dir_changed(struct file *file) } static int cifs_save_resume_key(const char *current_entry, - struct cifsFileInfo *cifsFile) + struct cifsFileInfo *file_info) { - int rc = 0; - unsigned int len = 0; - __u16 level; - char *filename; - - if ((cifsFile == NULL) || (current_entry == NULL)) - return -EINVAL; - - level = cifsFile->srch_inf.info_level; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + struct cifs_dirent de; + int rc; - filename = &pFindData->FileName[0]; - if (cifsFile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; + rc = cifs_fill_dirent(&de, current_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (!rc) { + file_info->srch_inf.presume_name = de.name; + file_info->srch_inf.resume_name_len = de.namelen; + file_info->srch_inf.resume_key = de.resume_key; } - cifsFile->srch_inf.resume_name_len = len; - cifsFile->srch_inf.presume_name = filename; return rc; } @@ -605,136 +608,70 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, return rc; } -/* inode num, inode type and filename returned */ -static int cifs_get_name_from_search_buf(struct qstr *pqst, - char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, unsigned int max_len, __u64 *pinum) +static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, + void *dirent, char *scratch_buf, unsigned int max_len) { + struct cifsFileInfo *file_info = file->private_data; + struct super_block *sb = file->f_path.dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_dirent de = { NULL, }; + struct cifs_fattr fattr; + struct dentry *dentry; + struct qstr name; int rc = 0; - unsigned int len = 0; - char *filename; - struct nls_table *nlt = cifs_sb->local_nls; - - *pinum = 0; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - - filename = &pFindData->FileName[0]; - if (unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } + ino_t ino; - *pinum = le64_to_cpu(pFindData->basic.UniqueId); - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - *pinum = le64_to_cpu(pFindData->UniqueId); - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - } else { - cFYI(1, "Unknown findfirst level %d", level); - return -EINVAL; - } + rc = cifs_fill_dirent(&de, find_entry, file_info->srch_inf.info_level, + file_info->srch_inf.unicode); + if (rc) + return rc; - if (len > max_len) { - cERROR(1, "bad search response length %d past smb end", len); + if (de.namelen > max_len) { + cERROR(1, "bad search response length %zd past smb end", + de.namelen); return -EINVAL; } - if (unicode) { - pqst->len = cifs_from_ucs2((char *) pqst->name, - (__le16 *) filename, - UNICODE_NAME_MAX, - min(len, max_len), nlt, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - pqst->len -= nls_nullsize(nlt); - } else { - pqst->name = filename; - pqst->len = len; - } - return rc; -} - -static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, - void *direntry, char *scratch_buf, unsigned int max_len) -{ - int rc = 0; - struct qstr qstring; - struct cifsFileInfo *pCifsF; - u64 inum; - ino_t ino; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; - struct dentry *tmp_dentry; - struct cifs_fattr fattr; - - /* get filename and len into qstring */ - /* get dentry */ - /* decide whether to create and populate ionde */ - if ((direntry == NULL) || (file == NULL)) - return -EINVAL; - - pCifsF = file->private_data; - - if ((scratch_buf == NULL) || (pfindEntry == NULL) || (pCifsF == NULL)) - return -ENOENT; - - rc = cifs_entry_is_dot(pfindEntry, pCifsF); /* skip . and .. since we added them first */ - if (rc != 0) + if (cifs_entry_is_dot(&de, file_info->srch_inf.unicode)) return 0; - sb = file->f_path.dentry->d_sb; - cifs_sb = CIFS_SB(sb); - - qstring.name = scratch_buf; - rc = cifs_get_name_from_search_buf(&qstring, pfindEntry, - pCifsF->srch_inf.info_level, - pCifsF->srch_inf.unicode, cifs_sb, - max_len, &inum /* returned */); + if (file_info->srch_inf.unicode) { + struct nls_table *nlt = cifs_sb->local_nls; - if (rc) - return rc; + name.name = scratch_buf; + name.len = + cifs_from_ucs2((char *)name.name, (__le16 *)de.name, + UNICODE_NAME_MAX, + min(de.namelen, (size_t)max_len), nlt, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + name.len -= nls_nullsize(nlt); + } else { + name.name = de.name; + name.len = de.namelen; + } - if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) + switch (file_info->srch_inf.info_level) { + case SMB_FIND_FILE_UNIX: cifs_unix_basic_to_fattr(&fattr, - &((FILE_UNIX_INFO *) pfindEntry)->basic, - cifs_sb); - else if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) - cifs_std_info_to_fattr(&fattr, (FIND_FILE_STANDARD_INFO *) - pfindEntry, cifs_sb); - else - cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *) - pfindEntry, cifs_sb); + &((FILE_UNIX_INFO *)find_entry)->basic, + cifs_sb); + break; + case SMB_FIND_FILE_INFO_STANDARD: + cifs_std_info_to_fattr(&fattr, + (FIND_FILE_STANDARD_INFO *)find_entry, + cifs_sb); + break; + default: + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)find_entry, + cifs_sb); + break; + } - if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { - fattr.cf_uniqueid = inum; + if (de.ino && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + fattr.cf_uniqueid = de.ino; } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); cifs_autodisable_serverino(cifs_sb); @@ -750,12 +687,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); - tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr); + dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); - rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, - ino, fattr.cf_dtype); + rc = filldir(dirent, name.name, name.len, file->f_pos, ino, + fattr.cf_dtype); - dput(tmp_dentry); + dput(dentry); return rc; } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1c5b770..42b9fff 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -157,8 +157,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) cERROR(1, "%s: Could not init md4 shash\n", __func__); goto mdfour_err; } - crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); + if (rc) { + cERROR(1, "%s: Could not update with link_str\n", __func__); + goto mdfour_err; + } rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + if (rc) + cERROR(1, "%s: Could not genereate md4 hash\n", __func__); mdfour_err: crypto_free_shash(md4); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 147aa22..10ca6b2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -266,15 +266,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server, while (1) { if (atomic_read(&server->inFlight) >= cifs_max_pending) { spin_unlock(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&server->num_waiters); -#endif + cifs_num_waiters_inc(server); wait_event(server->request_q, atomic_read(&server->inFlight) < cifs_max_pending); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&server->num_waiters); -#endif + cifs_num_waiters_dec(server); spin_lock(&GlobalMid_Lock); } else { if (server->tcpStatus == CifsExiting) { @@ -362,6 +358,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid = AllocMidQEntry(hdr, server); if (mid == NULL) { mutex_unlock(&server->srv_mutex); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); return -ENOMEM; } @@ -379,15 +377,13 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid->callback = callback; mid->callback_data = cbdata; mid->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&server->inSend); -#endif + + cifs_in_send_inc(server); rc = smb_sendv(server, iov, nvec); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&server->inSend); - mid->when_sent = jiffies; -#endif + cifs_in_send_dec(server); + cifs_save_when_sent(mid); mutex_unlock(&server->srv_mutex); + if (rc) goto out_err; @@ -573,14 +569,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, } midQ->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->inSend); -#endif + cifs_in_send_inc(ses->server); rc = smb_sendv(ses->server, iov, n_vec); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->inSend); - midQ->when_sent = jiffies; -#endif + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); mutex_unlock(&ses->server->srv_mutex); @@ -701,14 +693,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } midQ->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->inSend); -#endif + + cifs_in_send_inc(ses->server); rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->inSend); - midQ->when_sent = jiffies; -#endif + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); mutex_unlock(&ses->server->srv_mutex); if (rc < 0) @@ -841,14 +830,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, } midQ->midState = MID_REQUEST_SUBMITTED; -#ifdef CONFIG_CIFS_STATS2 - atomic_inc(&ses->server->inSend); -#endif + cifs_in_send_inc(ses->server); rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); -#ifdef CONFIG_CIFS_STATS2 - atomic_dec(&ses->server->inSend); - midQ->when_sent = jiffies; -#endif + cifs_in_send_dec(ses->server); + cifs_save_when_sent(midQ); mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { diff --git a/fs/compat.c b/fs/compat.c index 0b48d01..58b1da4 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1675,11 +1675,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } #endif /* HAVE_SET_RESTORE_SIGMASK */ -long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) -{ - return sys_ni_syscall(); -} - #ifdef CONFIG_EPOLL #ifdef HAVE_SET_RESTORE_SIGMASK diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 8be086e..51352de 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT) COMPATIBLE_IOCTL(PPPIOCDISCONN) COMPATIBLE_IOCTL(PPPIOCATTCHAN) COMPATIBLE_IOCTL(PPPIOCGCHAN) +COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) /* PPPOX */ COMPATIBLE_IOCTL(PPPOEIOCSFWD) COMPATIBLE_IOCTL(PPPOEIOCDFWD) diff --git a/fs/dcache.c b/fs/dcache.c index be18598..a88948b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -301,6 +301,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) return parent; } +/* + * Unhash a dentry without inserting an RCU walk barrier or checking that + * dentry->d_lock is locked. The caller must take care of that, if + * appropriate. + */ +static void __d_shrink(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + struct hlist_bl_head *b; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + b = &dentry->d_sb->s_anon; + else + b = d_hash(dentry->d_parent, dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + dentry->d_hash.pprev = NULL; + hlist_bl_unlock(b); + } +} + /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -319,17 +340,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { - struct hlist_bl_head *b; - if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) - b = &dentry->d_sb->s_anon; - else - b = d_hash(dentry->d_parent, dentry->d_name.hash); - - hlist_bl_lock(b); - __hlist_bl_del(&dentry->d_hash); - dentry->d_hash.pprev = NULL; - hlist_bl_unlock(b); - + __d_shrink(dentry); dentry_rcuwalk_barrier(dentry); } } @@ -784,6 +795,7 @@ relock: /** * prune_dcache_sb - shrink the dcache + * @sb: superblock * @nr_to_scan: number of entries to try to free * * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is @@ -828,44 +840,24 @@ EXPORT_SYMBOL(shrink_dcache_sb); static void shrink_dcache_for_umount_subtree(struct dentry *dentry) { struct dentry *parent; - unsigned detached = 0; BUG_ON(!IS_ROOT(dentry)); - /* detach this root from the system */ - spin_lock(&dentry->d_lock); - dentry_lru_del(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - for (;;) { /* descend to the first leaf in the current subtree */ - while (!list_empty(&dentry->d_subdirs)) { - struct dentry *loop; - - /* this is a branch with children - detach all of them - * from the system in one go */ - spin_lock(&dentry->d_lock); - list_for_each_entry(loop, &dentry->d_subdirs, - d_u.d_child) { - spin_lock_nested(&loop->d_lock, - DENTRY_D_LOCK_NESTED); - dentry_lru_del(loop); - __d_drop(loop); - spin_unlock(&loop->d_lock); - } - spin_unlock(&dentry->d_lock); - - /* move to the first child */ + while (!list_empty(&dentry->d_subdirs)) dentry = list_entry(dentry->d_subdirs.next, struct dentry, d_u.d_child); - } /* consume the dentries from this leaf up through its parents * until we find one with children or run out altogether */ do { struct inode *inode; + /* detach from the system */ + dentry_lru_del(dentry); + __d_shrink(dentry); + if (dentry->d_count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" @@ -886,14 +878,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) list_del(&dentry->d_u.d_child); } else { parent = dentry->d_parent; - spin_lock(&parent->d_lock); parent->d_count--; list_del(&dentry->d_u.d_child); - spin_unlock(&parent->d_lock); } - detached++; - inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; @@ -938,9 +926,7 @@ void shrink_dcache_for_umount(struct super_block *sb) dentry = sb->s_root; sb->s_root = NULL; - spin_lock(&dentry->d_lock); dentry->d_count--; - spin_unlock(&dentry->d_lock); shrink_dcache_for_umount_subtree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { @@ -1743,7 +1729,7 @@ seqretry: */ if (read_seqcount_retry(&dentry->d_seq, *seq)) goto seqretry; - if (parent->d_flags & DCACHE_OP_COMPARE) { + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (parent->d_op->d_compare(parent, *inode, dentry, i, tlen, tname, name)) @@ -2138,8 +2124,9 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller hold - * rename_lock. + * dcache entries should not be moved in this way. Caller must hold + * rename_lock, the i_mutex of the source and target directories, + * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry * dentry, struct dentry * target) { @@ -2202,7 +2189,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. + * dcache entries should not be moved in this way. See the locking + * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { @@ -2320,7 +2308,8 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) * @inode: inode to bind to the dentry, to which aliases may be attached * * Introduces an dentry into the tree, substituting an extant disconnected - * root directory alias in its place if there is one + * root directory alias in its place if there is one. Caller must hold the + * i_mutex of the parent directory. */ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 01d2d9e..44a360c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -35,7 +35,7 @@ #include <linux/buffer_head.h> #include <linux/rwsem.h> #include <linux/uio.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * How many user pages to map in one call to get_user_pages(). This determines diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 1cd6d9d..cc16562 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,6 +1,6 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && CRYPTO + depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC select CRYPTO_MD5 diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 43c7c43..b36c557 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -29,6 +29,7 @@ #define ECRYPTFS_KERNEL_H #include <keys/user-type.h> +#include <keys/encrypted-type.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> @@ -36,125 +37,18 @@ #include <linux/hash.h> #include <linux/nsproxy.h> #include <linux/backing-dev.h> +#include <linux/ecryptfs.h> -/* Version verification for shared data structures w/ userspace */ -#define ECRYPTFS_VERSION_MAJOR 0x00 -#define ECRYPTFS_VERSION_MINOR 0x04 -#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x03 -/* These flags indicate which features are supported by the kernel - * module; userspace tools such as the mount helper read - * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine - * how to behave. */ -#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 -#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 -#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 -#define ECRYPTFS_VERSIONING_POLICY 0x00000008 -#define ECRYPTFS_VERSIONING_XATTR 0x00000010 -#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 -#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 -#define ECRYPTFS_VERSIONING_HMAC 0x00000080 -#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 -#define ECRYPTFS_VERSIONING_GCM 0x00000200 -#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ - | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ - | ECRYPTFS_VERSIONING_PUBKEY \ - | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC \ - | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) -#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 -#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH -#define ECRYPTFS_SALT_SIZE 8 -#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) -/* The original signature size is only for what is stored on disk; all - * in-memory representations are expanded hex, so it better adapted to - * be passed around or referenced on the command line */ -#define ECRYPTFS_SIG_SIZE 8 -#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) -#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX -#define ECRYPTFS_MAX_KEY_BYTES 64 -#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 #define ECRYPTFS_DEFAULT_IV_BYTES 16 -#define ECRYPTFS_FILE_VERSION 0x03 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 #define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) -#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_XATTR_NAME "user.ecryptfs" -#define RFC2440_CIPHER_DES3_EDE 0x02 -#define RFC2440_CIPHER_CAST_5 0x03 -#define RFC2440_CIPHER_BLOWFISH 0x04 -#define RFC2440_CIPHER_AES_128 0x07 -#define RFC2440_CIPHER_AES_192 0x08 -#define RFC2440_CIPHER_AES_256 0x09 -#define RFC2440_CIPHER_TWOFISH 0x0a -#define RFC2440_CIPHER_CAST_6 0x0b - -#define RFC2440_CIPHER_RSA 0x01 - -/** - * For convenience, we may need to pass around the encrypted session - * key between kernel and userspace because the authentication token - * may not be extractable. For example, the TPM may not release the - * private key, instead requiring the encrypted data and returning the - * decrypted data. - */ -struct ecryptfs_session_key { -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 -#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 -#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 -#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 - u32 flags; - u32 encrypted_key_size; - u32 decrypted_key_size; - u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; - u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; -}; - -struct ecryptfs_password { - u32 password_bytes; - s32 hash_algo; - u32 hash_iterations; - u32 session_key_encryption_key_bytes; -#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 -#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 - u32 flags; - /* Iterated-hash concatenation of salt and passphrase */ - u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - /* Always in expanded hex */ - u8 salt[ECRYPTFS_SALT_SIZE]; -}; - -enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; - -struct ecryptfs_private_key { - u32 key_size; - u32 data_len; - u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; - char pki_type[ECRYPTFS_MAX_PKI_NAME_BYTES + 1]; - u8 data[]; -}; - -/* May be a password or a private key */ -struct ecryptfs_auth_tok { - u16 version; /* 8-bit major and 8-bit minor */ - u16 token_type; -#define ECRYPTFS_ENCRYPT_ONLY 0x00000001 - u32 flags; - struct ecryptfs_session_key session_key; - u8 reserved[32]; - union { - struct ecryptfs_password password; - struct ecryptfs_private_key private_key; - } token; -} __attribute__ ((packed)); - void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); @@ -185,11 +79,47 @@ struct ecryptfs_page_crypt_context { } param; }; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + if (key->type == &key_type_encrypted) + return (struct ecryptfs_auth_tok *) + (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + else + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return request_key(&key_type_encrypted, sig, NULL); +} + +#else +static inline struct ecryptfs_auth_tok * +ecryptfs_get_encrypted_key_payload_data(struct key *key) +{ + return NULL; +} + +static inline struct key *ecryptfs_get_encrypted_key(char *sig) +{ + return ERR_PTR(-ENOKEY); +} + +#endif /* CONFIG_ENCRYPTED_KEYS */ + static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { - return (struct ecryptfs_auth_tok *) - (((struct user_key_payload*)key->payload.data)->data); + struct ecryptfs_auth_tok *auth_tok; + + auth_tok = ecryptfs_get_encrypted_key_payload_data(key); + if (!auth_tok) + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload *)key->payload.data)->data); + else + return auth_tok; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 340c657..11f8582 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -69,6 +69,7 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; + inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index fa8049e..ac1ad48 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1635,11 +1635,14 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, (*auth_tok_key) = request_key(&key_type_user, sig, NULL); if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { - printk(KERN_ERR "Could not find key with description: [%s]\n", - sig); - rc = process_request_key_err(PTR_ERR(*auth_tok_key)); - (*auth_tok_key) = NULL; - goto out; + (*auth_tok_key) = ecryptfs_get_encrypted_key(sig); + if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { + printk(KERN_ERR "Could not find key with description: [%s]\n", + sig); + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; + goto out; + } } down_write(&(*auth_tok_key)->sem); rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok); @@ -1868,11 +1871,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, * just one will be sufficient to decrypt to get the FEK. */ find_next_matching_auth_tok: found_auth_tok = 0; - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - auth_tok_key = NULL; - } list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { @@ -1909,14 +1907,22 @@ found_matching_auth_tok: memcpy(&(candidate_auth_tok->token.private_key), &(matching_auth_tok->token.private_key), sizeof(struct ecryptfs_private_key)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); rc = decrypt_pki_encrypted_session_key(candidate_auth_tok, crypt_stat); } else if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD) { memcpy(&(candidate_auth_tok->token.password), &(matching_auth_tok->token.password), sizeof(struct ecryptfs_password)); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); rc = decrypt_passphrase_encrypted_session_key( candidate_auth_tok, crypt_stat); + } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); + rc = -EINVAL; } if (rc) { struct ecryptfs_auth_tok_list_item *auth_tok_list_item_tmp; @@ -1956,21 +1962,18 @@ found_matching_auth_tok: out_wipe_list: wipe_auth_tok_list(&auth_tok_list); out: - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - } return rc; } static int -pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, +pki_encrypt_session_key(struct key *auth_tok_key, + struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_key_record *key_rec) { struct ecryptfs_msg_ctx *msg_ctx = NULL; char *payload = NULL; - size_t payload_len; + size_t payload_len = 0; struct ecryptfs_message *msg; int rc; @@ -1979,6 +1982,8 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, crypt_stat->cipher, crypt_stat->key_size), crypt_stat, &payload, &payload_len); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); goto out; @@ -2008,6 +2013,8 @@ out: * write_tag_1_packet - Write an RFC2440-compatible tag 1 (public key) packet * @dest: Buffer into which to write the packet * @remaining_bytes: Maximum number of bytes that can be writtn + * @auth_tok_key: The authentication token key to unlock and put when done with + * @auth_tok * @auth_tok: The authentication token used for generating the tag 1 packet * @crypt_stat: The cryptographic context * @key_rec: The key record struct for the tag 1 packet @@ -2018,7 +2025,7 @@ out: */ static int write_tag_1_packet(char *dest, size_t *remaining_bytes, - struct ecryptfs_auth_tok *auth_tok, + struct key *auth_tok_key, struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_key_record *key_rec, size_t *packet_size) { @@ -2039,12 +2046,15 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes, memcpy(key_rec->enc_key, auth_tok->session_key.encrypted_key, auth_tok->session_key.encrypted_key_size); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); goto encrypted_session_key_set; } if (auth_tok->session_key.encrypted_key_size == 0) auth_tok->session_key.encrypted_key_size = auth_tok->token.private_key.key_size; - rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); + rc = pki_encrypt_session_key(auth_tok_key, auth_tok, crypt_stat, + key_rec); if (rc) { printk(KERN_ERR "Failed to encrypt session key via a key " "module; rc = [%d]\n", rc); @@ -2421,6 +2431,8 @@ ecryptfs_generate_key_packet_set(char *dest_base, &max, auth_tok, crypt_stat, key_rec, &written); + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " "writing tag 3 packet\n"); @@ -2438,8 +2450,8 @@ ecryptfs_generate_key_packet_set(char *dest_base, } (*len) += written; } else if (auth_tok->token_type == ECRYPTFS_PRIVATE_KEY) { - rc = write_tag_1_packet(dest_base + (*len), - &max, auth_tok, + rc = write_tag_1_packet(dest_base + (*len), &max, + auth_tok_key, auth_tok, crypt_stat, key_rec, &written); if (rc) { ecryptfs_printk(KERN_WARNING, "Error " @@ -2448,14 +2460,13 @@ ecryptfs_generate_key_packet_set(char *dest_base, } (*len) += written; } else { + up_write(&(auth_tok_key->sem)); + key_put(auth_tok_key); ecryptfs_printk(KERN_WARNING, "Unsupported " "authentication token type\n"); rc = -EINVAL; goto out_free; } - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - auth_tok_key = NULL; } if (likely(max > 0)) { dest_base[(*len)] = 0x00; @@ -2468,11 +2479,6 @@ out_free: out: if (rc) (*len) = 0; - if (auth_tok_key) { - up_write(&(auth_tok_key->sem)); - key_put(auth_tok_key); - } - mutex_unlock(&crypt_stat->keysig_list_mutex); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9f1bb74..b4a6bef 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { @@ -191,6 +192,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; @@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat( * ecryptfs_parse_options * @sb: The ecryptfs super block * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output @@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) { char *p; int rc = 0; @@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; + *check_ruid = 0; + if (!options) { rc = -EINVAL; goto out; @@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags const char *err = "Getting sb failed"; struct inode *inode; struct path path; + uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); @@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = ecryptfs_parse_options(sbi, raw_data); + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; @@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags "known incompatibilities\n"); goto out_free; } + + if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + path.dentry->d_inode->i_uid, current_uid()); + goto out_free; + } + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 85d4309..3745f7c 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -39,15 +39,16 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { - struct ecryptfs_inode_info *inode_info; + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - inode_info = ecryptfs_inode_to_private(ecryptfs_inode); - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_write(inode_info->lower_file, data, size, &offset); + rc = vfs_write(lower_file, data, size, &offset); set_fs(fs_save); mark_inode_dirty_sync(ecryptfs_inode); return rc; @@ -225,15 +226,16 @@ out: int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode) { - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(ecryptfs_inode); + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_read(inode_info->lower_file, data, size, &offset); + rc = vfs_read(lower_file, data, size, &offset); set_fs(fs_save); return rc; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5e480d5..9026fc9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,7 +37,7 @@ #include <asm/system.h> #include <asm/io.h> #include <asm/mman.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * LOCKING: @@ -181,14 +181,7 @@ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) return; bprm->vma_pages = pages; - -#ifdef SPLIT_RSS_COUNTING - add_mm_counter(mm, MM_ANONPAGES, diff); -#else - spin_lock(&mm->page_table_lock); add_mm_counter(mm, MM_ANONPAGES, diff); - spin_unlock(&mm->page_table_lock); -#endif } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, @@ -277,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; @@ -1430,9 +1423,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) } } read_unlock(&binfmt_lock); +#ifdef CONFIG_MODULES if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_MODULES } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && @@ -1440,9 +1433,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) printable(bprm->buf[2]) && printable(bprm->buf[3])) break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); -#endif } +#else + break; +#endif } return retval; } @@ -1462,6 +1459,23 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; bool clear_in_exec; int retval; + const struct cred *cred = current_cred(); + + /* + * We move the actual failure in case of RLIMIT_NPROC excess from + * set*uid() to execve() because too many poorly written programs + * don't check setuid() return code. Here we additionally recheck + * whether NPROC limit is still exceeded. + */ + if ((current->flags & PF_NPROC_EXCEEDED) && + atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { + retval = -EAGAIN; + goto out_ret; + } + + /* We're below the limit (still or again), so we don't want to make + * further execve() calls fail. */ + current->flags &= ~PF_NPROC_EXCEEDED; retval = unshare_files(&displaced); if (retval) @@ -1649,15 +1663,26 @@ expand_fail: return ret; } +static void cn_escape(char *str) +{ + for (; *str; str++) + if (*str == '/') + *str = '!'; +} + static int cn_print_exe_file(struct core_name *cn) { struct file *exe_file; - char *pathbuf, *path, *p; + char *pathbuf, *path; int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) - return cn_printf(cn, "(unknown)"); + if (!exe_file) { + char *commstart = cn->corename + cn->used; + ret = cn_printf(cn, "%s (path unknown)", current->comm); + cn_escape(commstart); + return ret; + } pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -1671,9 +1696,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - for (p = path; *p; p++) - if (*p == '/') - *p = '!'; + cn_escape(path); ret = cn_printf(cn, "%s", path); @@ -1745,16 +1768,22 @@ static int format_corename(struct core_name *cn, long signr) break; } /* hostname */ - case 'h': + case 'h': { + char *namestart = cn->corename + cn->used; down_read(&uts_sem); err = cn_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); + cn_escape(namestart); break; + } /* executable */ - case 'e': + case 'e': { + char *commstart = cn->corename + cn->used; err = cn_printf(cn, "%s", current->comm); + cn_escape(commstart); break; + } case 'E': err = cn_print_exe_file(cn); break; @@ -2118,16 +2147,16 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(&cn, signr); - if (ispipe == -ENOMEM) { - printk(KERN_WARNING "format_corename failed\n"); - printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; - } - if (ispipe) { int dump_count; char **helper_argv; + if (ispipe < 0) { + printk(KERN_WARNING "format_corename failed\n"); + printk(KERN_WARNING "Aborting core\n"); + goto fail_corename; + } + if (cprm.limit == 1) { /* * Normally core limits are irrelevant to pipes, since diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 2d0f757..c5a5855 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -12,5 +12,8 @@ # Kbuild - Gets included from the Kernels Makefile and build system # -exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o +# ore module library +obj-$(CONFIG_ORE) += ore.o + +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 86194b2..70bae41 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -1,6 +1,10 @@ +config ORE + tristate + config EXOFS_FS tristate "exofs: OSD based file system support" depends on SCSI_OSD_ULD + select ORE help EXOFS is a file system that uses an OSD storage device, as its backing storage. diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c965806..f4e442e 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -36,12 +36,9 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/backing-dev.h> -#include "common.h" +#include <scsi/osd_ore.h> -/* FIXME: Remove once pnfs hits mainline - * #include <linux/exportfs/pnfs_osd_xdr.h> - */ -#include "pnfs.h" +#include "common.h" #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) @@ -56,27 +53,11 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) -struct exofs_layout { - osd_id s_pid; /* partition ID of file system*/ - - /* Our way of looking at the data_map */ - unsigned stripe_unit; - unsigned mirrors_p1; - - unsigned group_width; - u64 group_depth; - unsigned group_count; - - enum exofs_inode_layout_gen_functions lay_func; - - unsigned s_numdevs; /* Num of devices in array */ - struct osd_dev *s_ods[0]; /* Variable length */ -}; - /* * our extension to the in-memory superblock */ struct exofs_sb_info { + struct backing_dev_info bdi; /* register our bdi with VFS */ struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ @@ -84,16 +65,13 @@ struct exofs_sb_info { spinlock_t s_next_gen_lock; /* spinlock for gen # update */ u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ - struct backing_dev_info bdi; /* register our bdi with VFS */ struct pnfs_osd_data_map data_map; /* Default raid to use * FIXME: Needed ? */ -/* struct exofs_layout dir_layout;*/ /* Default dir layout */ - struct exofs_layout layout; /* Default files layout, - * contains the variable osd_dev - * array. Keep last */ + struct ore_layout layout; /* Default files layout */ + struct ore_comp one_comp; /* id & cred of partition id=0*/ + struct ore_components comps; /* comps for the partition */ struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; @@ -107,7 +85,8 @@ struct exofs_i_info { uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ - uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ + struct ore_comp one_comp; /* same component for all devices */ + struct ore_components comps; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -115,52 +94,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; } -struct exofs_io_state; -typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); - -struct exofs_io_state { - struct kref kref; - - void *private; - exofs_io_done_fn done; - - struct exofs_layout *layout; - struct osd_obj_id obj; - u8 *cred; - - /* Global read/write IO*/ - loff_t offset; - unsigned long length; - void *kern_buff; - - struct page **pages; - unsigned nr_pages; - unsigned pgbase; - unsigned pages_consumed; - - /* Attributes */ - unsigned in_attr_len; - struct osd_attr *in_attr; - unsigned out_attr_len; - struct osd_attr *out_attr; - - /* Variable array of size numdevs */ - unsigned numdevs; - struct exofs_per_dev_state { - struct osd_request *or; - struct bio *bio; - loff_t offset; - unsigned length; - unsigned dev; - } per_dev[]; -}; - -static inline unsigned exofs_io_state_size(unsigned numdevs) -{ - return sizeof(struct exofs_io_state) + - sizeof(struct exofs_per_dev_state) * numdevs; -} - /* * our inode flags */ @@ -205,12 +138,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) } /* - * Given a layout, object_number and stripe_index return the associated global - * dev_index - */ -unsigned exofs_layout_od_id(struct exofs_layout *layout, - osd_id obj_no, unsigned layout_index); -/* * Maximum count of links to a file */ #define EXOFS_LINK_MAX 32000 @@ -219,44 +146,8 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout, * function declarations * *************************/ -/* ios.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); -int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length); - -int exofs_get_io_state(struct exofs_layout *layout, - struct exofs_io_state **ios); -void exofs_put_io_state(struct exofs_io_state *ios); - -int exofs_check_io(struct exofs_io_state *ios, u64 *resid); - -int exofs_sbi_create(struct exofs_io_state *ios); -int exofs_sbi_remove(struct exofs_io_state *ios); -int exofs_sbi_write(struct exofs_io_state *ios); -int exofs_sbi_read(struct exofs_io_state *ios); - -int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); - -int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); -static inline int exofs_oi_write(struct exofs_i_info *oi, - struct exofs_io_state *ios) -{ - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - return exofs_sbi_write(ios); -} - -static inline int exofs_oi_read(struct exofs_i_info *oi, - struct exofs_io_state *ios) -{ - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - return exofs_sbi_read(ios); -} - /* inode.c */ -unsigned exofs_max_io_pages(struct exofs_layout *layout, +unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages); int exofs_setattr(struct dentry *, struct iattr *); int exofs_write_begin(struct file *file, struct address_space *mapping, @@ -281,6 +172,8 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, struct inode *); /* super.c */ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); int exofs_sbi_write_stats(struct exofs_sb_info *sbi); /********************* @@ -295,7 +188,6 @@ extern const struct file_operations exofs_file_operations; /* inode.c */ extern const struct address_space_operations exofs_aops; -extern const struct osd_attr g_attr_logical_length; /* namei.c */ extern const struct inode_operations exofs_dir_inode_operations; @@ -305,4 +197,33 @@ extern const struct inode_operations exofs_special_inode_operations; extern const struct inode_operations exofs_symlink_inode_operations; extern const struct inode_operations exofs_fast_symlink_inode_operations; +/* exofs_init_comps will initialize an ore_components device array + * pointing to a single ore_comp struct, and a round-robin view + * of the device table. + * The first device of each inode is the [inode->ino % num_devices] + * and the rest of the devices sequentially following where the + * first device is after the last device. + * It is assumed that the global device array at @sbi is twice + * bigger and that the device table repeats twice. + * See: exofs_read_lookup_dev_table() + */ +static inline void exofs_init_comps(struct ore_components *comps, + struct ore_comp *one_comp, + struct exofs_sb_info *sbi, osd_id oid) +{ + unsigned dev_mod = (unsigned)oid, first_dev; + + one_comp->obj.partition = sbi->one_comp.obj.partition; + one_comp->obj.id = oid; + exofs_make_credential(one_comp->cred, &one_comp->obj); + + comps->numdevs = sbi->comps.numdevs; + comps->single_comp = EC_SINGLE_COMP; + comps->comps = one_comp; + + /* Round robin device view of the table */ + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; + comps->ods = sbi->comps.ods + first_dev; +} + #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 8472c09..f39a38f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -43,7 +43,7 @@ enum { BIO_MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), }; -unsigned exofs_max_io_pages(struct exofs_layout *layout, +unsigned exofs_max_io_pages(struct ore_layout *layout, unsigned expected_pages) { unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); @@ -58,7 +58,7 @@ struct page_collect { struct exofs_sb_info *sbi; struct inode *inode; unsigned expected_pages; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct page **pages; unsigned alloc_pages; @@ -110,13 +110,6 @@ static int pcol_try_alloc(struct page_collect *pcol) { unsigned pages; - if (!pcol->ios) { /* First time allocate io_state */ - int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); - - if (ret) - return ret; - } - /* TODO: easily support bio chaining */ pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); @@ -140,7 +133,7 @@ static void pcol_free(struct page_collect *pcol) pcol->pages = NULL; if (pcol->ios) { - exofs_put_io_state(pcol->ios); + ore_put_io_state(pcol->ios); pcol->ios = NULL; } } @@ -200,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol) u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, &resid); if (likely(!ret)) good_bytes = pcol->length; @@ -241,7 +234,7 @@ static int __readpages_done(struct page_collect *pcol) } /* callback of async reads */ -static void readpages_done(struct exofs_io_state *ios, void *p) +static void readpages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; @@ -269,20 +262,28 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct exofs_io_state *ios = pcol->ios; + struct ore_io_state *ios; struct page_collect *pcol_copy = NULL; int ret; if (!pcol->pages) return 0; + if (!pcol->ios) { + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + + if (ret) + return ret; + } + + ios = pcol->ios; ios->pages = pcol->pages; ios->nr_pages = pcol->nr_pages; - ios->length = pcol->length; - ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; if (pcol->read_4_write) { - exofs_oi_read(oi, pcol->ios); + ore_read(pcol->ios); return __readpages_done(pcol); } @@ -295,14 +296,14 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; - ret = exofs_oi_read(oi, ios); + ret = ore_read(ios); if (unlikely(ret)) goto err; atomic_inc(&pcol->sbi->s_curr_pending); EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - ios->obj.id, _LLU(ios->offset), pcol->length); + oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); /* pages ownership was passed to pcol_copy */ _pcol_reset(pcol); @@ -457,14 +458,14 @@ static int exofs_readpage(struct file *file, struct page *page) } /* Callback for osd_write. All writes are asynchronous */ -static void writepages_done(struct exofs_io_state *ios, void *p) +static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_io(ios, &resid); + int ret = ore_check_io(ios, &resid); atomic_dec(&pcol->sbi->s_curr_pending); @@ -507,13 +508,21 @@ static void writepages_done(struct exofs_io_state *ios, void *p) static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct exofs_io_state *ios = pcol->ios; + struct ore_io_state *ios; struct page_collect *pcol_copy = NULL; int ret; if (!pcol->pages) return 0; + BUG_ON(pcol->ios); + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + + if (unlikely(ret)) + goto err; + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); if (!pcol_copy) { EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); @@ -523,16 +532,15 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; + ios = pcol->ios; ios->pages = pcol_copy->pages; ios->nr_pages = pcol_copy->nr_pages; - ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; - ios->length = pcol_copy->length; ios->done = writepages_done; ios->private = pcol_copy; - ret = exofs_oi_write(oi, ios); + ret = ore_write(ios); if (unlikely(ret)) { - EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); + EXOFS_ERR("write_exec: ore_write() Failed\n"); goto err; } @@ -844,17 +852,15 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode) return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); } -const struct osd_attr g_attr_logical_length = ATTR_DEF( - OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); - static int _do_truncate(struct inode *inode, loff_t newsize) { struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = exofs_oi_truncate(oi, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); @@ -917,30 +923,26 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, [1] = g_attr_inode_file_layout, [2] = g_attr_inode_dir_layout, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct exofs_on_disk_inode_layout *layout; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - ios->obj.id = exofs_oi_objno(oi); - exofs_make_credential(oi->i_cred, &ios->obj); - ios->cred = oi->i_cred; - - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) { EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", - _LLU(ios->obj.id), ret); + _LLU(oi->one_comp.obj.id), ret); memset(inode, 0, sizeof(*inode)); inode->i_mode = 0040000 | (0777 & ~022); /* If object is lost on target we might as well enable it's @@ -990,7 +992,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, } out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } @@ -1016,6 +1018,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); + exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); /* read the inode from the osd */ ret = exofs_get_inode(sb, oi, &fcb); @@ -1107,21 +1111,22 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) * set the obj_created flag so that other methods know that the object exists on * the OSD. */ -static void create_done(struct exofs_io_state *ios, void *p) +static void create_done(struct ore_io_state *ios, void *p) { struct inode *inode = p; struct exofs_i_info *oi = exofs_i(inode); struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; - ret = exofs_check_io(ios, NULL); - exofs_put_io_state(ios); + ret = ore_check_io(ios, NULL); + ore_put_io_state(ios); atomic_dec(&sbi->s_curr_pending); if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); + _LLU(exofs_oi_objno(oi)), + _LLU(oi->one_comp.obj.partition)); /*TODO: When FS is corrupted creation can fail, object already * exist. Get rid of this asynchronous creation, if exist * increment the obj counter and try the next object. Until we @@ -1140,14 +1145,13 @@ static void create_done(struct exofs_io_state *ios, void *p) */ struct inode *exofs_new_inode(struct inode *dir, int mode) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; struct inode *inode; struct exofs_i_info *oi; - struct exofs_sb_info *sbi; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - sb = dir->i_sb; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -1157,8 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) set_obj_2bcreated(oi); - sbi = sb->s_fs_info; - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; @@ -1170,25 +1172,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); + exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ mark_inode_dirty(inode); - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); if (unlikely(ret)) { - EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); } - ios->obj.id = exofs_oi_objno(oi); - exofs_make_credential(oi->i_cred, &ios->obj); - ios->done = create_done; ios->private = inode; - ios->cred = oi->i_cred; - ret = exofs_sbi_create(ios); + + ret = ore_create(ios); if (ret) { - exofs_put_io_state(ios); + ore_put_io_state(ios); return ERR_PTR(ret); } atomic_inc(&sbi->s_curr_pending); @@ -1207,11 +1208,11 @@ struct updatei_args { /* * Callback function from exofs_update_inode(). */ -static void updatei_done(struct exofs_io_state *ios, void *p) +static void updatei_done(struct ore_io_state *ios, void *p) { struct updatei_args *args = p; - exofs_put_io_state(ios); + ore_put_io_state(ios); atomic_dec(&args->sbi->s_curr_pending); @@ -1227,7 +1228,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct osd_attr attr; struct exofs_fcb *fcb; struct updatei_args *args; @@ -1266,9 +1267,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; } @@ -1285,13 +1286,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync) ios->private = args; } - ret = exofs_oi_write(oi, ios); + ret = ore_write(ios); if (!do_sync && !ret) { atomic_inc(&sbi->s_curr_pending); goto out; /* deallocation in updatei_done */ } - exofs_put_io_state(ios); + ore_put_io_state(ios); free_args: kfree(args); out: @@ -1310,11 +1311,11 @@ int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) * Callback function from exofs_delete_inode() - don't have much cleaning up to * do. */ -static void delete_done(struct exofs_io_state *ios, void *p) +static void delete_done(struct ore_io_state *ios, void *p) { struct exofs_sb_info *sbi = p; - exofs_put_io_state(ios); + ore_put_io_state(ios); atomic_dec(&sbi->s_curr_pending); } @@ -1329,7 +1330,7 @@ void exofs_evict_inode(struct inode *inode) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; truncate_inode_pages(&inode->i_data, 0); @@ -1349,20 +1350,19 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; } - ios->obj.id = exofs_oi_objno(oi); ios->done = delete_done; ios->private = sbi; - ios->cred = oi->i_cred; - ret = exofs_sbi_remove(ios); + + ret = ore_remove(ios); if (ret) { - EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); - exofs_put_io_state(ios); + EXOFS_ERR("%s: ore_remove failed\n", __func__); + ore_put_io_state(ios); return; } atomic_inc(&sbi->s_curr_pending); diff --git a/fs/exofs/ios.c b/fs/exofs/ore.c index f74a2ec..25305af 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ore.c @@ -23,81 +23,87 @@ */ #include <linux/slab.h> -#include <scsi/scsi_device.h> #include <asm/div64.h> -#include "exofs.h" +#include <scsi/osd_ore.h> -#define EXOFS_DBGMSG2(M...) do {} while (0) -/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif -int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length) -{ - struct osd_request *or = osd_start_request(od, GFP_KERNEL); -/* struct osd_sense_info osi = {.key = 0};*/ - int ret; +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) - if (unlikely(!or)) { - EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); - return -ENOMEM; - } - ret = osd_req_read_kern(or, obj, offset, p, length); - if (unlikely(ret)) { - EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); - goto out; - } +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ - ret = osd_finalize_request(or, 0, cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); - goto out; - } +MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); - ret = osd_execute_request(or); - if (unlikely(ret)) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ +static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) +{ + return ios->comps->comps[index & ios->comps->single_comp].cred; +} -out: - osd_end_request(or); - return ret; +static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) +{ + return &ios->comps->comps[index & ios->comps->single_comp].obj; } -int exofs_get_io_state(struct exofs_layout *layout, - struct exofs_io_state **pios) +static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) { - struct exofs_io_state *ios; + return ios->comps->ods[index]; +} + +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; /*TODO: Maybe use kmem_cach per sbi of size * exofs_io_state_size(layout->s_numdevs) */ - ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); + ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); if (unlikely(!ios)) { - EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", - exofs_io_state_size(layout->s_numdevs)); + ORE_DBGMSG("Failed kzalloc bytes=%d\n", + ore_io_state_size(comps->numdevs)); *pios = NULL; return -ENOMEM; } ios->layout = layout; - ios->obj.partition = layout->s_pid; + ios->comps = comps; + ios->offset = offset; + ios->length = length; + ios->reading = is_reading; + *pios = ios; return 0; } +EXPORT_SYMBOL(ore_get_rw_state); + +int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, + struct ore_io_state **ios) +{ + return ore_get_rw_state(layout, comps, true, 0, 0, ios); +} +EXPORT_SYMBOL(ore_get_io_state); -void exofs_put_io_state(struct exofs_io_state *ios) +void ore_put_io_state(struct ore_io_state *ios) { if (ios) { unsigned i; for (i = 0; i < ios->numdevs; i++) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; if (per_dev->or) osd_end_request(per_dev->or); @@ -108,31 +114,9 @@ void exofs_put_io_state(struct exofs_io_state *ios) kfree(ios); } } +EXPORT_SYMBOL(ore_put_io_state); -unsigned exofs_layout_od_id(struct exofs_layout *layout, - osd_id obj_no, unsigned layout_index) -{ -/* switch (layout->lay_func) { - case LAYOUT_MOVING_WINDOW: - {*/ - unsigned dev_mod = obj_no; - - return (layout_index + dev_mod * layout->mirrors_p1) % - layout->s_numdevs; -/* } - case LAYOUT_FUNC_IMPLICT: - return layout->devs[layout_index]; - }*/ -} - -static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, - unsigned layout_index) -{ - return ios->layout->s_ods[ - exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; -} - -static void _sync_done(struct exofs_io_state *ios, void *p) +static void _sync_done(struct ore_io_state *ios, void *p) { struct completion *waiting = p; @@ -141,20 +125,20 @@ static void _sync_done(struct exofs_io_state *ios, void *p) static void _last_io(struct kref *kref) { - struct exofs_io_state *ios = container_of( - kref, struct exofs_io_state, kref); + struct ore_io_state *ios = container_of( + kref, struct ore_io_state, kref); ios->done(ios, ios->private); } static void _done_io(struct osd_request *or, void *p) { - struct exofs_io_state *ios = p; + struct ore_io_state *ios = p; kref_put(&ios->kref, _last_io); } -static int exofs_io_execute(struct exofs_io_state *ios) +static int ore_io_execute(struct ore_io_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); bool sync = (ios->done == NULL); @@ -170,9 +154,9 @@ static int exofs_io_execute(struct exofs_io_state *ios) if (unlikely(!or)) continue; - ret = osd_finalize_request(or, 0, ios->cred, NULL); + ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", + ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); return ret; } @@ -194,7 +178,7 @@ static int exofs_io_execute(struct exofs_io_state *ios) if (sync) { wait_for_completion(&wait); - ret = exofs_check_io(ios, NULL); + ret = ore_check_io(ios, NULL); } return ret; } @@ -214,7 +198,7 @@ static void _clear_bio(struct bio *bio) } } -int exofs_check_io(struct exofs_io_state *ios, u64 *resid) +int ore_check_io(struct ore_io_state *ios, u64 *resid) { enum osd_err_priority acumulated_osd_err = 0; int acumulated_lin_err = 0; @@ -235,7 +219,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { /* start read offset passed endof file */ _clear_bio(ios->per_dev[i].bio); - EXOFS_DBGMSG("start read offset passed end of file " + ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", _LLU(ios->per_dev[i].offset), _LLU(ios->per_dev[i].length)); @@ -259,6 +243,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) return acumulated_lin_err; } +EXPORT_SYMBOL(ore_check_io); /* * L - logical offset into the file @@ -305,20 +290,21 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) struct _striping_info { u64 obj_offset; u64 group_length; + u64 M; /* for truncate */ unsigned dev; unsigned unit_off; }; -static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, +static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, struct _striping_info *si) { - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; + u32 stripe_unit = layout->stripe_unit; + u32 group_width = layout->group_width; + u64 group_depth = layout->group_depth; u32 U = stripe_unit * group_width; u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; + u64 S = T * layout->group_count; u64 M = div64_u64(file_offset, S); /* @@ -333,7 +319,7 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, /* "H - (N * U)" is just "H % U" so it's bound to u32 */ si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; + si->dev *= layout->mirrors_p1; div_u64_rem(file_offset, stripe_unit, &si->unit_off); @@ -341,15 +327,16 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, (M * group_depth * stripe_unit); si->group_length = T - H; + si->M = M; } -static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct exofs_per_dev_state *per_dev, +static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct ore_per_dev_state *per_dev, int cur_len) { unsigned pg = *cur_pg; struct request_queue *q = - osd_request_queue(exofs_ios_od(ios, per_dev->dev)); + osd_request_queue(_ios_od(ios, per_dev->dev)); per_dev->length += cur_len; @@ -361,7 +348,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { - EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", + ORE_DBGMSG("Failed to allocate BIO size=%u\n", bio_size); return -ENOMEM; } @@ -387,7 +374,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, return 0; } -static int _prepare_one_group(struct exofs_io_state *ios, u64 length, +static int _prepare_one_group(struct ore_io_state *ios, u64 length, struct _striping_info *si) { unsigned stripe_unit = ios->layout->stripe_unit; @@ -400,7 +387,7 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length, int ret = 0; while (length) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; + struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -443,7 +430,7 @@ out: return ret; } -static int _prepare_for_striping(struct exofs_io_state *ios) +static int _prepare_for_striping(struct ore_io_state *ios) { u64 length = ios->length; u64 offset = ios->offset; @@ -452,9 +439,9 @@ static int _prepare_for_striping(struct exofs_io_state *ios) if (!ios->pages) { if (ios->kern_buff) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; + struct ore_per_dev_state *per_dev = &ios->per_dev[0]; - _calc_stripe_info(ios, ios->offset, &si); + _calc_stripe_info(ios->layout, ios->offset, &si); per_dev->offset = si.obj_offset; per_dev->dev = si.dev; @@ -468,7 +455,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios) } while (length) { - _calc_stripe_info(ios, offset, &si); + _calc_stripe_info(ios->layout, offset, &si); if (length < si.group_length) si.group_length = length; @@ -485,57 +472,59 @@ out: return ret; } -int exofs_sbi_create(struct exofs_io_state *ios) +int ore_create(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->layout->s_numdevs; i++) { + for (i = 0; i < ios->comps->numdevs; i++) { struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; goto out; } ios->per_dev[i].or = or; ios->numdevs++; - osd_req_create_object(or, &ios->obj); + osd_req_create_object(or, _ios_obj(ios, i)); } - ret = exofs_io_execute(ios); + ret = ore_io_execute(ios); out: return ret; } +EXPORT_SYMBOL(ore_create); -int exofs_sbi_remove(struct exofs_io_state *ios) +int ore_remove(struct ore_io_state *ios) { int i, ret; - for (i = 0; i < ios->layout->s_numdevs; i++) { + for (i = 0; i < ios->comps->numdevs; i++) { struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; goto out; } ios->per_dev[i].or = or; ios->numdevs++; - osd_req_remove_object(or, &ios->obj); + osd_req_remove_object(or, _ios_obj(ios, i)); } - ret = exofs_io_execute(ios); + ret = ore_io_execute(ios); out: return ret; } +EXPORT_SYMBOL(ore_remove); -static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) +static int _write_mirror(struct ore_io_state *ios, int cur_comp) { - struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; unsigned dev = ios->per_dev[cur_comp].dev; unsigned last_comp = cur_comp + ios->layout->mirrors_p1; int ret = 0; @@ -544,12 +533,12 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) return 0; /* Just an empty slot */ for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ORE_ERR("%s: osd_start_request failed\n", __func__); ret = -ENOMEM; goto out; } @@ -563,7 +552,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) bio = bio_kmalloc(GFP_KERNEL, master_dev->bio->bi_max_vecs); if (unlikely(!bio)) { - EXOFS_DBGMSG( + ORE_DBGMSG( "Failed to allocate BIO size=%u\n", master_dev->bio->bi_max_vecs); ret = -ENOMEM; @@ -582,25 +571,29 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) bio->bi_rw |= REQ_WRITE; } - osd_req_write(or, &ios->obj, per_dev->offset, bio, - per_dev->length); - EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " + osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, + bio, per_dev->length); + ORE_DBGMSG("write(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(_ios_obj(ios, dev)->id), + _LLU(per_dev->offset), _LLU(per_dev->length), dev); } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, - ios->kern_buff, ios->length); + ret = osd_req_write_kern(or, _ios_obj(ios, dev), + per_dev->offset, + ios->kern_buff, ios->length); if (unlikely(ret)) goto out; - EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(_ios_obj(ios, dev)->id), + _LLU(per_dev->offset), _LLU(ios->length), dev); } else { - osd_req_set_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->out_attr_len, dev); + osd_req_set_attributes(or, _ios_obj(ios, dev)); + ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(_ios_obj(ios, dev)->id), + ios->out_attr_len, dev); } if (ios->out_attr) @@ -616,7 +609,7 @@ out: return ret; } -int exofs_sbi_write(struct exofs_io_state *ios) +int ore_write(struct ore_io_state *ios) { int i; int ret; @@ -626,52 +619,55 @@ int exofs_sbi_write(struct exofs_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _sbi_write_mirror(ios, i); + ret = _write_mirror(ios, i); if (unlikely(ret)) return ret; } - ret = exofs_io_execute(ios); + ret = ore_io_execute(ios); return ret; } +EXPORT_SYMBOL(ore_write); -static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) +static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) { struct osd_request *or; - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - unsigned first_dev = (unsigned)ios->obj.id; + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_obj_id *obj = _ios_obj(ios, cur_comp); + unsigned first_dev = (unsigned)obj->id; if (ios->pages && !per_dev->length) return 0; /* Just an empty slot */ first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; } per_dev->or = or; if (ios->pages) { - osd_req_read(or, &ios->obj, per_dev->offset, + osd_req_read(or, obj, per_dev->offset, per_dev->bio, per_dev->length); - EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(ios->obj.id), + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d\n", _LLU(obj->id), _LLU(per_dev->offset), _LLU(per_dev->length), first_dev); } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, + int ret = osd_req_read_kern(or, obj, per_dev->offset, ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " "length=0x%llx dev=%d ret=>%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), + _LLU(obj->id), _LLU(per_dev->offset), _LLU(ios->length), first_dev, ret); if (unlikely(ret)) return ret; } else { - osd_req_get_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->in_attr_len, first_dev); + osd_req_get_attributes(or, obj); + ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(obj->id), + ios->in_attr_len, first_dev); } if (ios->out_attr) osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); @@ -682,7 +678,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) return 0; } -int exofs_sbi_read(struct exofs_io_state *ios) +int ore_read(struct ore_io_state *ios) { int i; int ret; @@ -692,16 +688,17 @@ int exofs_sbi_read(struct exofs_io_state *ios) return ret; for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _sbi_read_mirror(ios, i); + ret = _read_mirror(ios, i); if (unlikely(ret)) return ret; } - ret = exofs_io_execute(ios); + ret = ore_io_execute(ios); return ret; } +EXPORT_SYMBOL(ore_read); -int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) +int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) { struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ void *iter = NULL; @@ -721,83 +718,118 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) return -EIO; } +EXPORT_SYMBOL(extract_attr_from_ios); -static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, +static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, struct osd_attr *attr) { int last_comp = cur_comp + ios->layout->mirrors_p1; for (; cur_comp < last_comp; ++cur_comp) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; struct osd_request *or; - or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); + or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); + ORE_ERR("%s: osd_start_request failed\n", __func__); return -ENOMEM; } per_dev->or = or; - osd_req_set_attributes(or, &ios->obj); + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); osd_req_add_set_attr_list(or, attr, 1); } return 0; } -int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) +struct _trunc_info { + struct _striping_info si; + u64 prev_group_obj_off; + u64 next_group_obj_off; + + unsigned first_group_dev; + unsigned nex_group_dev; + unsigned max_devs; +}; + +void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) +{ + unsigned stripe_unit = layout->stripe_unit; + + _calc_stripe_info(layout, file_offset, &ti->si); + + ti->prev_group_obj_off = ti->si.M * stripe_unit; + ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; + + ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); + ti->nex_group_dev = ti->first_group_dev + layout->group_width; + ti->max_devs = layout->group_width * layout->group_count; +} + +int ore_truncate(struct ore_layout *layout, struct ore_components *comps, + u64 size) { - struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct exofs_trunc_attr { struct osd_attr attr; __be64 newsize; } *size_attrs; - struct _striping_info si; + struct _trunc_info ti; int i, ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(layout, comps, &ios); if (unlikely(ret)) return ret; - size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), + _calc_trunk_info(ios->layout, size, &ti); + + size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), GFP_KERNEL); if (unlikely(!size_attrs)) { ret = -ENOMEM; goto out; } - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; + ios->numdevs = ios->comps->numdevs; - ios->numdevs = ios->layout->s_numdevs; - _calc_stripe_info(ios, size, &si); - - for (i = 0; i < ios->layout->group_width; ++i) { + for (i = 0; i < ti.max_devs; ++i) { struct exofs_trunc_attr *size_attr = &size_attrs[i]; u64 obj_size; - if (i < si.dev) - obj_size = si.obj_offset + - ios->layout->stripe_unit - si.unit_off; - else if (i == si.dev) - obj_size = si.obj_offset; - else /* i > si.dev */ - obj_size = si.obj_offset - si.unit_off; + if (i < ti.first_group_dev) + obj_size = ti.prev_group_obj_off; + else if (i >= ti.nex_group_dev) + obj_size = ti.next_group_obj_off; + else if (i < ti.si.dev) /* dev within this group */ + obj_size = ti.si.obj_offset + + ios->layout->stripe_unit - ti.si.unit_off; + else if (i == ti.si.dev) + obj_size = ti.si.obj_offset; + else /* i > ti.dev */ + obj_size = ti.si.obj_offset - ti.si.unit_off; size_attr->newsize = cpu_to_be64(obj_size); size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; + ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + _LLU(comps->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); if (unlikely(ret)) goto out; } - ret = exofs_io_execute(ios); + ret = ore_io_execute(ios); out: kfree(size_attrs); - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } +EXPORT_SYMBOL(ore_truncate); + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); +EXPORT_SYMBOL(g_attr_logical_length); diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h deleted file mode 100644 index c52e988..0000000 --- a/fs/exofs/pnfs.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - */ - -/* FIXME: Remove this file once pnfs hits mainline */ - -#ifndef __EXOFS_PNFS_H__ -#define __EXOFS_PNFS_H__ - -#if ! defined(__PNFS_OSD_XDR_H__) - -enum pnfs_iomode { - IOMODE_READ = 1, - IOMODE_RW = 2, - IOMODE_ANY = 3, -}; - -/* Layout Structure */ -enum pnfs_osd_raid_algorithm4 { - PNFS_OSD_RAID_0 = 1, - PNFS_OSD_RAID_4 = 2, - PNFS_OSD_RAID_5 = 3, - PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ -}; - -struct pnfs_osd_data_map { - u32 odm_num_comps; - u64 odm_stripe_unit; - u32 odm_group_width; - u32 odm_group_depth; - u32 odm_mirror_cnt; - u32 odm_raid_algorithm; -}; - -#endif /* ! defined(__PNFS_OSD_XDR_H__) */ - -#endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index c57bedd..2748940 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -40,6 +40,8 @@ #include "exofs.h" +#define EXOFS_DBGMSG2(M...) do {} while (0) + /****************************************************************************** * MOUNT OPTIONS *****************************************************************************/ @@ -208,10 +210,48 @@ static void destroy_inodecache(void) } /****************************************************************************** - * SUPERBLOCK FUNCTIONS + * Some osd helpers *****************************************************************************/ -static const struct super_operations exofs_sops; -static const struct export_operations exofs_export_ops; +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%p ret=>%d\n", + _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); + return ret; +} static const struct osd_attr g_attr_sb_stats = ATTR_DEF( EXOFS_APAGE_SB_DATA, @@ -223,21 +263,19 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) struct osd_attr attrs[] = { [0] = g_attr_sb_stats, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - ios->cred = sbi->s_cred; - ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) { EXOFS_ERR("Error reading super_block stats => %d\n", ret); goto out; @@ -264,13 +302,13 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) } out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } -static void stats_done(struct exofs_io_state *ios, void *p) +static void stats_done(struct ore_io_state *ios, void *p) { - exofs_put_io_state(ios); + ore_put_io_state(ios); /* Good thanks nothing to do anymore */ } @@ -280,12 +318,12 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) struct osd_attr attrs[] = { [0] = g_attr_sb_stats, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } @@ -293,21 +331,27 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); attrs[0].val_ptr = &sbi->s_ess; - ios->cred = sbi->s_cred; + ios->done = stats_done; ios->private = sbi; ios->out_attr = attrs; ios->out_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_write(ios); + ret = ore_write(ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); - exofs_put_io_state(ios); + EXOFS_ERR("%s: ore_write failed.\n", __func__); + ore_put_io_state(ios); } return ret; } +/****************************************************************************** + * SUPERBLOCK FUNCTIONS + *****************************************************************************/ +static const struct super_operations exofs_sops; +static const struct export_operations exofs_export_ops; + /* * Write the superblock to the OSD */ @@ -315,7 +359,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; - struct exofs_io_state *ios; + struct ore_comp one_comp; + struct ore_components comps; + struct ore_io_state *ios; int ret = -ENOMEM; fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); @@ -331,7 +377,10 @@ int exofs_sync_fs(struct super_block *sb, int wait) * version). Otherwise the exofs_fscb is read-only from mkfs time. All * the writeable info is set in exofs_sbi_write_stats() above. */ - ret = exofs_get_io_state(&sbi->layout, &ios); + + exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); + + ret = ore_get_io_state(&sbi->layout, &comps, &ios); if (unlikely(ret)) goto out; @@ -345,14 +394,12 @@ int exofs_sync_fs(struct super_block *sb, int wait) fscb->s_newfs = 0; fscb->s_version = EXOFS_FSCB_VER; - ios->obj.id = EXOFS_SUPER_ID; ios->offset = 0; ios->kern_buff = fscb; - ios->cred = sbi->s_cred; - ret = exofs_sbi_write(ios); + ret = ore_write(ios); if (unlikely(ret)) - EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); + EXOFS_ERR("%s: ore_write failed.\n", __func__); else sb->s_dirt = 0; @@ -360,7 +407,7 @@ int exofs_sync_fs(struct super_block *sb, int wait) unlock_super(sb); out: EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); - exofs_put_io_state(ios); + ore_put_io_state(ios); kfree(fscb); return ret; } @@ -384,15 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path, void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->layout.s_numdevs) { - int i = --sbi->layout.s_numdevs; - struct osd_dev *od = sbi->layout.s_ods[i]; + while (sbi->comps.numdevs) { + int i = --sbi->comps.numdevs; + struct osd_dev *od = sbi->comps.ods[i]; if (od) { - sbi->layout.s_ods[i] = NULL; + sbi->comps.ods[i] = NULL; osduld_put_device(od); } } + if (sbi->comps.ods != sbi->_min_one_dev) + kfree(sbi->comps.ods); kfree(sbi); } @@ -419,8 +468,8 @@ static void exofs_put_super(struct super_block *sb) msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], - sbi->layout.s_pid); + _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], + sbi->one_comp.obj.partition); bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); @@ -501,10 +550,19 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, return -EINVAL; } + EXOFS_DBGMSG("exofs: layout: " + "num_comps=%u stripe_unit=0x%x group_width=%u " + "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", + numdevs, + sbi->layout.stripe_unit, + sbi->layout.group_width, + _LLU(sbi->layout.group_depth), + sbi->layout.mirrors_p1, + sbi->data_map.odm_raid_algorithm); return 0; } -static unsigned __ra_pages(struct exofs_layout *layout) +static unsigned __ra_pages(struct ore_layout *layout) { const unsigned _MIN_RA = 32; /* min 128K read-ahead */ unsigned ra_pages = layout->group_width * layout->stripe_unit / @@ -547,13 +605,11 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, +static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, + struct osd_dev *fscb_od, unsigned table_count) { - struct exofs_sb_info *sbi = *psbi; - struct osd_dev *fscb_od; - struct osd_obj_id obj = {.partition = sbi->layout.s_pid, - .id = EXOFS_DEVTABLE_ID}; + struct ore_comp comp; struct exofs_device_table *dt; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); @@ -567,10 +623,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, return -ENOMEM; } - fscb_od = sbi->layout.s_ods[0]; - sbi->layout.s_ods[0] = NULL; - sbi->layout.s_numdevs = 0; - ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + sbi->comps.numdevs = 0; + + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_DEVTABLE_ID; + exofs_make_credential(comp.cred, &comp.obj); + + ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, + table_bytes); if (unlikely(ret)) { EXOFS_ERR("ERROR: reading device table\n"); goto out; @@ -588,16 +648,18 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); + unsigned size = numdevs * sizeof(sbi->comps.ods[0]); - sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); - if (unlikely(!sbi)) { + /* Twice bigger table: See exofs_init_comps() and below + * comment + */ + sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); + if (unlikely(!sbi->comps.ods)) { + EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", + numdevs); ret = -ENOMEM; goto out; } - memset(&sbi->layout.s_ods[1], 0, - size - sizeof(sbi->layout.s_ods[0])); - *psbi = sbi; } for (i = 0; i < numdevs; i++) { @@ -619,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->layout.s_ods[i] = fscb_od; - ++sbi->layout.s_numdevs; + sbi->comps.ods[i] = fscb_od; + ++sbi->comps.numdevs; fscb_od = NULL; continue; } @@ -633,13 +695,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, goto out; } - sbi->layout.s_ods[i] = od; - ++sbi->layout.s_numdevs; + sbi->comps.ods[i] = od; + ++sbi->comps.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. */ - ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); if (unlikely(ret)) { EXOFS_ERR("ERROR: Malformed participating device " @@ -656,13 +718,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, out: kfree(dt); - if (unlikely(!ret && fscb_od)) { - EXOFS_ERR( - "ERROR: Bad device-table container device not present\n"); - osduld_put_device(fscb_od); - ret = -EINVAL; - } + if (likely(!ret)) { + unsigned numdevs = sbi->comps.numdevs; + if (unlikely(fscb_od)) { + EXOFS_ERR("ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + return -EINVAL; + } + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + for (i = 0; i < numdevs - 1; ++i) + sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; + } return ret; } @@ -676,7 +747,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) struct exofs_sb_info *sbi; /*extended info */ struct osd_dev *od; /* Master device */ struct exofs_fscb fscb; /*on-disk superblock info */ - struct osd_obj_id obj; + struct ore_comp comp; unsigned table_count; int ret; @@ -684,10 +755,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); - if (ret) - goto free_bdi; - /* use mount options to fill superblock */ if (opts->is_osdname) { struct osd_dev_info odi = {.systemid_len = 0}; @@ -695,6 +762,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) odi.osdname_len = strlen(opts->dev_name); odi.osdname = (u8 *)opts->dev_name; od = osduld_info_lookup(&odi); + kfree(opts->dev_name); + opts->dev_name = NULL; } else { od = osduld_path_lookup(opts->dev_name); } @@ -709,11 +778,16 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->layout.group_width = 1; sbi->layout.group_depth = -1; sbi->layout.group_count = 1; - sbi->layout.s_ods[0] = od; - sbi->layout.s_numdevs = 1; - sbi->layout.s_pid = opts->pid; sbi->s_timeout = opts->timeout; + sbi->one_comp.obj.partition = opts->pid; + sbi->one_comp.obj.id = 0; + exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); + sbi->comps.numdevs = 1; + sbi->comps.single_comp = EC_SINGLE_COMP; + sbi->comps.comps = &sbi->one_comp; + sbi->comps.ods = sbi->_min_one_dev; + /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); strcpy(sb->s_id, "exofs"); @@ -724,11 +798,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sb->s_bdev = NULL; sb->s_dev = 0; - obj.partition = sbi->layout.s_pid; - obj.id = EXOFS_SUPER_ID; - exofs_make_credential(sbi->s_cred, &obj); + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_SUPER_ID; + exofs_make_credential(comp.cred, &comp.obj); - ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); if (unlikely(ret)) goto free_sbi; @@ -757,9 +831,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) table_count = le64_to_cpu(fscb.s_dev_table_count); if (table_count) { - ret = exofs_read_lookup_dev_table(&sbi, table_count); + ret = exofs_read_lookup_dev_table(sbi, od, table_count); if (unlikely(ret)) goto free_sbi; + } else { + sbi->comps.ods[0] = od; } __sbi_read_stats(sbi); @@ -793,20 +869,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], - sbi->layout.s_pid); - if (opts->is_osdname) - kfree(opts->dev_name); + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) { + EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + goto free_sbi; + } + + _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], + sbi->one_comp.obj.partition); return 0; free_sbi: - bdi_destroy(&sbi->bdi); -free_bdi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->layout.s_pid, ret); + opts->dev_name, sbi->one_comp.obj.partition, ret); exofs_free_sbi(sbi); - if (opts->is_osdname) - kfree(opts->dev_name); return ret; } @@ -837,7 +913,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct osd_attr attrs[] = { ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), @@ -846,21 +922,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) }; uint64_t capacity = ULLONG_MAX; uint64_t used = ULLONG_MAX; - uint8_t cred_a[OSD_CAP_LEN]; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); if (ret) { - EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; } - exofs_make_credential(cred_a, &ios->obj); - ios->cred = sbi->s_cred; ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) goto out; @@ -889,7 +962,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EXOFS_NAME_LEN; out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 52c0537..35d6a3c 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); if (error == 0) @@ -253,16 +251,14 @@ ext2_init_acl(struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) { /* This is an extended ACL */ error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 5c0a6a4..503bfb0 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -61,7 +61,6 @@ extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> #define ext2_get_acl NULL -#define ext2_get_acl NULL #define ext2_set_acl NULL static inline int diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 5299706..d27b71f 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -161,6 +161,10 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + name_len = strlen(name); + if (name_len > 255) + return -ERANGE; + down_read(&EXT2_I(inode)->xattr_sem); error = -ENODATA; if (!EXT2_I(inode)->i_file_acl) @@ -181,12 +185,8 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", error = -EIO; goto cleanup; } - /* find named attribute */ - name_len = strlen(name); - error = -ERANGE; - if (name_len > 255) - goto cleanup; + /* find named attribute */ entry = FIRST_ENTRY(bh); while (!IS_LAST_ENTRY(entry)) { struct ext2_xattr_entry *next = diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 6c29bf0..3091f62 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME_SEC; ext3_mark_inode_dirty(handle, inode); if (error == 0) @@ -261,19 +259,16 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - mode_t mode = inode->i_mode; - if (S_ISDIR(inode->i_mode)) { error = ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &mode); + error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) { /* This is an extended ACL */ error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index fe52297..6386d76 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -21,6 +21,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <trace/events/ext3.h> /* * balloc.c contains the blocks allocation and deallocation routines @@ -161,6 +162,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) desc = ext3_get_group_desc(sb, block_group, NULL); if (!desc) return NULL; + trace_ext3_read_block_bitmap(sb, block_group); bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { @@ -351,6 +353,7 @@ void ext3_rsv_window_add(struct super_block *sb, struct rb_node * parent = NULL; struct ext3_reserve_window_node *this; + trace_ext3_rsv_window_add(sb, rsv); while (*p) { parent = *p; @@ -476,8 +479,10 @@ void ext3_discard_reservation(struct inode *inode) rsv = &block_i->rsv_window_node; if (!rsv_is_empty(&rsv->rsv_window)) { spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) + if (!rsv_is_empty(&rsv->rsv_window)) { + trace_ext3_discard_reservation(inode, rsv); rsv_window_remove(inode->i_sb, rsv); + } spin_unlock(rsv_lock); } } @@ -683,14 +688,10 @@ error_return: void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { - struct super_block * sb; + struct super_block *sb = inode->i_sb; unsigned long dquot_freed_blocks; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + trace_ext3_free_blocks(inode, block, count); ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) dquot_free_block(inode, dquot_freed_blocks); @@ -1136,6 +1137,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, else start_block = grp_goal + group_first_block; + trace_ext3_alloc_new_reservation(sb, start_block); size = my_rsv->rsv_goal_size; if (!rsv_is_empty(&my_rsv->rsv_window)) { @@ -1230,8 +1232,11 @@ retry: * check if the first free block is within the * free space we just reserved */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) + if (start_block >= my_rsv->rsv_start && + start_block <= my_rsv->rsv_end) { + trace_ext3_reserved(sb, start_block, my_rsv); return 0; /* success */ + } /* * if the first free bit we found is out of the reservable space * continue search for next reservable space, @@ -1514,10 +1519,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, *errp = -ENOSPC; sb = inode->i_sb; - if (!sb) { - printk("ext3_new_block: nonexistent device"); - return 0; - } /* * Check quota for allocation of this block. @@ -1528,8 +1529,10 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, return 0; } + trace_ext3_request_blocks(inode, goal, num); + sbi = EXT3_SB(sb); - es = EXT3_SB(sb)->s_es; + es = sbi->s_es; ext3_debug("goal=%lu.\n", goal); /* * Allocate a block from reservation only when @@ -1742,6 +1745,10 @@ allocated: brelse(bitmap_bh); dquot_free_block(inode, *count-num); *count = num; + + trace_ext3_allocate_blocks(inode, goal, num, + (unsigned long long)ret_block); + return ret_block; io_error: @@ -1996,6 +2003,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group, if ((next - start) < minblocks) goto free_extent; + trace_ext3_discard_blocks(sb, discard_block, next - start); /* Send the TRIM command down to the device */ err = sb_issue_discard(sb, discard_block, next - start, GFP_NOFS, 0); @@ -2100,7 +2108,7 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb))) return -EINVAL; if (start >= max_blks) - goto out; + return -EINVAL; if (start + len > max_blks) len = max_blks - start; @@ -2148,8 +2156,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) if (ret >= 0) ret = 0; - -out: range->len = trimmed * sb->s_blocksize; return ret; diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 2be5b99..724df69 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -71,7 +71,6 @@ const struct file_operations ext3_file_operations = { }; const struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 0bcf63a..d494c55 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -30,6 +30,7 @@ #include <linux/jbd.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> +#include <trace/events/ext3.h> /* * akpm: A new design for ext3_sync_file(). @@ -51,12 +52,14 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret, needs_barrier = 0; tid_t commit_tid; + trace_ext3_sync_file_enter(file, datasync); + if (inode->i_sb->s_flags & MS_RDONLY) return 0; ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) - return ret; + goto out; /* * Taking the mutex here just to keep consistent with how fsync was @@ -83,7 +86,8 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (ext3_should_journal_data(inode)) { mutex_unlock(&inode->i_mutex); - return ext3_force_commit(inode->i_sb); + ret = ext3_force_commit(inode->i_sb); + goto out; } if (datasync) @@ -104,6 +108,9 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); +out: + trace_ext3_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index bfc2dc4..bf09cbf 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -23,6 +23,7 @@ #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> +#include <trace/events/ext3.h> #include <asm/byteorder.h> @@ -118,6 +119,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) ino = inode->i_ino; ext3_debug ("freeing inode %lu\n", ino); + trace_ext3_free_inode(inode); is_directory = S_ISDIR(inode->i_mode); @@ -426,6 +428,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_ext3_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -601,6 +604,7 @@ got: } ext3_debug("allocating inode %lu\n", inode->i_ino); + trace_ext3_allocate_inode(inode, dir, mode); goto really_out; fail: ext3_std_error(sb, err); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2978a2a..04da6ac 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -38,10 +38,12 @@ #include <linux/bio.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <trace/events/ext3.h> #include "xattr.h" #include "acl.h" static int ext3_writepage_trans_blocks(struct inode *inode); +static int ext3_block_truncate_page(struct inode *inode, loff_t from); /* * Test whether an inode is a fast symlink. @@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, might_sleep(); + trace_ext3_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " @@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode) */ void ext3_evict_inode (struct inode *inode) { + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_block_alloc_info *rsv; handle_t *handle; int want_delete = 0; + trace_ext3_evict_inode(inode); if (!inode->i_nlink && !is_bad_inode(inode)) { dquot_initialize(inode); want_delete = 1; } + /* + * When journalling data dirty buffers are tracked only in the journal. + * So although mm thinks everything is clean and ready for reaping the + * inode might still have some pages to write in the running + * transaction or waiting to be checkpointed. Thus calling + * journal_invalidatepage() (via truncate_inode_pages()) to discard + * these buffers can cause data loss. Also even if we did not discard + * these buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to read + * them before the transaction is checkpointed. So be careful and + * force everything to disk here... We use ei->i_datasync_tid to + * store the newest transaction containing inode's data. + * + * Note that directories do not have this problem because they don't + * use page cache. + */ + if (inode->i_nlink && ext3_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + tid_t commit_tid = atomic_read(&ei->i_datasync_tid); + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; + + log_start_commit(journal, commit_tid); + log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); ext3_discard_reservation(inode); - rsv = EXT3_I(inode)->i_block_alloc_info; - EXT3_I(inode)->i_block_alloc_info = NULL; + rsv = ei->i_block_alloc_info; + ei->i_block_alloc_info = NULL; if (unlikely(rsv)) kfree(rsv); @@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode) if (inode->i_blocks) ext3_truncate(inode); /* - * Kill off the orphan record which ext3_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext3_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext3_truncate() actually created an orphan record. - * (Well, we could do this if we need to, but heck - it works) + * Kill off the orphan record created when the inode lost the last + * link. Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - ext3_truncate() could + * have removed the record. */ ext3_orphan_del(handle, inode); - EXT3_I(inode)->i_dtime = get_seconds(); + ei->i_dtime = get_seconds(); /* * One subtle ordering requirement: if anything has gone wrong @@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, ext3_fsblk_t first_block = 0; + trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!create || err == -EIO) goto cleanup; + /* + * Block out ext3_truncate while we alter the tree + */ mutex_lock(&ei->truncate_mutex); /* @@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, */ count = ext3_blks_to_allocate(partial, indirect_blks, maxblocks, blocks_to_boundary); - /* - * Block out ext3_truncate while we alter the tree - */ err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal, offsets + (partial - chain), partial); @@ -970,6 +999,9 @@ cleanup: } BUFFER_TRACE(bh_result, "returned"); out: + trace_ext3_get_blocks_exit(inode, iblock, + depth ? le32_to_cpu(chain[depth-1].key) : 0, + count, err); return err; } @@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode) ext3_truncate(inode); } +/* + * Truncate blocks that were not used by direct IO write. We have to zero out + * the last file block as well because direct IO might have written to it. + */ +static void ext3_truncate_failed_direct_write(struct inode *inode) +{ + ext3_block_truncate_page(inode, inode->i_size); + ext3_truncate(inode); +} + static int ext3_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ int needed_blocks = ext3_writepage_trans_blocks(inode) + 1; + trace_ext3_write_begin(inode, pos, len, flags); + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file, unsigned from, to; int ret = 0, ret2; + trace_ext3_ordered_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file, struct inode *inode = file->f_mapping->host; int ret; + trace_ext3_writeback_write_end(inode, pos, len, copied); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); update_file_sizes(inode, pos, copied); /* @@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file, { handle_t *handle = ext3_journal_current_handle(); struct inode *inode = mapping->host; + struct ext3_inode_info *ei = EXT3_I(inode); int ret = 0, ret2; int partial = 0; unsigned from, to; + trace_ext3_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file, if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_set_inode_state(inode, EXT3_STATE_JDATA); - if (inode->i_size > EXT3_I(inode)->i_disksize) { - EXT3_I(inode)->i_disksize = inode->i_size; + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); + if (inode->i_size > ei->i_disksize) { + ei->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_ordered_writepage(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + trace_ext3_writeback_writepage(page); if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { @@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page, if (ext3_journal_current_handle()) goto no_write; + trace_ext3_journalled_writepage(page); handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page, if (ret == 0) ret = err; ext3_set_inode_state(inode, EXT3_STATE_JDATA); + atomic_set(&EXT3_I(inode)->i_datasync_tid, + handle->h_transaction->t_tid); unlock_page(page); } else { /* @@ -1739,6 +1793,7 @@ out_unlock: static int ext3_readpage(struct file *file, struct page *page) { + trace_ext3_readpage(page); return mpage_readpage(page, ext3_get_block); } @@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ @@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT3_JOURNAL(page->mapping->host); + trace_ext3_releasepage(page); WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -1794,6 +1852,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_length(iov, nr_segs); int retries = 0; + trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + if (rw == WRITE) { loff_t final_size = offset + count; @@ -1827,7 +1887,7 @@ retry: loff_t end = offset + iov_length(iov, nr_segs); if (end > isize) - vmtruncate(inode, isize); + ext3_truncate_failed_direct_write(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1841,7 +1901,7 @@ retry: /* This is really bad luck. We've written the data * but cannot extend i_size. Truncate allocated blocks * and pretend the write failed... */ - ext3_truncate(inode); + ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); goto out; } @@ -1867,6 +1927,8 @@ retry: ret = err; } out: + trace_ext3_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); return ret; } @@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, - struct address_space *mapping, loff_t from) +static int ext3_block_truncate_page(struct inode *inode, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned offset = from & (PAGE_CACHE_SIZE - 1); unsigned blocksize, iblock, length, pos; - struct inode *inode = mapping->host; + struct page *page; + handle_t *handle = NULL; struct buffer_head *bh; int err = 0; + /* Truncated on block boundary - nothing to do */ blocksize = inode->i_sb->s_blocksize; + if ((from & (blocksize - 1)) == 0) + return 0; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return -ENOMEM; length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } + /* data=writeback mode doesn't need transaction to zero-out data */ + if (!ext3_should_writeback_data(inode)) { + /* We journal at most one block */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { + clear_highpage(page); + flush_dcache_page(page); + err = PTR_ERR(handle); + goto unlock; + } + } + if (ext3_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext3_journal_get_write_access(handle, bh); if (err) - goto unlock; + goto stop; } zero_user(page, offset, length); @@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, err = ext3_journal_dirty_data(handle, bh); mark_buffer_dirty(bh); } +stop: + if (handle) + ext3_journal_stop(handle); unlock: unlock_page(page); @@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, int ext3_can_truncate(struct inode *inode) { - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return 0; if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) @@ -2435,7 +2517,6 @@ void ext3_truncate(struct inode *inode) struct ext3_inode_info *ei = EXT3_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode) int n; long last_block; unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; + + trace_ext3_truncate_enter(inode); if (!ext3_can_truncate(inode)) goto out_notrans; @@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode) if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); - /* - * We have to lock the EOF page here, because lock_page() nests - * outside journal_start(). - */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - goto out_notrans; - } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } + if (IS_ERR(handle)) goto out_notrans; - } last_block = (inode->i_size + blocksize-1) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); - - if (page) - ext3_block_truncate_page(handle, page, mapping, inode->i_size); - n = ext3_block_to_path(inode, last_block, offsets, NULL); if (n == 0) goto out_stop; /* error */ @@ -2596,6 +2653,7 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + trace_ext3_truncate_exit(inode); return; out_notrans: /* @@ -2604,6 +2662,7 @@ out_notrans: */ if (inode->i_nlink) ext3_orphan_del(NULL, inode); + trace_ext3_truncate_exit(inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -2745,6 +2804,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext3_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) } error = ext3_orphan_add(handle, inode); + if (error) { + ext3_journal_stop(handle); + goto err_out; + } EXT3_I(inode)->i_disksize = attr->ia_size; - rc = ext3_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); + if (error) { + /* Some hard fs error must have happened. Bail out. */ + ext3_orphan_del(NULL, inode); + goto err_out; + } + rc = ext3_block_truncate_page(inode, attr->ia_size); + if (rc) { + /* Cleanup orphan list and exit */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext3_orphan_del(NULL, inode); + goto err_out; + } + ext3_orphan_del(handle, inode); + ext3_journal_stop(handle); + goto err_out; + } } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - rc = vmtruncate(inode, attr->ia_size); - if (rc) - goto err_out; + truncate_setsize(inode, attr->ia_size); + ext3_truncate(inode); } setattr_copy(inode, attr); @@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) int err; might_sleep(); + trace_ext3_mark_inode_dirty(inode, _RET_IP_); err = ext3_reserve_inode_write(handle, inode, &iloc); if (!err) err = ext3_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index f4090bd..c7f4394 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -285,7 +285,7 @@ group_add_out: if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&range, (struct fstrim_range *)arg, + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -293,7 +293,7 @@ group_add_out: if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 3b57230..5571708 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -36,6 +36,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include <trace/events/ext3.h> #include "namei.h" #include "xattr.h" @@ -287,7 +288,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent while (len--) printk("%c", *name++); ext3fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); names++; @@ -1013,7 +1014,7 @@ static struct buffer_head * ext3_dx_find_entry(struct inode *dir, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk("%s not found\n", entry->name)); dx_release (frames); return NULL; } @@ -2140,6 +2141,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) struct ext3_dir_entry_2 * de; handle_t *handle; + trace_ext3_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2185,6 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext3_journal_stop(handle); brelse (bh); + trace_ext3_unlink_exit(dentry, retval); return retval; } @@ -2206,9 +2209,11 @@ static int ext3_symlink (struct inode * dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b57ea2f..7beb69a 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,9 @@ #include "acl.h" #include "namei.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ext3.h> + #ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA #else @@ -497,6 +500,14 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext3_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext3_drop_inode(inode, drop); + return drop; +} + static void ext3_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -788,6 +799,7 @@ static const struct super_operations ext3_sops = { .destroy_inode = ext3_destroy_inode, .write_inode = ext3_write_inode, .dirty_inode = ext3_dirty_inode, + .drop_inode = ext3_drop_inode, .evict_inode = ext3_evict_inode, .put_super = ext3_put_super, .sync_fs = ext3_sync_fs, @@ -2509,6 +2521,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_ext3_sync_fs(sb, wait); if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 32e6cc2..d565759 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -803,8 +803,16 @@ inserted: /* We need to allocate a new block */ ext3_fsblk_t goal = ext3_group_first_block_no(sb, EXT3_I(inode)->i_block_group); - ext3_fsblk_t block = ext3_new_block(handle, inode, - goal, &error); + ext3_fsblk_t block; + + /* + * Protect us agaist concurrent allocations to the + * same inode from ext3_..._writepage(). Reservation + * code does not expect racing allocations. + */ + mutex_lock(&EXT3_I(inode)->truncate_mutex); + block = ext3_new_block(handle, inode, goal, &error); + mutex_unlock(&EXT3_I(inode)->truncate_mutex); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 04109460..56fd8f86 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o + mmp.o indirect.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dca2d1d..a5c29bb 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); if (error == 0) @@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - mode_t mode = inode->i_mode; - if (S_ISDIR(inode->i_mode)) { error = ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, acl); if (error) goto cleanup; } - error = posix_acl_create(&acl, GFP_NOFS, &mode); + error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) { /* This is an extended ACL */ error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f694..f8224ad 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } +/** + * ext4_inode_to_goal_block - return a hint for block allocation + * @inode: inode for block allocation + * + * Return the ideal location to start allocating blocks for a + * newly created inode. + */ +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_group_t block_group; + ext4_grpblk_t colour; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. + */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fac90f3..8efb2f0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, return 1; } +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fa44df8..b7d7bd0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -175,6 +175,7 @@ struct mpage_da_data { */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_ERROR 0x0002 +#define EXT4_IO_END_QUEUED 0x0004 struct ext4_io_page { struct page *p_page; @@ -526,6 +527,7 @@ struct ext4_new_group_data { #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 /* * ioctl commands @@ -939,6 +941,8 @@ struct ext4_inode_info { #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le +extern void ext4_set_bits(void *bm, int cur, int len); + /* * Maximal mount counts between two filesystem checks */ @@ -1126,7 +1130,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; - struct mutex s_resize_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -1214,6 +1219,9 @@ struct ext4_sb_info { /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1743,6 +1751,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc); #define ext4_free_blocks_after_init(sb, group, desc) \ ext4_init_block_bitmap(sb, NULL, group, desc) +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, @@ -1793,7 +1802,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, unsigned long count, int flags); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); @@ -1834,6 +1843,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern void ext4_ind_truncate(struct inode *inode); + /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -1855,6 +1875,9 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); extern void __ext4_error(struct super_block *, const char *, unsigned int, const char *, ...) __attribute__ ((format (printf, 4, 5))); @@ -2067,11 +2090,19 @@ struct ext4_group_info { * 5 free 8-block regions. */ }; -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 @@ -2123,6 +2154,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb) } /* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* * Inodes and files operations */ @@ -2151,6 +2195,8 @@ extern void ext4_exit_system_zone(void); extern int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); @@ -2230,6 +2276,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index bb85757..5802fa1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { - if (!S_ISREG(inode->i_mode)) - return 0; if (EXT4_JOURNAL(inode) == NULL) return 1; + if (!S_ISREG(inode->i_mode)) + return 0; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f815cc8..57cf568 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour + block; + return ext4_inode_to_goal_block(inode); } /* @@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, logical, le32_to_cpu(curp->p_idx->ei_block)); return -EIO; } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max))) { - EXT4_ERROR_INODE(inode, - "logical %d == ei_block %d!", - logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; - } if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EIO; @@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, - struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; @@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, goto merge; } -repeat: depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1765,9 +1731,10 @@ repeat: /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); - next = ext4_ext_next_leaf_block(inode, path); - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCKS) { + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); @@ -1779,7 +1746,7 @@ repeat: ext_debug("next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; - goto repeat; + goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -1839,7 +1806,7 @@ has_space: ext4_ext_pblock(newext), ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); + nearex, len, nearex, nearex + 1); memmove(nearex + 1, nearex, len); path[depth].p_ext = nearex; } @@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, } /* - * ext4_ext_in_cache() + * ext4_ext_check_cache() * Checks to see if the given block is in the cache. * If it is, the cached extent is stored in the given * cache extent pointer. If the cached extent is a hole, @@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, /* * ext4_ext_rm_idx: * removes index from the index block. - * It's used in truncate case only, thus all requests are for - * last index in the block only. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) @@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path); if (err) return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) @@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); @@ -2575,7 +2546,7 @@ again: if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, - start, end); + start, EXT_MAX_BLOCKS - 1); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct ext4_ext_path *path) { struct ext4_extent *ex; - struct ext4_extent_header *eh; int depth; int err = 0; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" @@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && - ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { + if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && + ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* @@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_ext_mark_uninitialized(ex); - err = ext4_ext_remove_space(inode, map->m_lblk, - map->m_lblk + punched_out); + ext4_ext_invalidate_cache(inode); + + err = ext4_ext_rm_leaf(handle, inode, path, + map->m_lblk, map->m_lblk + punched_out); + + if (!err && path->p_hdr->eh_entries == 0) { + /* + * Punch hole freed all of this sub tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root( + inode, 0)); + + err = ext4_ext_dirty( + handle, inode, path); + } + } goto out2; } @@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); - if (err) - goto out2; - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); if (err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_ext_get_actual_len(&newex), fb_flags); goto out2; } @@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + err = ext4_ext_remove_space(inode, last_block); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3835,7 +3824,7 @@ retry: blkbits) >> blkbits)) new_size = offset + len; else - new_size = (map.m_lblk + ret) << blkbits; + new_size = ((loff_t) map.m_lblk + ret) << blkbits; ext4_falloc_update_inode(inode, mode, new_size, (map.m_flags & EXT4_MAP_NEW)); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index da3bed3..036f78f 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode) { struct writeback_control wbc; struct dentry *dentry = NULL; + struct inode *next; int ret = 0; - while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = list_entry(inode->i_dentry.next, - struct dentry, d_alias); - if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) + dentry = NULL; + spin_lock(&inode->i_lock); + if (!list_empty(&inode->i_dentry)) { + dentry = list_first_entry(&inode->i_dentry, + struct dentry, d_alias); + dget(dentry); + } + spin_unlock(&inode->i_lock); + if (!dentry) break; - inode = dentry->d_parent->d_inode; + next = igrab(dentry->d_parent->d_inode); + dput(dentry); + if (!next) + break; + iput(inode); + inode = next; ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } + iput(inode); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21bb2f6..9c63f27 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; - goto out; + goto err_out; } blk = ext4_inode_table(sb, gdp) + used_blks; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 0000000..0962642 --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1487 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include <linux/module.h> +#include "ext4_jbd2.h" +#include "truncate.h" + +#include <trace/events/ext4.h> + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + } else { + ret = blockdev_direct_IO(rw, iocb, inode, iov, + offset, nr_segs, ext4_get_block); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. + */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 678cde8..18d2558 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -12,10 +12,6 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * @@ -47,6 +43,7 @@ #include "xattr.h" #include "acl.h" #include "ext4_extents.h" +#include "truncate.h" #include <trace/events/ext4.h> @@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) -{ - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; - - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* * Restart the transaction associated with *handle. This does a commit, * so before we call here everything must be consistently dirtied against * this transaction. @@ -189,7 +120,37 @@ void ext4_evict_inode(struct inode *inode) int err; trace_ext4_evict_inode(inode); + + ext4_ioend_wait(inode); + if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_log_start_commit(journal, commit_tid); + jbd2_log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } truncate_inode_pages(&inode->i_data, 0); goto no_delete; } @@ -204,7 +165,7 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; - handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -277,793 +238,6 @@ no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ((i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -static int __ext4_check_blockref(const char *function, unsigned int line, - struct inode *inode, - __le32 *p, unsigned int max) -{ - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - __le32 *bref = p; - unsigned int blk; - - while (bref < p+max) { - blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); - ext4_error_inode(inode, function, line, blk, - "invalid block"); - return -EIO; - } - } - return 0; -} - - -#define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, __LINE__, inode, \ - EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) - goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; - __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - ext4_group_t block_group; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need take - * colour into account. - */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - -/** - * ext4_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - * Because this is only used for non-extent files, we limit the block nr - * to 32 bits. - */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - ext4_fsblk_t goal; - - /* - * XXX need to get goal block from mballoc's data structures - */ - - goal = ext4_find_near(inode, partial); - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - return goal; -} - -/** - * ext4_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) -{ - unsigned int count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. - */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; - goto failed; - } - - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ - unlock_buffer(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); - } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - - return err; -} - -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) -{ - int i; - int err = 0; - ext4_fsblk_t current_block; - - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. - */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i) = cpu_to_le32(current_block++); - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - */ - ext4_mark_inode_dirty(handle, inode); - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, - EXT4_FREE_BLOCKS_FORGET); - } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); - - return err; -} - -/* - * The ext4_ind_map_blocks() function handles non-extents inodes - * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_map_blocks(). - * - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * The ext4_ind_get_blocks() function should be called with - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system - * blocks. - */ -static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - int flags) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - int count = 0; - ext4_fsblk_t first_block = 0; - - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, map->m_lblk, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - count++; - /*map more blocks*/ - while (count < map->m_len && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) - goto cleanup; - - /* - * Okay, we need to do block allocation. - */ - goal = ext4_find_goal(inode, map->m_lblk, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); - if (err) - goto cleanup; - - map->m_flags |= EXT4_MAP_NEW; - - ext4_update_inode_fsync_trans(handle, inode, 1); -got_it: - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = le32_to_cpu(chain[depth-1].key); - map->m_len = count; - if (count > blocks_to_boundary) - map->m_flags |= EXT4_MAP_BOUNDARY; - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); - return err; -} - #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { @@ -1073,33 +247,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) /* * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, - sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* - * Calculate the number of metadata blocks need to reserve * to allocate a block located at @lblock */ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) @@ -1107,7 +254,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, lblock); + return ext4_ind_calc_metadata_amount(inode, lblock); } /* @@ -1589,16 +736,6 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static void ext4_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); -} - static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, @@ -1849,6 +986,8 @@ static int ext4_journalled_write_end(struct file *file, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + BUG_ON(!ext4_handle_valid(handle)); + if (copied < len) { if (!PageUptodate(page)) copied = 0; @@ -1863,6 +1002,7 @@ static int ext4_journalled_write_end(struct file *file, if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -2148,7 +1288,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); - else + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else err = block_write_full_page(page, noalloc_get_block_write, mpd->wbc); @@ -2564,6 +1709,8 @@ static int __ext4_journalled_writepage(struct page *page, goto out; } + BUG_ON(!ext4_handle_valid(handle)); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, do_journal_get_write_access); @@ -2571,6 +1718,7 @@ static int __ext4_journalled_writepage(struct page *page, write_end_fn); if (ret == 0) ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) ret = err; @@ -2741,7 +1889,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2973,7 +2121,7 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); while (!ret && wbc->nr_to_write > 0) { @@ -3450,112 +2598,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; - - if (rw == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); - else { - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); - - if (unlikely((rw & WRITE) && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* * ext4_get_block used when preparing for a DIO write or buffer write. * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. @@ -3638,8 +2680,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_END_UNWRITTEN; + /* + * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, + * but being more careful is always safe for the future change. + */ inode = io_end->inode; + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -4033,383 +3082,6 @@ unlock: return err; } -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], - __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - * - * Return 0 on success, 1 on invalid block range - * and < 0 on fatal error. - */ -static int ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) -{ - __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; - int err; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { - EXT4_ERROR_INODE(inode, "attempt to clear invalid " - "blocks %llu len %lu", - (unsigned long long) block_to_free, count); - return 1; - } - - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } - - for (p = first; p < last; p++) - *p = 0; - - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); - return 0; -out_err: - ext4_std_error(inode->i_sb, err); - return err; -} - -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err = 0; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. */ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - err = ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p); - if (err) - break; - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (!err && count > 0) - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - if (err < 0) - /* fatal error */ - return; - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) - ext4_handle_dirty_metadata(handle, inode, this_bh); - else - EXT4_ERROR_INODE(inode, - "circular indirect block detected at " - "block %llu", - (unsigned long long) this_bh->b_blocknr); - } -} - -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; - - if (ext4_handle_is_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { - EXT4_ERROR_INODE(inode, - "invalid indirect mapped " - "block %lu (level %d)", - (unsigned long) nr, depth); - break; - } - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, - "Read failure"); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32 *) bh->b_data, - (__le32 *) bh->b_data + addr_per_block, - depth); - brelse(bh); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. - */ - if (ext4_handle_is_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - blocks_for_truncate(inode)); - } - - /* - * The forget flag here is critical because if - * we are journaling (and not doing data - * journaling), we have to make sure a revoke - * record is written to prevent the journal - * replay from overwriting the (former) - * indirect block if it gets reallocated as a - * data block. This must happen in the same - * transaction where the data blocks are - * actually freed. - */ - ext4_free_blocks(handle, inode, NULL, nr, 1, - EXT4_FREE_BLOCKS_METADATA| - EXT4_FREE_BLOCKS_FORGET); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, - inode, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); - } -} - int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) @@ -4476,19 +3148,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) */ void ext4_truncate(struct inode *inode) { - handle_t *handle; - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n = 0; - ext4_lblk_t last_block, max_block; - unsigned blocksize = inode->i_sb->s_blocksize; - trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4499,149 +3158,11 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); - trace_ext4_truncate_exit(inode); - return; - } - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ - - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - - if (inode->i_size & (blocksize - 1)) - if (ext4_block_truncate_page(handle, mapping, inode->i_size)) - goto out_stop; - - if (last_block != max_block) { - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - } - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - if (last_block == max_block) { - /* - * It is unnecessary to free any data blocks if last_block is - * equal to the indirect block limit. - */ - goto out_unlock; - } else if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); + else + ext4_ind_truncate(inode); - ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); } @@ -5012,7 +3533,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { /* Validate block references which are part of inode */ - ret = ext4_check_inode_blockref(inode); + ret = ext4_ind_check_inode(inode); } if (ret) goto bad_inode; @@ -5459,34 +3980,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, - int chunk) -{ - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } - /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block - */ - indirects = nrblocks * 2 + 1; - return indirects; -} - static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, nrblocks, chunk); return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554..f18bfe3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -202,8 +202,9 @@ setversion_out: struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; @@ -221,6 +222,7 @@ setversion_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } @@ -271,8 +273,9 @@ mext_out: struct super_block *sb = inode->i_sb; int err, err2=0; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = ext4_resize_begin(sb); + if (err) + return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) @@ -291,6 +294,7 @@ mext_out: if (err == 0) err = err2; mnt_drop_write(filp->f_path.mnt); + ext4_resize_end(sb); return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d..17a5a57 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -75,8 +75,8 @@ * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -84,7 +84,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -128,12 +128,13 @@ * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is * 512 blocks. This can be tuned via - * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan @@ -152,7 +153,7 @@ * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %u " - "at byte %u(%u): %x in copy != %x " - "on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; @@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len) } } -static void mb_set_bits(void *bm, int cur, int len) +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " + "for a buddy group"); goto exit_meta_group_info; } sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = @@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); goto exit_group_info; } memset(meta_group_info[i], 0, kmem_cache_size(cachep)); @@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ @@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb) /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); + sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } - sbi->s_buddy_cache->i_ino = get_next_ino(); + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %u\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2362,7 +2369,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size) slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_groupinfo_caches[cache_index] = cachep; + mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { - printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } - ext4_groupinfo_caches[cache_index] = cachep; - return 0; } @@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - goto out; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { @@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + goto out; + } + if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); @@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } @@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) rb_erase(&entry->node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); + if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() @@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) @@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -2830,8 +2860,9 @@ out_err: /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? @@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); @@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(bitmap, entry->start_blk, entry->count); + ext4_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -3775,7 +3805,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", + ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -4637,7 +4669,7 @@ do_more: } ext4_mark_super_dirty(sb); error_return: - if (freed) + if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); @@ -4645,7 +4677,7 @@ error_return: } /** - * ext4_add_groupblocks() -- Add given blocks to an existing group + * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physcial block to add to the block group @@ -4653,7 +4685,7 @@ error_return: * * This marks the blocks as free in the bitmap and buddy. */ -void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; @@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, struct ext4_buddy e4b; int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; - struct ext4_group_info *grp; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + if (count == 0) + return 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; goto error_return; + } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) + if (!bitmap_bh) { + err = -EIO; goto error_return; + } + desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) + if (!desc) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, desc), block, count) || in_range(ext4_inode_bitmap(sb, desc), block, count) || @@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); + err = -EINVAL; goto error_return; } @@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; + return err; } /** @@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, { struct ext4_free_extent ex; + trace_ext4_trim_extent(sb, group, start, count); + assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; @@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system - * @e4b: ext4 buddy + * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count @@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t minblocks) { void *bitmap; - ext4_grpblk_t next, count = 0; + ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; int ret; + trace_ext4_trim_all_free(sb, group, start, max); + ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_error(sb, "Error in loading buddy " @@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; @@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next - start, group, &e4b); count += next - start; } + free_count += next - start; start = next + 1; if (fatal_signal_pending(current)) { @@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); } - if ((e4b.bd_info->bb_free - count) < minblocks) + if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } + + if (!ret) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); +out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) return -EINVAL; + if (start + len <= first_data_blk) + goto out; if (start < first_data_blk) { len -= first_data_blk - start; start = first_data_blk; @@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } range->len = trimmed * sb->s_blocksize; + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: return ret; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7b..9d4a636 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -187,7 +187,6 @@ struct ext4_allocation_context { __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; - __u8 ac_repeats; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8c9baba..f8068c7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q *err = -ENOENT; errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); dx_release (frames); return NULL; } @@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ - - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on s_orphan_lock), so race with ext4_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. - * - * tytso, 4/25/2009: I'm not sure how that could happen; - * shouldn't the fs core protect us from these sort of - * unlink()/link() races? + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -2260,9 +2253,11 @@ static int ext4_symlink(struct inode *dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76..92f38ee 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work) unsigned long flags; int ret; - mutex_lock(&inode->i_mutex); + if (!mutex_trylock(&inode->i_mutex)) { + /* + * Requeue the work instead of waiting so that the work + * items queued after this can be processed. + */ + queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); + /* + * To prevent the ext4-dio-unwritten thread from keeping + * requeueing end_io requests and occupying cpu for too long, + * yield the cpu if it sees an end_io request that has already + * been requeued. + */ + if (io->flag & EXT4_IO_END_QUEUED) + yield(); + io->flag |= EXT4_IO_END_QUEUED; + return; + } ret = ext4_end_io_nolock(io); if (ret < 0) { mutex_unlock(&inode->i_mutex); @@ -285,11 +301,7 @@ static int io_submit_init(struct ext4_io_submit *io, io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) return -ENOMEM; - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (bio == NULL); - + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_private = io->io_end = io_end; @@ -338,8 +350,10 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh)) - io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c..707d3f1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -16,6 +16,35 @@ #include "ext4_jbd2.h" +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_clear_bit(); +} + #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, brelse(bh); bh = ERR_PTR(err); } else { - lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); - unlock_buffer(bh); } return bh; @@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, * If that fails, restart the transaction & regain write access for the * buffer head which is used for block_bitmap modifications. */ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) +static int extend_or_restart_transaction(handle_t *handle, int thresh) { int err; @@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, if (err < 0) return err; if (err) { - if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) - return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); + if (err) return err; } @@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } - - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bit(0, bh->b_data); - } + BUG_ON(input->group != sbi->s_groups_count); /* Copy all of the GDT blocks into the backup in this group */ for (i = 0, bit = 1, block = start + 1; @@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto exit_journal; gdb = sb_getblk(sb, block); if (!gdb) { err = -EIO; - goto exit_bh; + goto exit_journal; } if ((err = ext4_journal_get_write_access(handle, gdb))) { brelse(gdb); - goto exit_bh; + goto exit_journal; } - lock_buffer(gdb); memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); - unlock_buffer(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); - goto exit_bh; + goto exit_journal; } - ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); if (err) - goto exit_bh; - for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) - ext4_set_bit(bit, bh->b_data); + goto exit_journal; + + err = extend_or_restart_transaction(handle, 2); + if (err) + goto exit_journal; + + bh = bclean(handle, sb, input->block_bitmap); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup group tables %#04llx (+0)\n", start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); + } ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; - for (i = 0, bit = input->inode_table - start; - i < sbi->s_itb_per_group; i++, bit++) - ext4_set_bit(bit, bh->b_data); + ext4_set_bits(bh->b_data, input->inode_table - start, + sbi->s_itb_per_group); - if ((err = extend_or_restart_transaction(handle, 2, bh))) - goto exit_bh; ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); @@ -285,7 +303,6 @@ exit_bh: brelse(bh); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input, - struct buffer_head **primary) + ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc; struct buffer_head *dind; + struct buffer_head *gdb_bh; int gdbackups; struct ext4_iloc iloc; __le32 *data; @@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return -EPERM; } - *primary = sb_bread(sb, gdblock); - if (!*primary) + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) return -EIO; - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + gdbackups = verify_reserved_gdb(sb, gdb_bh); + if (gdbackups < 0) { err = gdbackups; goto exit_bh; } @@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { ext4_warning(sb, "new group %u GDT block %llu not reserved", - input->group, gdblock); + group, gdblock); err = -EINVAL; goto exit_dind; } @@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dind; - err = ext4_journal_get_write_access(handle, *primary); + err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) goto exit_sbh; @@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto exit_dindj; - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, - "not enough memory for %lu groups", gdb_num + 1); + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); goto exit_inode; } @@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); - memset((*primary)->b_data, 0, sb->s_blocksize); - err = ext4_handle_dirty_metadata(handle, NULL, *primary); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); goto exit_inode; @@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; + n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - kfree(o_group_desc); + ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); @@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: + ext4_kvfree(n_group_desc); /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: @@ -508,7 +528,7 @@ exit_sbh: exit_dind: brelse(dind); exit_bh: - brelse(*primary); + brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; @@ -528,7 +548,7 @@ exit_bh: * backup GDT blocks are stored in their reserved primary GDT block. */ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input) + ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); @@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ - blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; @@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - mutex_lock(&sbi->s_resize_lock); - if (input->group != sbi->s_groups_count) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) goto exit_journal; @@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if ((err = ext4_journal_get_write_access(handle, primary))) goto exit_journal; - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { + err = reserve_backup_gdb(handle, inode, input->group); + if (err) + goto exit_journal; + } + } else { + /* + * Note that we can access new group descriptor block safely + * only if add_new_gdb() succeeds. + */ + err = add_new_gdb(handle, inode, input->group); + if (err) goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; + primary = sbi->s_group_desc[gdb_num]; + } /* * OK, now we've set up the new group. Time to make it active. * - * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold s_resize_lock - * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold s_resize_lock over the access - * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. * @@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_handle_dirty_super(handle, sb); exit_journal: - mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; - if (!err) { + if (!err && primary) { update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, sizeof(struct ext4_super_block)); update_backups(sb, primary->b_blocknr, primary->b_data, @@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_grpblk_t add; struct buffer_head *bh; handle_t *handle; - int err; + int err, err2; ext4_group_t group; - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) @@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if (n_blocks_count < o_blocks_count) { ext4_warning(sb, "can't shrink FS - resize aborted"); - return -EBUSY; + return -EINVAL; } /* Handle the remaining blocks in the last group only. */ @@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - mutex_lock(&EXT4_SB(sb)->s_resize_lock); - if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, "multiple resizers run on filesystem!"); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); - ext4_journal_stop(handle); - err = -EBUSY; - goto exit_put; - } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, "error %d on journal write access", err); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ - ext4_add_groupblocks(handle, sb, o_blocks_count, add); + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - if ((err = ext4_journal_stop(handle))) + err2 = ext4_journal_stop(handle); + if (!err && err2) + err = err2; + + if (err) goto exit_put; if (test_opt(sb, DEBUG)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa..44d0c8d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { #define IS_EXT3_SB(sb) (0) #endif +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void *ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +void ext4_kvfree(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); + +} + ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal_t *journal; handle_t *handle; + trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); @@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); + ext4_kvfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -892,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head) static void ext4_destroy_inode(struct inode *inode) { - ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1976,15 +2002,11 @@ static int ext4_fill_flex_info(struct super_block *sb) ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - sbi->s_flex_groups = vzalloc(size); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, - "not enough memory for %u flex groups", - flex_group_count); - goto failed; - } + ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", + flex_group_count); + goto failed; } for (i = 0; i < sbi->s_groups_count; i++) { @@ -2383,17 +2405,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) - return sbi->s_stripe; - - if (stripe_width <= sbi->s_blocks_per_group) - return stripe_width; + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; - if (stride <= sbi->s_blocks_per_group) - return stride; + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; - return 0; + return ret; } /* sysfs supprt */ @@ -3408,8 +3438,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), - GFP_KERNEL); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); goto failed_mount; @@ -3491,7 +3522,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - mutex_init(&sbi->s_resize_lock); + sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3741,12 +3772,8 @@ failed_mount_wq: } failed_mount3: del_timer(&sbi->s_err_report); - if (sbi->s_flex_groups) { - if (is_vmalloc_addr(sbi->s_flex_groups)) - vfree(sbi->s_flex_groups); - else - kfree(sbi->s_flex_groups); - } + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -3756,7 +3783,7 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 0000000..011ba66 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 4ad6473..5efbd5d 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ - struct msdos_dir_entry *de; + struct msdos_dir_entry *uninitialized_var(de); int err, free_slots, i, nr_bhs; loff_t pos, i_pos; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5942fec..1726d73 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { - fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" + fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be " - "case sensitive!\n"); + "case sensitive!"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ @@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->free_clusters = -1; /* Don't know yet */ sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; + sb->s_maxbytes = 0xffffffff; if (!sbi->fat_length && b->fat32_length) { struct fat_boot_fsinfo *fsinfo; @@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->fat_length = le32_to_cpu(b->fat32_length); sbi->root_cluster = le32_to_cpu(b->root_cluster); - sb->s_maxbytes = 0xffffffff; - /* MC - if info_sector is 0, don't multiply by 0 */ sbi->fsinfo_sector = le16_to_cpu(b->info_sector); if (sbi->fsinfo_sector == 0) diff --git a/fs/file_table.c b/fs/file_table.c index 01e4c1e..c322794 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -25,7 +25,7 @@ #include <linux/percpu.h> #include <linux/ima.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b8c507c..04cf3b9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -35,7 +35,9 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; + unsigned long *older_than_this; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -180,12 +182,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - spin_lock(&inode_wb_list_lock); + struct backing_dev_info *bdi = inode_to_bdi(inode); + + spin_lock(&bdi->wb.list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); } - /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -195,11 +198,9 @@ void inode_wb_list_del(struct inode *inode) * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -213,11 +214,9 @@ static void redirty_tail(struct inode *inode) /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } @@ -225,7 +224,7 @@ static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through - * spin_unlock(&inode_wb_list_lock); + * spin_unlock(&wb->list_lock); */ smp_mb(); @@ -250,15 +249,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) /* * Move expired dirty inodes from @delaying_queue to @dispatch_queue. */ -static void move_expired_inodes(struct list_head *delaying_queue, +static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + unsigned long *older_than_this) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; + int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); @@ -269,12 +269,13 @@ static void move_expired_inodes(struct list_head *delaying_queue, do_sb_sort = 1; sb = inode->i_sb; list_move(&inode->i_wb_list, &tmp); + moved++; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); - return; + goto out; } /* Move inodes from one superblock together */ @@ -286,6 +287,8 @@ static void move_expired_inodes(struct list_head *delaying_queue, list_move(&inode->i_wb_list, dispatch_queue); } } +out: + return moved; } /* @@ -301,9 +304,11 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - assert_spin_locked(&inode_wb_list_lock); + int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); + trace_writeback_queue_io(wb, older_than_this, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -316,7 +321,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) /* * Wait for writeback on an inode to complete. */ -static void inode_wait_for_writeback(struct inode *inode) +static void inode_wait_for_writeback(struct inode *inode, + struct bdi_writeback *wb) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; @@ -324,15 +330,15 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * Write out an inode's dirty pages. Called under wb->list_lock and * inode->i_lock. Either the caller has an active reference on the inode or * the inode has I_WILL_FREE set. * @@ -343,13 +349,15 @@ static void inode_wait_for_writeback(struct inode *inode) * livelocks, etc. */ static int -writeback_single_inode(struct inode *inode, struct writeback_control *wbc) +writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, + struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; + long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; - assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); if (!atomic_read(&inode->i_count)) @@ -367,14 +375,16 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { - requeue_io(inode); + requeue_io(inode, wb); + trace_writeback_single_inode_requeue(inode, wbc, + nr_to_write); return 0; } /* * It's a data-integrity sync. We must wait. */ - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); } BUG_ON(inode->i_state & I_SYNC); @@ -383,7 +393,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); ret = do_writepages(mapping, wbc); @@ -414,10 +424,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() @@ -428,7 +447,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * slice used up: queue for next turn */ - requeue_io(inode); + requeue_io(inode, wb); } else { /* * Writeback blocked by something other than @@ -437,7 +456,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ - redirty_tail(inode); + redirty_tail(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* @@ -446,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * submission or metadata updates after data IO * completion. */ - redirty_tail(inode); + redirty_tail(inode, wb); } else { /* * The inode is clean. At this point we either have @@ -457,9 +476,41 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); + trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } +static long writeback_chunk_size(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + long pages; + + /* + * WB_SYNC_ALL mode does livelock avoidance by syncing dirty + * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX + * here avoids calling into writeback_inodes_wb() more than once. + * + * The intended call sequence for WB_SYNC_ALL writeback is: + * + * wb_writeback() + * writeback_sb_inodes() <== called only once + * write_cache_pages() <== called once for each inode + * (quickly) tag currently dirty pages + * (maybe slowly) sync all tagged pages + */ + if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) + pages = LONG_MAX; + else { + pages = min(bdi->avg_write_bandwidth / 2, + global_dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + pages = round_down(pages + MIN_WRITEBACK_PAGES, + MIN_WRITEBACK_PAGES); + } + + return pages; +} + /* * Write a portion of b_io inodes which belong to @sb. * @@ -467,24 +518,36 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * inodes. Otherwise write only ones which go sequentially * in reverse order. * - * Return 1, if the caller writeback routine should be - * interrupted. Otherwise return 0. + * Return the number of pages and/or inodes written. */ -static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, - struct writeback_control *wbc, bool only_this_sb) +static long writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct wb_writeback_work *work) { + struct writeback_control wbc = { + .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, + .for_kupdate = work->for_kupdate, + .for_background = work->for_background, + .range_cyclic = work->range_cyclic, + .range_start = 0, + .range_end = LLONG_MAX, + }; + unsigned long start_time = jiffies; + long write_chunk; + long wrote = 0; /* count both pages and inodes */ + while (!list_empty(&wb->b_io)) { - long pages_skipped; struct inode *inode = wb_inode(wb->b_io.prev); if (inode->i_sb != sb) { - if (only_this_sb) { + if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ - redirty_tail(inode); + redirty_tail(inode, wb); continue; } @@ -493,7 +556,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Bounce back to the caller to unpin this and * pin the next superblock. */ - return 0; + break; } /* @@ -504,95 +567,96 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); - requeue_io(inode); + redirty_tail(inode, wb); continue; } - - /* - * Was this inode dirtied after sync_sb_inodes was called? - * This keeps sync from extra jobs and livelock. - */ - if (inode_dirtied_after(inode, wbc->wb_start)) { - spin_unlock(&inode->i_lock); - return 1; - } - __iget(inode); + write_chunk = writeback_chunk_size(wb->bdi, work); + wbc.nr_to_write = write_chunk; + wbc.pages_skipped = 0; + + writeback_single_inode(inode, wb, &wbc); - pages_skipped = wbc->pages_skipped; - writeback_single_inode(inode, wbc); - if (wbc->pages_skipped != pages_skipped) { + work->nr_pages -= write_chunk - wbc.nr_to_write; + wrote += write_chunk - wbc.nr_to_write; + if (!(inode->i_state & I_DIRTY)) + wrote++; + if (wbc.pages_skipped) { /* * writeback is not making progress due to locked * buffers. Skip this inode for now. */ - redirty_tail(inode); + redirty_tail(inode, wb); } spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); iput(inode); cond_resched(); - spin_lock(&inode_wb_list_lock); - if (wbc->nr_to_write <= 0) { - wbc->more_io = 1; - return 1; + spin_lock(&wb->list_lock); + /* + * bail out to wb_writeback() often enough to check + * background threshold and other termination conditions. + */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; } - if (!list_empty(&wb->b_more_io)) - wbc->more_io = 1; } - /* b_io is empty */ - return 1; + return wrote; } -void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +static long __writeback_inodes_wb(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - int ret = 0; - - if (!wbc->wb_start) - wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); + unsigned long start_time = jiffies; + long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!grab_super_passive(sb)) { - requeue_io(inode); + /* + * grab_super_passive() may fail consistently due to + * s_umount being grabbed by someone else. Don't use + * requeue_io() to avoid busy retrying the inode/sb. + */ + redirty_tail(inode, wb); continue; } - ret = writeback_sb_inodes(sb, wb, wbc, false); + wrote += writeback_sb_inodes(sb, wb, work); drop_super(sb); - if (ret) - break; + /* refer to the same tests at the end of writeback_sb_inodes */ + if (wrote) { + if (time_is_before_jiffies(start_time + HZ / 10UL)) + break; + if (work->nr_pages <= 0) + break; + } } - spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ + return wrote; } -static void __writeback_inodes_sb(struct super_block *sb, - struct bdi_writeback *wb, struct writeback_control *wbc) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) { - WARN_ON(!rwsem_is_locked(&sb->s_umount)); + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + }; - spin_lock(&inode_wb_list_lock); - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_wb_list_lock); -} + spin_lock(&wb->list_lock); + if (list_empty(&wb->b_io)) + queue_io(wb, NULL); + __writeback_inodes_wb(wb, &work); + spin_unlock(&wb->list_lock); -/* - * The maximum number of pages to writeout in a single bdi flush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 + return nr_pages - work.nr_pages; +} static inline bool over_bground_thresh(void) { @@ -605,6 +669,16 @@ static inline bool over_bground_thresh(void) } /* + * Called under wb->list_lock. If there are multiple wb per bdi, + * only the flusher working on the first wb should do it. + */ +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long start_time) +{ + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); +} + +/* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the @@ -622,47 +696,16 @@ static inline bool over_bground_thresh(void) static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - struct writeback_control wbc = { - .sync_mode = work->sync_mode, - .older_than_this = NULL, - .for_kupdate = work->for_kupdate, - .for_background = work->for_background, - .range_cyclic = work->range_cyclic, - }; + unsigned long wb_start = jiffies; + long nr_pages = work->nr_pages; unsigned long oldest_jif; - long wrote = 0; - long write_chunk; struct inode *inode; + long progress; - if (wbc.for_kupdate) { - wbc.older_than_this = &oldest_jif; - oldest_jif = jiffies - - msecs_to_jiffies(dirty_expire_interval * 10); - } - if (!wbc.range_cyclic) { - wbc.range_start = 0; - wbc.range_end = LLONG_MAX; - } - - /* - * WB_SYNC_ALL mode does livelock avoidance by syncing dirty - * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX - * here avoids calling into writeback_inodes_wb() more than once. - * - * The intended call sequence for WB_SYNC_ALL writeback is: - * - * wb_writeback() - * __writeback_inodes_sb() <== called only once - * write_cache_pages() <== called once for each inode - * (quickly) tag currently dirty pages - * (maybe slowly) sync all tagged pages - */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else - write_chunk = LONG_MAX; + oldest_jif = jiffies; + work->older_than_this = &oldest_jif; - wbc.wb_start = jiffies; /* livelock avoidance */ + spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -687,52 +730,54 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !over_bground_thresh()) break; - wbc.more_io = 0; - wbc.nr_to_write = write_chunk; - wbc.pages_skipped = 0; + if (work->for_kupdate) { + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + work->older_than_this = &oldest_jif; + } - trace_wbc_writeback_start(&wbc, wb->bdi); + trace_writeback_start(wb->bdi, work); + if (list_empty(&wb->b_io)) + queue_io(wb, work->older_than_this); if (work->sb) - __writeback_inodes_sb(work->sb, wb, &wbc); + progress = writeback_sb_inodes(work->sb, wb, work); else - writeback_inodes_wb(wb, &wbc); - trace_wbc_writeback_written(&wbc, wb->bdi); + progress = __writeback_inodes_wb(wb, work); + trace_writeback_written(wb->bdi, work); - work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wb_update_bandwidth(wb, wb_start); /* - * If we consumed everything, see if we have more + * Did we write something? Try for more + * + * Dirty inodes are moved to b_io for writeback in batches. + * The completion of the current batch does not necessarily + * mean the overall work is done. So we keep looping as long + * as made some progress on cleaning pages or inodes. */ - if (wbc.nr_to_write <= 0) + if (progress) continue; /* - * Didn't write everything and we don't have more IO, bail + * No more inodes for IO, bail */ - if (!wbc.more_io) + if (list_empty(&wb->b_more_io)) break; /* - * Did we write something? Try for more - */ - if (wbc.nr_to_write < write_chunk) - continue; - /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { + trace_writeback_wait(wb->bdi, work); inode = wb_inode(wb->b_more_io.prev); - trace_wbc_writeback_wait(&wbc, wb->bdi); spin_lock(&inode->i_lock); - inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode, wb); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_wb_list_lock); } + spin_unlock(&wb->list_lock); - return wrote; + return nr_pages - work->nr_pages; } /* @@ -1063,10 +1108,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } spin_unlock(&inode->i_lock); - spin_lock(&inode_wb_list_lock); + spin_lock(&bdi->wb.list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&bdi->wb.list_lock); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); @@ -1162,10 +1207,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1267,6 +1313,7 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -1279,11 +1326,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, &wbc); + ret = writeback_single_inode(inode, wb, &wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1303,13 +1350,14 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; int ret; - spin_lock(&inode_wb_list_lock); + spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - ret = writeback_single_inode(inode, wbc); + ret = writeback_single_inode(inode, wb, wbc); spin_unlock(&inode->i_lock); - spin_unlock(&inode_wb_list_lock); + spin_unlock(&wb->list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 640fc22..5cb8614 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nlookup = nlookup; spin_lock(&fc->lock); - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } spin_unlock(&fc->lock); } @@ -1358,6 +1362,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, if (outarg.namelen > FUSE_NAME_MAX) goto err; + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d480d9a..594f07a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/compat.h> +#include <linux/swap.h> static const struct file_operations fuse_direct_io_file_operations; @@ -245,6 +246,12 @@ void fuse_release_common(struct file *file, int opcode) req = ff->reserved_req; fuse_prepare_release(ff, file->f_flags, opcode); + if (ff->flock) { + struct fuse_release_in *inarg = &req->misc.release.in; + inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + inarg->lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); + } /* Hold vfsmount and dentry until release is finished */ path_get(&file->f_path); req->misc.release.path = file->f_path; @@ -755,18 +762,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, return req->misc.write.out.size; } -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - - *pagep = grab_cache_page_write_begin(mapping, index, flags); - if (!*pagep) - return -ENOMEM; - return 0; -} - void fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -779,62 +774,6 @@ void fuse_write_update_size(struct inode *inode, loff_t pos) spin_unlock(&fc->lock); } -static int fuse_buffered_write(struct file *file, struct inode *inode, - loff_t pos, unsigned count, struct page *page) -{ - int err; - size_t nres; - struct fuse_conn *fc = get_fuse_conn(inode); - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct fuse_req *req; - - if (is_bad_inode(inode)) - return -EIO; - - /* - * Make sure writepages on the same page are not mixed up with - * plain writes. - */ - fuse_wait_on_page_writeback(inode, page->index); - - req = fuse_get_req(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_offset = offset; - nres = fuse_send_write(req, file, pos, count, NULL); - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err && !nres) - err = -EIO; - if (!err) { - pos += nres; - fuse_write_update_size(inode, pos); - if (count == PAGE_CACHE_SIZE) - SetPageUptodate(page); - } - fuse_invalidate_attr(inode); - return err ? err : nres; -} - -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - int res = 0; - - if (copied) - res = fuse_buffered_write(file, inode, pos, copied, page); - - unlock_page(page); - page_cache_release(page); - return res; -} - static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, struct inode *inode, loff_t pos, size_t count) @@ -908,6 +847,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, pagefault_enable(); flush_dcache_page(page); + mark_page_accessed(page); + if (!tmp) { unlock_page(page); page_cache_release(page); @@ -1559,11 +1500,14 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (fc->no_lock) { + if (fc->no_flock) { err = flock_lock_file_wait(file, fl); } else { + struct fuse_file *ff = file->private_data; + /* emulate flock with POSIX locks */ fl->fl_owner = (fl_owner_t) file; + ff->flock = true; err = fuse_setlk(file, fl, 1); } @@ -2201,8 +2145,6 @@ static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, .launder_page = fuse_launder_page, - .write_begin = fuse_write_begin, - .write_end = fuse_write_end, .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c6aa2d4..cf6db0a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -135,6 +135,9 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + + /** Has flock been performed on this file? */ + bool flock:1; }; /** One input argument of a request */ @@ -448,7 +451,7 @@ struct fuse_conn { /** Is removexattr not implemented by fs? */ unsigned no_removexattr:1; - /** Are file locking primitives not implemented by fs? */ + /** Are posix file locking primitives not implemented by fs? */ unsigned no_lock:1; /** Is access not implemented by fs? */ @@ -472,6 +475,9 @@ struct fuse_conn { /** Don't apply umask to creation modes */ unsigned dont_mask:1; + /** Are BSD file locking primitives not implemented by fs? */ + unsigned no_flock:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 38f84cd..add96f6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -71,7 +71,7 @@ struct fuse_mount_data { unsigned blksize; }; -struct fuse_forget_link *fuse_alloc_forget() +struct fuse_forget_link *fuse_alloc_forget(void) { return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); } @@ -809,6 +809,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_read = 1; if (!(arg->flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; + if (arg->minor >= 17) { + if (!(arg->flags & FUSE_FLOCK_LOCKS)) + fc->no_flock = 1; + } else { + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_flock = 1; + } if (arg->flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { @@ -823,6 +830,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; + fc->no_flock = 1; } fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); @@ -843,7 +851,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | - FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | + FUSE_FLOCK_LOCKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/fs/generic_acl.c b/fs/generic_acl.c index d5e33a0..d0dddac 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, return PTR_ERR(acl); } if (acl) { - mode_t mode; - error = posix_acl_valid(acl); if (error) goto failed; switch (type) { case ACL_TYPE_ACCESS: - mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) goto failed; - inode->i_mode = mode; inode->i_ctime = CURRENT_TIME; if (error == 0) { posix_acl_release(acl); @@ -125,21 +121,20 @@ int generic_acl_init(struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; - mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); if (acl) { if (S_ISDIR(inode->i_mode)) set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); + error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (error < 0) return error; - inode->i_mode = mode; if (error > 0) set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + } else { + inode->i_mode &= ~current_umask(); } error = 0; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 884c9af..34501b6 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -72,7 +72,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) return gfs2_acl_get(GFS2_I(inode), type); } -static int gfs2_set_mode(struct inode *inode, mode_t mode) +static int gfs2_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -117,7 +117,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct posix_acl *acl; - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error = 0; if (!sdp->sd_args.ar_posix_acl) @@ -276,7 +276,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, goto out_release; if (type == ACL_TYPE_ACCESS) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 29e1ace..8a139ff 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -16,7 +16,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 516516e..3bc073a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1018,13 +1018,13 @@ hostdata_error: fsname++; if (lm->lm_mount == NULL) { fs_info(sdp, "Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, fsname); if (ret == 0) fs_info(sdp, "Joined cluster. Now mounting FS...\n"); - complete(&sdp->sd_locking_init); + complete_all(&sdp->sd_locking_init); return ret; } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 8635be5..970ea98 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -16,6 +16,7 @@ #include <linux/statfs.h> #include <linux/types.h> #include <linux/pid_namespace.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include "os.h" diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87b6e04..ec88953 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -491,6 +491,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_op = &page_symlink_inode_operations; break; } + lockdep_annotate_inode_mutex_key(inode); } return inode; } @@ -37,7 +37,7 @@ * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list - * inode_wb_list_lock protects: + * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash @@ -48,7 +48,7 @@ * inode->i_lock * inode->i_sb->s_inode_lru_lock * - * inode_wb_list_lock + * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock @@ -65,7 +65,6 @@ static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * Empty aops. Can be used for the cases where the user does not @@ -144,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_op = &empty_iops; inode->i_fop = &empty_fops; inode->i_nlink = 1; + inode->i_opflags = 0; inode->i_uid = 0; inode->i_gid = 0; atomic_set(&inode->i_writecount, 0); @@ -362,9 +362,11 @@ EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - spin_lock(&inode_sb_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode_sb_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); + } } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -398,12 +400,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) EXPORT_SYMBOL(__insert_inode_hash); /** - * remove_inode_hash - remove an inode from the hash + * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ -void remove_inode_hash(struct inode *inode) +void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); @@ -411,7 +413,7 @@ void remove_inode_hash(struct inode *inode) spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } -EXPORT_SYMBOL(remove_inode_hash); +EXPORT_SYMBOL(__remove_inode_hash); void end_writeback(struct inode *inode) { @@ -453,7 +455,9 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - inode_wb_list_del(inode); + if (!list_empty(&inode->i_wb_list)) + inode_wb_list_del(inode); + inode_sb_list_del(inode); if (op->evict_inode) { @@ -797,6 +801,29 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + +/** * new_inode - obtain an inode * @sb: superblock * @@ -814,27 +841,16 @@ struct inode *new_inode(struct super_block *sb) spin_lock_prefetch(&inode_sb_list_lock); - inode = alloc_inode(sb); - if (inode) { - spin_lock(&inode->i_lock); - inode->i_state = 0; - spin_unlock(&inode->i_lock); + inode = new_inode_pseudo(sb); + if (inode) inode_sb_list_add(inode); - } return inode; } EXPORT_SYMBOL(new_inode); -/** - * unlock_new_inode - clear the I_NEW state and wake up any waiters - * @inode: new inode to unlock - * - * Called when the inode is fully initialised to clear the new state of the - * inode and wake up anyone waiting for the inode to finish initialisation. - */ -void unlock_new_inode(struct inode *inode) -{ #ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_annotate_inode_mutex_key(struct inode *inode) +{ if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; @@ -850,7 +866,20 @@ void unlock_new_inode(struct inode *inode) &type->i_mutex_dir_key); } } +} +EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ +void unlock_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; @@ -1308,7 +1337,8 @@ static void iput_final(struct inode *inode) } inode->i_state |= I_FREEING; - inode_lru_list_del(inode); + if (!list_empty(&inode->i_lru)) + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e4b87bc..f94fc48 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,8 @@ #include <linux/jbd.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Unlink a buffer from a transaction checkpoint list. @@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -220,8 +226,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -240,7 +246,6 @@ restart: */ released = __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } @@ -253,9 +258,12 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; @@ -304,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -358,6 +366,7 @@ int log_do_checkpoint(journal_t *journal) * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -503,6 +512,7 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %u), " "freeing %u\n", @@ -523,9 +533,9 @@ int cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and release + * them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of bufers reaped (for debug) */ @@ -632,8 +642,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -652,13 +662,14 @@ int __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -669,10 +680,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -684,7 +693,6 @@ int __journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -703,6 +711,8 @@ void __journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { @@ -752,6 +762,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 72ffa97..8799207 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -204,6 +205,8 @@ write_out_data: if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; @@ -236,6 +239,8 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; @@ -253,10 +258,6 @@ write_out_data: jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); - journal_remove_journal_head(bh); - /* One for our safety reference, other for - * journal_remove_journal_head() */ - put_bh(bh); release_data_buffer(bh); } @@ -266,6 +267,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); return err; @@ -316,12 +318,14 @@ void journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -392,6 +396,7 @@ void journal_commit_transaction(journal_t *journal) */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -446,14 +451,9 @@ void journal_commit_transaction(journal_t *journal) } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) { + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } + jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } @@ -493,6 +493,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -797,10 +798,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -858,28 +865,27 @@ restart_loop: __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); @@ -946,6 +952,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e2d4285..9fe061f 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -38,6 +38,9 @@ #include <linux/debugfs.h> #include <linux/ratelimit.h> +#define CREATE_TRACE_POINTS +#include <trace/events/jbd.h> + #include <asm/uaccess.h> #include <asm/page.h> @@ -1065,6 +1068,7 @@ void journal_update_superblock(journal_t *journal, int wait) } else write_dirty_buffer(bh, WRITE); + trace_jbd_update_superblock_end(journal, wait); out: /* If we have just flushed the log (by marking s_start==0), then * any future commit will have to be careful to update the @@ -1799,10 +1803,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1815,17 +1818,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1889,61 +1891,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1953,11 +1923,12 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index f7ee81a..7e59c6e 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> +#include <linux/backing-dev.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -99,11 +100,10 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); if (!new_transaction) { - ret = -ENOMEM; - goto out; + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; } } @@ -696,7 +696,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -818,7 +817,6 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; /* first access by this transaction */ jh->b_modified = 0; @@ -844,8 +842,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } @@ -1069,8 +1067,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) ret = -EIO; goto no_journal; } - - if (jh->b_transaction != NULL) { + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1091,8 +1090,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1300,8 +1297,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1622,19 +1617,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1661,16 +1669,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1733,7 +1737,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1770,10 +1774,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1784,8 +1787,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } @@ -2070,6 +2072,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2127,9 +2131,10 @@ void journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { @@ -2153,6 +2158,11 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2169,30 +2179,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2c62c5a..16a698b 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -257,9 +257,12 @@ static void __flush_batch(journal_t *journal, int *batch_count) { int i; + struct blk_plug plug; + blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0dfa5b59..f24df13 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2390,73 +2390,6 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } -/* - * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 - * tracing infrastructure to map a dev_t to a device name. - * - * The caller should use rcu_read_lock() in order to make sure the - * device name stays valid until its done with it. We use - * rcu_read_lock() as well to make sure we're safe in case the caller - * gets sloppy, and because rcu_read_lock() is cheap and can be safely - * nested. - */ -struct devname_cache { - struct rcu_head rcu; - dev_t device; - char devname[BDEVNAME_SIZE]; -}; -#define CACHE_SIZE_BITS 6 -static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; -static DEFINE_SPINLOCK(devname_cache_lock); - -static void free_devcache(struct rcu_head *rcu) -{ - kfree(rcu); -} - -const char *jbd2_dev_to_name(dev_t device) -{ - int i = hash_32(device, CACHE_SIZE_BITS); - char *ret; - struct block_device *bd; - static struct devname_cache *new_dev; - - rcu_read_lock(); - if (devcache[i] && devcache[i]->device == device) { - ret = devcache[i]->devname; - rcu_read_unlock(); - return ret; - } - rcu_read_unlock(); - - new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!new_dev) - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - bd = bdget(device); - spin_lock(&devname_cache_lock); - if (devcache[i]) { - if (devcache[i]->device == device) { - kfree(new_dev); - bdput(bd); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; - } - call_rcu(&devcache[i]->rcu, free_devcache); - } - devcache[i] = new_dev; - devcache[i]->device = device; - if (bd) { - bdevname(bd, devcache[i]->devname); - bdput(bd); - } else - __bdevname(device, devcache[i]->devname); - ret = devcache[i]->devname; - spin_unlock(&devname_cache_lock); - return ret; -} -EXPORT_SYMBOL(jbd2_dev_to_name); - MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 27c511a..926d020 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; rc = posix_acl_equiv_mode(acl, &mode); if (rc < 0) return rc; @@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { struct posix_acl *acl; int rc; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index b3421c7..9b47724 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,7 @@ struct jffs2_acl_header { struct posix_acl *jffs2_get_acl(struct inode *inode, int type); extern int jffs2_acl_chmod(struct inode *); -extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); extern const struct xattr_handler jffs2_acl_access_xattr_handler; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index eeead33..bbcb975 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -80,7 +80,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); if (ret) { jffs2_free_raw_inode(ri); - if (S_ISLNK(inode->i_mode & S_IFMT)) + if (S_ISLNK(inode->i_mode)) kfree(mdata); return ret; } @@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, fill in the raw_inode while you're at it. */ -struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri) { struct inode *inode; struct super_block *sb = dir_i->i_sb; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 526979c..6c1755c 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode, int flags); -struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, +struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); int jffs2_remount_fs (struct super_block *, int *, char *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index b3a32ca..45559dc 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -127,16 +127,14 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) return PTR_ERR(acl); if (acl) { - mode_t mode = inode->i_mode; if (S_ISDIR(inode->i_mode)) { rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); if (rc) goto cleanup; } - rc = posix_acl_create(&acl, GFP_KERNEL, &mode); + rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); if (rc < 0) goto cleanup; /* posix_acl_release(NULL) is no-op */ - inode->i_mode = mode; if (rc > 0) rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); cleanup: diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 4496872..9cbd11a 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -3161,7 +3161,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, { int rc; int dbitno, word, rembits, nb, nwords, wbitno, agno; - s8 oldroot, *leaf; + s8 oldroot; struct dmaptree *tp = (struct dmaptree *) & dp->tree; /* save the current value of the root (i.e. maximum free string) @@ -3169,9 +3169,6 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, */ oldroot = tp->stree[ROOT]; - /* pick up a pointer to the leaves of the dmap tree */ - leaf = tp->stree + LEAFIND; - /* determine the bit number and word within the dmap of the * starting block. */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index f6cc0c0..af96060 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1143,7 +1143,6 @@ int txCommit(tid_t tid, /* transaction identifier */ struct jfs_log *log; struct tblock *tblk; struct lrd *lrd; - int lsn; struct inode *ip; struct jfs_inode_info *jfs_ip; int k, n; @@ -1310,7 +1309,7 @@ int txCommit(tid_t tid, /* transaction identifier */ */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; - lsn = lmLog(log, tblk, lrd, NULL); + lmLog(log, tblk, lrd, NULL); lmGroupCommit(log, tblk); @@ -2935,7 +2934,6 @@ int jfs_sync(void *arg) { struct inode *ip; struct jfs_inode_info *jfs_ip; - int rc; tid_t tid; do { @@ -2961,7 +2959,7 @@ int jfs_sync(void *arg) */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE); - rc = txCommit(tid, 1, &ip, 0); + txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index adcf92d..7971f37 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb) /* * Wait for outstanding transactions to be written to log: */ - jfs_flush_journal(log, 1); + jfs_flush_journal(log, 2); /* * close fileset inode allocation map (aka fileset inode) @@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb) * * remove file system from log active file system list. */ - jfs_flush_journal(log, 1); + jfs_flush_journal(log, 2); /* * Make sure all metadata makes it to disk diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 29b1f1a..e17545e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -893,7 +893,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; - s64 extent = 0, xaddr; + s64 xaddr; struct metapage *mp; struct super_block *sb; struct tblock *tblk; @@ -993,7 +993,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, txAbort(tid, 0); goto out3; } - extent = xaddr; ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24838f1..e87fede 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return rc; } if (acl) { - mode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); + rc = posix_acl_equiv_mode(acl, &inode->i_mode); posix_acl_release(acl); if (rc < 0) { printk(KERN_ERR @@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name, rc); return rc; } - inode->i_mode = mode; mark_inode_dirty(inode); } /* diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index e374050..8392cb8 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -302,7 +302,8 @@ nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) /* We appear to be out of the grace period */ wake_up_all(&host->h_gracewait); } - dprintk("lockd: server returns status %d\n", resp->status); + dprintk("lockd: server returns status %d\n", + ntohl(resp->status)); return 0; /* Okay, call complete */ } @@ -690,7 +691,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) goto out; if (resp->status != nlm_lck_denied_nolocks) - printk("lockd: unexpected unlock status: %d\n", resp->status); + printk("lockd: unexpected unlock status: %d\n", + ntohl(resp->status)); /* What to do now? I'm out of my depth... */ status = -ENOLCK; out: @@ -843,6 +845,7 @@ nlm_stat_to_errno(__be32 status) return -ENOLCK; #endif } - printk(KERN_NOTICE "lockd: unexpected server status %d\n", status); + printk(KERN_NOTICE "lockd: unexpected server status %d\n", + ntohl(status)); return -ENOLCK; } @@ -179,19 +179,14 @@ static int check_acl(struct inode *inode, int mask) #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; - /* - * Under RCU walk, we cannot even do a "get_cached_acl()", - * because that involves locking and getting a refcount on - * a cached ACL. - * - * So the only case we handle during RCU walking is the - * case of a cached "no ACL at all", which needs no locks - * or refcounts. - */ if (mask & MAY_NOT_BLOCK) { - if (negative_cached_acl(inode, ACL_TYPE_ACCESS)) + acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); + if (!acl) return -EAGAIN; - return -ECHILD; + /* no ->get_acl() calls in RCU mode... */ + if (acl == ACL_NOT_CACHED) + return -ECHILD; + return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } acl = get_cached_acl(inode, ACL_TYPE_ACCESS); @@ -313,6 +308,26 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +/* + * We _really_ want to just do "generic_permission()" without + * even looking at the inode->i_op values. So we keep a cache + * flag in inode->i_opflags, that says "this has not special + * permission function, use the fast case". + */ +static inline int do_inode_permission(struct inode *inode, int mask) +{ + if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { + if (likely(inode->i_op->permission)) + return inode->i_op->permission(inode, mask); + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_FASTPERM; + spin_unlock(&inode->i_lock); + } + return generic_permission(inode, mask); +} + /** * inode_permission - check for access rights to a given inode * @inode: inode to check permission on @@ -327,7 +342,7 @@ int inode_permission(struct inode *inode, int mask) { int retval; - if (mask & MAY_WRITE) { + if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* @@ -344,11 +359,7 @@ int inode_permission(struct inode *inode, int mask) return -EACCES; } - if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask); - else - retval = generic_permission(inode, mask); - + retval = do_inode_permission(inode, mask); if (retval) return retval; @@ -716,17 +727,20 @@ static int follow_automount(struct path *path, unsigned flags, if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) return -EISDIR; /* we actually want to stop here */ - /* We want to mount if someone is trying to open/create a file of any - * type under the mountpoint, wants to traverse through the mountpoint - * or wants to open the mounted directory. + /* We don't want to mount if someone's just doing a stat - + * unless they're stat'ing a directory and appended a '/' to + * the name. * - * We don't want to mount if someone's just doing a stat and they've - * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and - * appended a '/' to the name. + * We do, however, want to mount if someone wants to open or + * create a file of any type under the mountpoint, wants to + * traverse through the mountpoint or wants to open the + * mounted directory. Also, autofs may mark negative dentries + * as being automount points. These will need the attentions + * of the daemon to instantiate them before they can be used. */ - if (!(flags & LOOKUP_FOLLOW) && - !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE))) + if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE)) && + path->dentry->d_inode) return -EISDIR; current->total_link_count++; @@ -1244,6 +1258,26 @@ static void terminate_walk(struct nameidata *nd) } } +/* + * Do we need to follow links? We _really_ want to be able + * to do this check without having to look at inode->i_op, + * so we keep a cache of "no, this doesn't need follow_link" + * for the common case. + */ +static inline int should_follow_link(struct inode *inode, int follow) +{ + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (likely(inode->i_op->follow_link)) + return follow; + + /* This gets set once for the inode lifetime */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_NOFOLLOW; + spin_unlock(&inode->i_lock); + } + return 0; +} + static inline int walk_component(struct nameidata *nd, struct path *path, struct qstr *name, int type, int follow) { @@ -1266,7 +1300,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, terminate_walk(nd); return -ENOENT; } - if (unlikely(inode->i_op->follow_link) && follow) { + if (should_follow_link(inode, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { terminate_walk(nd); @@ -1319,6 +1353,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) } /* + * We really don't want to look at inode->i_op->lookup + * when we don't have to. So we keep a cache bit in + * the inode ->i_opflags field that says "yes, we can + * do lookup on this inode". + */ +static inline int can_lookup(struct inode *inode) +{ + if (likely(inode->i_opflags & IOP_LOOKUP)) + return 1; + if (likely(!inode->i_op->lookup)) + return 0; + + /* We do this once for the lifetime of the inode */ + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_LOOKUP; + spin_unlock(&inode->i_lock); + return 1; +} + +/* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. @@ -1397,10 +1451,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } + if (can_lookup(nd->inode)) + continue; err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - continue; + break; /* here ends the main loop */ last_component: @@ -2562,6 +2616,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; + dget(dentry); mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; @@ -2582,6 +2637,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) out: mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); if (!error) d_delete(dentry); return error; @@ -2971,6 +3027,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; + dget(new_dentry); if (target) mutex_lock(&target->i_mutex); @@ -2991,6 +3048,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, out: if (target) mutex_unlock(&target->i_mutex); + dput(new_dentry); if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 8151554..dbcd821 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -77,6 +77,7 @@ config NFS_V4 config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_FS && NFS_V4 && EXPERIMENTAL + select SUNRPC_BACKCHANNEL select PNFS_FILE_LAYOUT help This option enables support for minor version 1 of the NFSv4 protocol @@ -87,15 +88,15 @@ config NFS_V4_1 config PNFS_FILE_LAYOUT tristate +config PNFS_BLOCK + tristate + depends on NFS_FS && NFS_V4_1 && BLK_DEV_DM + default m + config PNFS_OBJLAYOUT - tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + tristate depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD - help - Say M here if you want your pNFS client to support the Objects Layout Driver. - Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and - upper level driver (SCSI_OSD_ULD). - - If unsure, say N. + default m config ROOT_NFS bool "Root file system on NFS" diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6a34f7d..b58613d 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile new file mode 100644 index 0000000..d581550 --- /dev/null +++ b/fs/nfs/blocklayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c new file mode 100644 index 0000000..9561c8f --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.c @@ -0,0 +1,1020 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/bio.h> /* struct bio */ +#include <linux/buffer_head.h> /* various write calls */ +#include <linux/prefetch.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); +MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); + +struct dentry *bl_device_pipe; +wait_queue_head_t bl_wq; + +static void print_page(struct page *page) +{ + dprintk("PRINTPAGE page %p\n", page); + dprintk(" PagePrivate %d\n", PagePrivate(page)); + dprintk(" PageUptodate %d\n", PageUptodate(page)); + dprintk(" PageError %d\n", PageError(page)); + dprintk(" PageDirty %d\n", PageDirty(page)); + dprintk(" PageReferenced %d\n", PageReferenced(page)); + dprintk(" PageLocked %d\n", PageLocked(page)); + dprintk(" PageWriteback %d\n", PageWriteback(page)); + dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); + dprintk("\n"); +} + +/* Given the be associated with isect, determine if page data needs to be + * initialized. + */ +static int is_hole(struct pnfs_block_extent *be, sector_t isect) +{ + if (be->be_state == PNFS_BLOCK_NONE_DATA) + return 1; + else if (be->be_state != PNFS_BLOCK_INVALID_DATA) + return 0; + else + return !bl_is_sector_init(be->be_inval, isect); +} + +/* Given the be associated with isect, determine if page data can be + * written to disk. + */ +static int is_writable(struct pnfs_block_extent *be, sector_t isect) +{ + return (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA); +} + +/* The data we are handed might be spread across several bios. We need + * to track when the last one is finished. + */ +struct parallel_io { + struct kref refcnt; + struct rpc_call_ops call_ops; + void (*pnfs_callback) (void *data); + void *data; +}; + +static inline struct parallel_io *alloc_parallel(void *data) +{ + struct parallel_io *rv; + + rv = kmalloc(sizeof(*rv), GFP_NOFS); + if (rv) { + rv->data = data; + kref_init(&rv->refcnt); + } + return rv; +} + +static inline void get_parallel(struct parallel_io *p) +{ + kref_get(&p->refcnt); +} + +static void destroy_parallel(struct kref *kref) +{ + struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); + + dprintk("%s enter\n", __func__); + p->pnfs_callback(p->data); + kfree(p); +} + +static inline void put_parallel(struct parallel_io *p) +{ + kref_put(&p->refcnt, destroy_parallel); +} + +static struct bio * +bl_submit_bio(int rw, struct bio *bio) +{ + if (bio) { + get_parallel(bio->bi_private); + dprintk("%s submitting %s bio %u@%llu\n", __func__, + rw == READ ? "read" : "write", + bio->bi_size, (unsigned long long)bio->bi_sector); + submit_bio(rw, bio); + } + return NULL; +} + +static struct bio *bl_alloc_init_bio(int npg, sector_t isect, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, npg); + if (!bio) + return NULL; + + bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = end_io; + bio->bi_private = par; + return bio; +} + +static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, + sector_t isect, struct page *page, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par) +{ +retry: + if (!bio) { + bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + if (!bio) + return ERR_PTR(-ENOMEM); + } + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio = bl_submit_bio(rw, bio); + goto retry; + } + return bio; +} + +static void bl_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_read(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_read_data *rdata = (struct nfs_read_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (uptodate) + SetPageUptodate(page); + } while (bvec >= bio->bi_io_vec); + if (!uptodate) { + if (!rdata->pnfs_error) + rdata->pnfs_error = -EIO; + bl_set_lo_fail(rdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +static void bl_read_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + pnfs_ld_read_done(rdata); +} + +static void +bl_end_par_io_read(void *data) +{ + struct nfs_read_data *rdata = data; + + INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); + schedule_work(&rdata->task.u.tk_work); +} + +/* We don't want normal .rpc_call_done callback used, so we replace it + * with this stub. + */ +static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) +{ + return; +} + +static enum pnfs_try_status +bl_read_pagelist(struct nfs_read_data *rdata) +{ + int i, hole; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, extent_length = 0; + struct parallel_io *par; + loff_t f_offset = rdata->args.offset; + size_t count = rdata->args.count; + struct page **pages = rdata->args.pages; + int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; + + dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, + rdata->npages, f_offset, count); + + par = alloc_parallel(rdata); + if (!par) + goto use_mds; + par->call_ops = *rdata->mds_ops; + par->call_ops.rpc_call_done = bl_rpc_do_nothing; + par->pnfs_callback = bl_end_par_io_read; + /* At this point, we can no longer jump to use_mds */ + + isect = (sector_t) (f_offset >> SECTOR_SHIFT); + /* Code assumes extents are page-aligned */ + for (i = pg_index; i < rdata->npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bl_put_extent(cow_read); + bio = bl_submit_bio(READ, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg), + isect, &cow_read); + if (!be) { + rdata->pnfs_error = -EIO; + goto out; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + if (cow_read) { + sector_t cow_length = cow_read->be_length - + (isect - cow_read->be_f_offset); + extent_length = min(extent_length, cow_length); + } + } + hole = is_hole(be, isect); + if (hole && !cow_read) { + bio = bl_submit_bio(READ, bio); + /* Fill hole w/ zeroes w/o accessing device */ + dprintk("%s Zeroing page for hole\n", __func__); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + print_page(pages[i]); + SetPageUptodate(pages[i]); + } else { + struct pnfs_block_extent *be_read; + + be_read = (hole && cow_read) ? cow_read : be; + bio = bl_add_page_to_bio(bio, rdata->npages - i, READ, + isect, pages[i], be_read, + bl_end_io_read, par); + if (IS_ERR(bio)) { + rdata->pnfs_error = PTR_ERR(bio); + goto out; + } + } + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) { + rdata->res.eof = 1; + rdata->res.count = rdata->inode->i_size - f_offset; + } else { + rdata->res.count = (isect << SECTOR_SHIFT) - f_offset; + } +out: + bl_put_extent(be); + bl_put_extent(cow_read); + bl_submit_bio(READ, bio); + put_parallel(par); + return PNFS_ATTEMPTED; + + use_mds: + dprintk("Giving up and using normal NFS\n"); + return PNFS_NOT_ATTEMPTED; +} + +static void mark_extents_written(struct pnfs_block_layout *bl, + __u64 offset, __u32 count) +{ + sector_t isect, end; + struct pnfs_block_extent *be; + + dprintk("%s(%llu, %u)\n", __func__, offset, count); + if (count == 0) + return; + isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; + end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); + end >>= SECTOR_SHIFT; + while (isect < end) { + sector_t len; + be = bl_find_get_extent(bl, isect, NULL); + BUG_ON(!be); /* FIXME */ + len = min(end, be->be_f_offset + be->be_length) - isect; + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + bl_mark_for_commit(be, isect, len); /* What if fails? */ + isect += len; + bl_put_extent(be); + } +} + +static void bl_end_io_write_zero(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + /* This is the zeroing page we added */ + end_page_writeback(page); + page_cache_release(page); + } while (bvec >= bio->bi_io_vec); + if (!uptodate) { + if (!wdata->pnfs_error) + wdata->pnfs_error = -EIO; + bl_set_lo_fail(wdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_write(struct bio *bio, int err) +{ + struct parallel_io *par = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nfs_write_data *wdata = (struct nfs_write_data *)par->data; + + if (!uptodate) { + if (!wdata->pnfs_error) + wdata->pnfs_error = -EIO; + bl_set_lo_fail(wdata->lseg); + } + bio_put(bio); + put_parallel(par); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. + */ +static void bl_write_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + if (!wdata->pnfs_error) { + /* Marks for LAYOUTCOMMIT */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + wdata->args.offset, wdata->args.count); + } + pnfs_ld_write_done(wdata); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void bl_end_par_io_write(void *data) +{ + struct nfs_write_data *wdata = data; + + wdata->task.tk_status = 0; + wdata->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); + schedule_work(&wdata->task.u.tk_work); +} + +/* FIXME STUB - mark intersection of layout and page as bad, so is not + * used again. + */ +static void mark_bad_read(void) +{ + return; +} + +/* + * map_block: map a requested I/0 block (isect) into an offset in the LVM + * block_device + */ +static void +map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) +{ + dprintk("%s enter be=%p\n", __func__, be); + + set_buffer_mapped(bh); + bh->b_bdev = be->be_mdev; + bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> + (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); + + dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", + __func__, (unsigned long long)isect, (long)bh->b_blocknr, + bh->b_size); + return; +} + +/* Given an unmapped page, zero it or read in page for COW, page is locked + * by caller. + */ +static int +init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) +{ + struct buffer_head *bh = NULL; + int ret = 0; + sector_t isect; + + dprintk("%s enter, %p\n", __func__, page); + BUG_ON(PageUptodate(page)); + if (!cow_read) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto cleanup; + } + + bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); + if (!bh) { + ret = -ENOMEM; + goto cleanup; + } + + isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; + map_block(bh, isect, cow_read); + if (!bh_uptodate_or_lock(bh)) + ret = bh_submit_read(bh); + if (ret) + goto cleanup; + SetPageUptodate(page); + +cleanup: + bl_put_extent(cow_read); + if (bh) + free_buffer_head(bh); + if (ret) { + /* Need to mark layout with bad read...should now + * just use nfs4 for reads and writes. + */ + mark_bad_read(); + } + return ret; +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_write_data *wdata, int sync) +{ + int i, ret, npg_zero, pg_index, last = 0; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL, *cow_read = NULL; + sector_t isect, last_isect = 0, extent_length = 0; + struct parallel_io *par; + loff_t offset = wdata->args.offset; + size_t count = wdata->args.count; + struct page **pages = wdata->args.pages; + struct page *page; + pgoff_t index; + u64 temp; + int npg_per_block = + NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + /* At this point, wdata->pages is a (sequential) list of nfs_pages. + * We want to write each, and if there is an error set pnfs_error + * to have it redone using nfs. + */ + par = alloc_parallel(wdata); + if (!par) + return PNFS_NOT_ATTEMPTED; + par->call_ops = *wdata->mds_ops; + par->call_ops.rpc_call_done = bl_rpc_do_nothing; + par->pnfs_callback = bl_end_par_io_write; + /* At this point, have to be more careful with error handling */ + + isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); + if (!be || !is_writable(be, isect)) { + dprintk("%s no matching extents!\n", __func__); + wdata->pnfs_error = -EINVAL; + goto out; + } + + /* First page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + temp = offset >> PAGE_CACHE_SHIFT; + npg_zero = do_div(temp, npg_per_block); + isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & + (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); + extent_length = be->be_length - (isect - be->be_f_offset); + +fill_invalid_ext: + dprintk("%s need to zero %d pages\n", __func__, npg_zero); + for (;npg_zero > 0; npg_zero--) { + /* page ref released in bl_end_io_write_zero */ + index = isect >> PAGE_CACHE_SECTOR_SHIFT; + dprintk("%s zero %dth page: index %lu isect %llu\n", + __func__, npg_zero, index, + (unsigned long long)isect); + page = + find_or_create_page(wdata->inode->i_mapping, index, + GFP_NOFS); + if (!page) { + dprintk("%s oom\n", __func__); + wdata->pnfs_error = -ENOMEM; + goto out; + } + + /* PageDirty: Other will write this out + * PageWriteback: Other is writing this out + * PageUptodate: It was read before + * sector_initialized: already written out + */ + if (PageDirty(page) || PageWriteback(page) || + bl_is_sector_init(be->be_inval, isect)) { + print_page(page); + unlock_page(page); + page_cache_release(page); + goto next_page; + } + if (!PageUptodate(page)) { + /* New page, readin or zero it */ + init_page_for_write(page, cow_read); + } + set_page_writeback(page); + unlock_page(page); + + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + end_page_writeback(page); + page_cache_release(page); + wdata->pnfs_error = ret; + goto out; + } + bio = bl_add_page_to_bio(bio, npg_zero, WRITE, + isect, page, be, + bl_end_io_write_zero, par); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + goto out; + } + /* FIXME: This should be done in bi_end_io */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE); +next_page: + isect += PAGE_CACHE_SECTORS; + extent_length -= PAGE_CACHE_SECTORS; + } + if (last) + goto write_done; + } + bio = bl_submit_bio(WRITE, bio); + + /* Middle pages */ + pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + for (i = pg_index; i < wdata->npages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + bl_put_extent(be); + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), + isect, NULL); + if (!be || !is_writable(be, isect)) { + wdata->pnfs_error = -EINVAL; + goto out; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + } + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + ret = bl_mark_sectors_init(be->be_inval, isect, + PAGE_CACHE_SECTORS, + NULL); + if (unlikely(ret)) { + dprintk("%s bl_mark_sectors_init fail %d\n", + __func__, ret); + wdata->pnfs_error = ret; + goto out; + } + } + bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE, + isect, pages[i], be, + bl_end_io_write, par); + if (IS_ERR(bio)) { + wdata->pnfs_error = PTR_ERR(bio); + goto out; + } + isect += PAGE_CACHE_SECTORS; + last_isect = isect; + extent_length -= PAGE_CACHE_SECTORS; + } + + /* Last page inside INVALID extent */ + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + bio = bl_submit_bio(WRITE, bio); + temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; + npg_zero = npg_per_block - do_div(temp, npg_per_block); + if (npg_zero < npg_per_block) { + last = 1; + goto fill_invalid_ext; + } + } + +write_done: + wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset); + if (count < wdata->res.count) { + wdata->res.count = count; + } +out: + bl_put_extent(be); + bl_submit_bio(WRITE, bio); + put_parallel(par); + return PNFS_ATTEMPTED; +} + +/* FIXME - range ignored */ +static void +release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) +{ + int i; + struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + while (!list_empty(&bl->bl_extents[i])) { + be = list_first_entry(&bl->bl_extents[i], + struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + } + spin_unlock(&bl->bl_ext_lock); +} + +static void +release_inval_marks(struct pnfs_inval_markings *marks) +{ + struct pnfs_inval_tracking *pos, *temp; + + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { + list_del(&pos->it_link); + kfree(pos); + } + return; +} + +static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + + dprintk("%s enter\n", __func__); + release_extents(bl, NULL); + release_inval_marks(&bl->bl_inval); + kfree(bl); +} + +static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, + gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl; + + dprintk("%s enter\n", __func__); + bl = kzalloc(sizeof(*bl), gfp_flags); + if (!bl) + return NULL; + spin_lock_init(&bl->bl_ext_lock); + INIT_LIST_HEAD(&bl->bl_extents[0]); + INIT_LIST_HEAD(&bl->bl_extents[1]); + INIT_LIST_HEAD(&bl->bl_commit); + INIT_LIST_HEAD(&bl->bl_committing); + bl->bl_count = 0; + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; + BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; +} + +static void bl_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s enter\n", __func__); + kfree(lseg); +} + +/* We pretty much ignore lseg, and store all data layout wide, so we + * can correctly merge. + */ +static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *lseg; + int status; + + dprintk("%s enter\n", __func__); + lseg = kzalloc(sizeof(*lseg), gfp_flags); + if (!lseg) + return ERR_PTR(-ENOMEM); + status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); + if (status) { + /* We don't want to call the full-blown bl_free_lseg, + * since on error extents were not touched. + */ + kfree(lseg); + return ERR_PTR(status); + } + return lseg; +} + +static void +bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + dprintk("%s enter\n", __func__); + encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +{ + struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + + dprintk("%s enter\n", __func__); + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); +} + +static void free_blk_mountid(struct block_mount_id *mid) +{ + if (mid) { + struct pnfs_block_dev *dev; + spin_lock(&mid->bm_lock); + while (!list_empty(&mid->bm_devlist)) { + dev = list_first_entry(&mid->bm_devlist, + struct pnfs_block_dev, + bm_node); + list_del(&dev->bm_node); + bl_free_block_dev(dev); + } + spin_unlock(&mid->bm_lock); + kfree(mid); + } +} + +/* This is mostly copied from the filelayout's get_device_info function. + * It seems much of this should be at the generic pnfs level. + */ +static struct pnfs_block_dev * +nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, + struct nfs4_deviceid *d_id) +{ + struct pnfs_device *dev; + struct pnfs_block_dev *rv = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + int i, rc; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s max_resp_sz %u max_pages %d\n", + __func__, max_resp_sz, max_pages); + + dev = kmalloc(sizeof(*dev), GFP_NOFS); + if (!dev) { + dprintk("%s kmalloc failed\n", __func__); + return NULL; + } + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + kfree(dev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_NOFS); + if (!pages[i]) + goto out_free; + } + + memcpy(&dev->dev_id, d_id, sizeof(*d_id)); + dev->layout_type = LAYOUT_BLOCK_VOLUME; + dev->pages = pages; + dev->pgbase = 0; + dev->pglen = PAGE_SIZE * max_pages; + dev->mincount = 0; + + dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); + rc = nfs4_proc_getdeviceinfo(server, dev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + rv = nfs4_blk_decode_device(server, dev); + out_free: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + kfree(dev); + return rv; +} + +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ + struct block_mount_id *b_mt_id = NULL; + struct pnfs_devicelist *dlist = NULL; + struct pnfs_block_dev *bdev; + LIST_HEAD(block_disklist); + int status = 0, i; + + dprintk("%s enter\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); + if (!b_mt_id) { + status = -ENOMEM; + goto out_error; + } + /* Initialize nfs4 block layout mount id */ + spin_lock_init(&b_mt_id->bm_lock); + INIT_LIST_HEAD(&b_mt_id->bm_devlist); + + dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); + if (!dlist) { + status = -ENOMEM; + goto out_error; + } + dlist->eof = 0; + while (!dlist->eof) { + status = nfs4_proc_getdevicelist(server, fh, dlist); + if (status) + goto out_error; + dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", + __func__, dlist->num_devs, dlist->eof); + for (i = 0; i < dlist->num_devs; i++) { + bdev = nfs4_blk_get_deviceinfo(server, fh, + &dlist->dev_id[i]); + if (!bdev) { + status = -ENODEV; + goto out_error; + } + spin_lock(&b_mt_id->bm_lock); + list_add(&bdev->bm_node, &b_mt_id->bm_devlist); + spin_unlock(&b_mt_id->bm_lock); + } + } + dprintk("%s SUCCESS\n", __func__); + server->pnfs_ld_data = b_mt_id; + + out_return: + kfree(dlist); + return status; + + out_error: + free_blk_mountid(b_mt_id); + goto out_return; +} + +static int +bl_clear_layoutdriver(struct nfs_server *server) +{ + struct block_mount_id *b_mt_id = server->pnfs_ld_data; + + dprintk("%s enter\n", __func__); + free_blk_mountid(b_mt_id); + dprintk("%s RETURNS\n", __func__); + return 0; +} + +static const struct nfs_pageio_ops bl_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops bl_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + +static struct pnfs_layoutdriver_type blocklayout_type = { + .id = LAYOUT_BLOCK_VOLUME, + .name = "LAYOUT_BLOCK_VOLUME", + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = bl_alloc_layout_hdr, + .free_layout_hdr = bl_free_layout_hdr, + .alloc_lseg = bl_alloc_lseg, + .free_lseg = bl_free_lseg, + .encode_layoutcommit = bl_encode_layoutcommit, + .cleanup_layoutcommit = bl_cleanup_layoutcommit, + .set_layoutdriver = bl_set_layoutdriver, + .clear_layoutdriver = bl_clear_layoutdriver, + .pg_read_ops = &bl_pg_read_ops, + .pg_write_ops = &bl_pg_write_ops, +}; + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = bl_pipe_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static int __init nfs4blocklayout_init(void) +{ + struct vfsmount *mnt; + struct path path; + int ret; + + dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); + + ret = pnfs_register_layoutdriver(&blocklayout_type); + if (ret) + goto out; + + init_waitqueue_head(&bl_wq); + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) { + ret = PTR_ERR(mnt); + goto out_remove; + } + + ret = vfs_path_lookup(mnt->mnt_root, + mnt, + NFS_PIPE_DIRNAME, 0, &path); + if (ret) + goto out_remove; + + bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL, + &bl_upcall_ops, 0); + if (IS_ERR(bl_device_pipe)) { + ret = PTR_ERR(bl_device_pipe); + goto out_remove; + } +out: + return ret; + +out_remove: + pnfs_unregister_layoutdriver(&blocklayout_type); + return ret; +} + +static void __exit nfs4blocklayout_exit(void) +{ + dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", + __func__); + + pnfs_unregister_layoutdriver(&blocklayout_type); + rpc_unlink(bl_device_pipe); +} + +MODULE_ALIAS("nfs-layouttype4-3"); + +module_init(nfs4blocklayout_init); +module_exit(nfs4blocklayout_exit); diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h new file mode 100644 index 0000000..f27d827 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayout.h @@ -0,0 +1,207 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include <linux/device-mapper.h> +#include <linux/nfs_fs.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "../pnfs.h" + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) + +struct block_mount_id { + spinlock_t bm_lock; /* protects list */ + struct list_head bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { + struct list_head bm_node; + struct nfs4_deviceid bm_mdevid; /* associated devid */ + struct block_device *bm_mdev; /* meta device itself */ +}; + +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { + sector_t mtt_step_size; /* Internal sector alignment */ + struct list_head mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { + spinlock_t im_lock; + struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ + sector_t im_block_size; /* Server blocksize in sectors */ +}; + +struct pnfs_inval_tracking { + struct list_head it_link; + int it_sector; + int it_tags; +}; + +/* sector_t fields are all in 512-byte sectors */ +struct pnfs_block_extent { + struct kref be_refcnt; + struct list_head be_node; /* link into lseg list */ + struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ + struct block_device *be_mdev; + sector_t be_f_offset; /* the starting offset in the file */ + sector_t be_length; /* the size of the extent */ + sector_t be_v_offset; /* the starting offset in the volume */ + enum exstate4 be_state; /* the state of this extent */ + struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +}; + +/* Shortened extent used by LAYOUTCOMMIT */ +struct pnfs_block_short_extent { + struct list_head bse_node; + struct nfs4_deviceid bse_devid; + struct block_device *bse_mdev; + sector_t bse_f_offset; /* the starting offset in the file */ + sector_t bse_length; /* the size of the extent */ +}; + +static inline void +BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) +{ + spin_lock_init(&marks->im_lock); + INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + marks->im_block_size = blocksize; + marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, + blocksize); +} + +enum extentclass4 { + RW_EXTENT = 0, /* READWRTE and INVAL */ + RO_EXTENT = 1, /* READ and NONE */ + EXTENT_LISTS = 2, +}; + +static inline int bl_choose_list(enum exstate4 state) +{ + if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) + return RO_EXTENT; + else + return RW_EXTENT; +} + +struct pnfs_block_layout { + struct pnfs_layout_hdr bl_layout; + struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + spinlock_t bl_ext_lock; /* Protects list manipulation */ + struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ + struct list_head bl_commit; /* Needs layout commit */ + struct list_head bl_committing; /* Layout committing */ + unsigned int bl_count; /* entries in bl_commit */ + sector_t bl_blocksize; /* Server blocksize in sectors */ +}; + +#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) + +static inline struct pnfs_block_layout * +BLK_LO2EXT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct pnfs_block_layout, bl_layout); +} + +static inline struct pnfs_block_layout * +BLK_LSEG2EXT(struct pnfs_layout_segment *lseg) +{ + return BLK_LO2EXT(lseg->pls_layout); +} + +struct bl_dev_msg { + int status; + uint32_t major, minor; +}; + +struct bl_msg_hdr { + u8 type; + u16 totallen; /* length of entire message, including hdr itself */ +}; + +extern struct dentry *bl_device_pipe; +extern wait_queue_head_t bl_wq; + +#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ +#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ +#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ +#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ +#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ + +/* blocklayoutdev.c */ +ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); +ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +void bl_pipe_destroy_msg(struct rpc_pipe_msg *); +struct block_device *nfs4_blkdev_get(dev_t dev); +int nfs4_blkdev_put(struct block_device *bdev); +struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev); +int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); + +/* blocklayoutdm.c */ +void bl_free_block_dev(struct pnfs_block_dev *bdev); + +/* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length, + sector_t **pages); +void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status); +int bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length); + +#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 0000000..a83b393 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -0,0 +1,410 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdev.c + * + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#include <linux/module.h> +#include <linux/buffer_head.h> /* __bread */ + +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static int decode_sector_number(__be32 **rp, sector_t *sp) +{ + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "%s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; +} + +/* Open a block_device by device number. */ +struct block_device *nfs4_blkdev_get(dev_t dev) +{ + struct block_device *bd; + + dprintk("%s enter\n", __func__); + bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(bd)) + goto fail; + return bd; +fail: + dprintk("%s failed to open device : %ld\n", + __func__, PTR_ERR(bd)); + return NULL; +} + +/* + * Release the block device + */ +int nfs4_blkdev_put(struct block_device *bdev) +{ + dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + return blkdev_put(bdev, FMODE_READ); +} + +/* + * Shouldn't there be a rpc_generic_upcall() to do this for us? + */ +ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + size_t mlen = min(msg->len - msg->copied, buflen); + unsigned long left; + + left = copy_to_user(dst, data, mlen); + if (left == mlen) { + msg->errno = -EFAULT; + return -EFAULT; + } + + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} + +static struct bl_dev_msg bl_mount_reply; + +ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&bl_wq); + + return mlen; +} + +void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + if (msg->errno >= 0) + return; + wake_up(&bl_wq); +} + +/* + * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. + */ +struct pnfs_block_dev * +nfs4_blk_decode_device(struct nfs_server *server, + struct pnfs_device *dev) +{ + struct pnfs_block_dev *rv = NULL; + struct block_device *bd = NULL; + struct rpc_pipe_msg msg; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_MOUNT, + .totallen = dev->mincount, + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + struct bl_dev_msg *reply = &bl_mount_reply; + int offset, len, i; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, + dev->mincount); + + memset(&msg, 0, sizeof(msg)); + msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); + if (!msg.data) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg.data; + len = dev->mincount; + offset = sizeof(bl_msg); + for (i = 0; len > 0; i++) { + memcpy(&dataptr[offset], page_address(dev->pages[i]), + len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + offset += PAGE_CACHE_SIZE; + } + msg.len = sizeof(bl_msg) + dev->mincount; + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&bl_wq, &wq); + if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + remove_wait_queue(&bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + dprintk("%s failed to open device: %d\n", + __func__, reply->status); + rv = ERR_PTR(-EINVAL); + goto out; + } + + bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); + if (IS_ERR(bd)) { + dprintk("%s failed to open device : %ld\n", + __func__, PTR_ERR(bd)); + goto out; + } + + rv = kzalloc(sizeof(*rv), GFP_NOFS); + if (!rv) { + rv = ERR_PTR(-ENOMEM); + goto out; + } + + rv->bm_mdev = bd; + memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); + dprintk("%s Created device %s with bd_block_size %u\n", + __func__, + bd->bd_disk->disk_name, + bd->bd_block_size); + +out: + kfree(msg.data); + return rv; +} + +/* Map deviceid returned by the server to constructed block_device */ +static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, + struct nfs4_deviceid *id) +{ + struct block_device *rv = NULL; + struct block_mount_id *mid; + struct pnfs_block_dev *dev; + + dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); + mid = BLK_ID(lo); + spin_lock(&mid->bm_lock); + list_for_each_entry(dev, &mid->bm_devlist, bm_node) { + if (memcmp(id->data, dev->bm_mdevid.data, + NFS4_DEVICEID4_SIZE) == 0) { + rv = dev->bm_mdev; + goto out; + } + } + out: + spin_unlock(&mid->bm_lock); + dprintk("%s returning %p\n", __func__, rv); + return rv; +} + +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; + +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. + */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; +} + +/* XDR decode pnfs_block_layout4 structure */ +int +nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, + struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int i, status = -EIO; + uint32_t count; + struct pnfs_block_extent *be = NULL, *save; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + __be32 *p; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + scratch = alloc_page(gfp_flags); + if (!scratch) + return -ENOMEM; + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err; + + count = be32_to_cpup(p++); + + dprintk("%s enter, number of extents %i\n", __func__, count); + p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); + if (unlikely(!p)) + goto out_err; + + /* Decode individual extents, putting them in temporary + * staging area until whole layout is decoded to make error + * recovery easier. + */ + for (i = 0; i < count; i++) { + be = bl_alloc_extent(); + if (!be) { + status = -ENOMEM; + goto out_err; + } + memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + be->be_mdev = translate_devid(lo, &be->be_devid); + if (!be->be_mdev) + goto out_err; + + /* The next three values are read in as bytes, + * but stored as 512-byte sector lengths + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_err; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_err; + be->be_state = be32_to_cpup(p++); + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + be->be_inval = &bl->bl_inval; + if (verify_extent(be, &lv)) { + dprintk("%s verify failed\n", __func__); + goto out_err; + } + list_add_tail(&be->be_node, &extents); + } + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + be = NULL; + goto out_err; + } + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + be = NULL; + goto out_err; + } + /* Extents decoded properly, now try to merge them in to + * existing layout extents. + */ + spin_lock(&bl->bl_ext_lock); + list_for_each_entry_safe(be, save, &extents, be_node) { + list_del(&be->be_node); + status = bl_add_merge_extent(bl, be); + if (status) { + spin_unlock(&bl->bl_ext_lock); + /* This is a fairly catastrophic error, as the + * entire layout extent lists are now corrupted. + * We should have some way to distinguish this. + */ + be = NULL; + goto out_err; + } + } + spin_unlock(&bl->bl_ext_lock); + status = 0; + out: + __free_page(scratch); + dprintk("%s returns %i\n", __func__, status); + return status; + + out_err: + bl_put_extent(be); + while (!list_empty(&extents)) { + be = list_first_entry(&extents, struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + bl_put_extent(be); + } + goto out; +} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c new file mode 100644 index 0000000..d055c75 --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -0,0 +1,111 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Fred Isaman <iisaman@umich.edu> + * Andy Adamson <andros@citi.umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/genhd.h> /* gendisk - used in a dprintk*/ +#include <linux/sched.h> +#include <linux/hash.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void dev_remove(dev_t dev) +{ + struct rpc_pipe_msg msg; + struct bl_dev_msg bl_umount_request; + struct bl_msg_hdr bl_msg = { + .type = BL_DEVICE_UMOUNT, + .totallen = sizeof(bl_umount_request), + }; + uint8_t *dataptr; + DECLARE_WAITQUEUE(wq, current); + + dprintk("Entering %s\n", __func__); + + memset(&msg, 0, sizeof(msg)); + msg.data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + if (!msg.data) + goto out; + + memset(&bl_umount_request, 0, sizeof(bl_umount_request)); + bl_umount_request.major = MAJOR(dev); + bl_umount_request.minor = MINOR(dev); + + memcpy(msg.data, &bl_msg, sizeof(bl_msg)); + dataptr = (uint8_t *) msg.data; + memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); + msg.len = sizeof(bl_msg) + bl_msg.totallen; + + add_wait_queue(&bl_wq, &wq); + if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { + remove_wait_queue(&bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bl_wq, &wq); + +out: + kfree(msg.data); +} + +/* + * Release meta device + */ +static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) +{ + int rv; + + dprintk("%s Releasing\n", __func__); + rv = nfs4_blkdev_put(bdev->bm_mdev); + if (rv) + printk(KERN_ERR "%s nfs4_blkdev_put returns %d\n", + __func__, rv); + + dev_remove(bdev->bm_mdev->bd_dev); +} + +void bl_free_block_dev(struct pnfs_block_dev *bdev) +{ + if (bdev) { + if (bdev->bm_mdev) { + dprintk("%s Removing DM device: %d:%d\n", + __func__, + MAJOR(bdev->bm_mdev->bd_dev), + MINOR(bdev->bm_mdev->bd_dev)); + nfs4_blk_metadev_release(bdev); + } + kfree(bdev); + } +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 0000000..19fa7b0 --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,935 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include "blocklayout.h" +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* Bit numbers */ +#define EXTENT_INITIALIZED 0 +#define EXTENT_WRITTEN 1 +#define EXTENT_IN_COMMIT 2 +#define INTERNAL_EXISTS MY_MAX_TAGS +#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) + +/* Returns largest t<=s s.t. t%base==0 */ +static inline sector_t normalize(sector_t s, int base) +{ + sector_t tmp = s; /* Since do_div modifies its argument */ + return s - do_div(tmp, base); +} + +static inline sector_t normalize_up(sector_t s, int base) +{ + return normalize(s + base - 1, base); +} + +/* Complete stub using list while determine API wanted */ + +/* Returns tags, or negative */ +static int32_t _find_entry(struct my_tree *tree, u64 s) +{ + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu) enter\n", __func__, s); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) + return pos->it_tags & INTERNAL_MASK; + else + break; + } + return -ENOENT; +} + +static inline +int _has_tag(struct my_tree *tree, u64 s, int32_t tag) +{ + int32_t tags; + + dprintk("%s(%llu, %i) enter\n", __func__, s, tag); + s = normalize(s, tree->mtt_step_size); + tags = _find_entry(tree, s); + if ((tags < 0) || !(tags & (1 << tag))) + return 0; + else + return 1; +} + +/* Creates entry with tag, or if entry already exists, unions tag to it. + * If storage is not NULL, newly created entry will use it. + * Returns number of entries added, or negative on error. + */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, + struct pnfs_inval_tracking *storage) +{ + int found = 0; + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) { + found = 1; + break; + } else + break; + } + if (found) { + pos->it_tags |= (1 << tag); + return 0; + } else { + struct pnfs_inval_tracking *new; + if (storage) + new = storage; + else { + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + } + new->it_sector = s; + new->it_tags = (1 << tag); + list_add(&new->it_link, &pos->it_link); + return 1; + } +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ + u64 i; + + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); + for (i = normalize(s, tree->mtt_step_size); i < s + length; + i += tree->mtt_step_size) + if (_add_entry(tree, i, tag, NULL)) + return -ENOMEM; + return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +{ + u64 start, end, s; + int count, i, used = 0, status = -ENOMEM; + struct pnfs_inval_tracking **storage; + + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); + start = normalize(offset, tree->mtt_step_size); + end = normalize_up(offset + length, tree->mtt_step_size); + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ + storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), + GFP_NOFS); + if (!storage[i]) + goto out_cleanup; + } + + /* Now need lock - HOW??? */ + + for (s = start; s < end; s += tree->mtt_step_size) + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + + /* Unlock - HOW??? */ + status = 0; + + out_cleanup: + for (i = used; i < count; i++) { + if (!storage[i]) + break; + kfree(storage[i]); + } + kfree(storage); + return status; +} + +static void set_needs_init(sector_t *array, sector_t offset) +{ + sector_t *p = array; + + dprintk("%s enter\n", __func__); + if (!p) + return; + while (*p < offset) + p++; + if (*p == offset) + return; + else if (*p == ~0) { + *p++ = offset; + *p = ~0; + return; + } else { + sector_t *save = p; + dprintk("%s Adding %llu\n", __func__, (u64)offset); + while (*p != ~0) + p++; + p++; + memmove(save + 1, save, (char *)p - (char *)save); + *save = offset; + return; + } +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ + int rv; + + spin_lock(&marks->im_lock); + rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); + spin_unlock(&marks->im_lock); + return rv; +} + +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ + struct pnfs_inval_tracking *pos; + u64 expect = 0; + + dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector >= end) + continue; + if (!expect) { + if ((pos->it_sector == end - tree->mtt_step_size) && + (pos->it_tags & (1 << tag))) { + expect = pos->it_sector - tree->mtt_step_size; + if (pos->it_sector < tree->mtt_step_size || expect < start) + return 1; + continue; + } else { + return 0; + } + } + if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) + return 0; + expect -= tree->mtt_step_size; + if (expect < start) + return 1; + } + return 0; +} + +static int is_range_written(struct pnfs_inval_markings *marks, + sector_t start, sector_t end) +{ + int rv; + + spin_lock(&marks->im_lock); + rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); + spin_unlock(&marks->im_lock); + return rv; +} + +/* Marks sectors in [offest, offset_length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). + * Notes where partial block is initialized, and helps prepare it for + * complete initialization later. + */ +/* Currently assumes offset is page-aligned */ +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length, + sector_t **pages) +{ + sector_t s, start, end; + sector_t *array = NULL; /* Pages to mark */ + + dprintk("%s(offset=%llu,len=%llu) enter\n", + __func__, (u64)offset, (u64)length); + s = max((sector_t) 3, + 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); + dprintk("%s set max=%llu\n", __func__, (u64)s); + if (pages) { + array = kmalloc(s * sizeof(sector_t), GFP_NOFS); + if (!array) + goto outerr; + array[0] = ~0; + } + + start = normalize(offset, marks->im_block_size); + end = normalize_up(offset + length, marks->im_block_size); + if (_preload_range(&marks->im_tree, start, end - start)) + goto outerr; + + spin_lock(&marks->im_lock); + + for (s = normalize_up(start, PAGE_CACHE_SECTORS); + s < offset; s += PAGE_CACHE_SECTORS) { + dprintk("%s pre-area pages\n", __func__); + /* Portion of used block is not initialized */ + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) + set_needs_init(array, s); + } + if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) + goto out_unlock; + for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); + s < end; s += PAGE_CACHE_SECTORS) { + dprintk("%s post-area pages\n", __func__); + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) + set_needs_init(array, s); + } + + spin_unlock(&marks->im_lock); + + if (pages) { + if (array[0] == ~0) { + kfree(array); + *pages = NULL; + } else + *pages = array; + } + return 0; + + out_unlock: + spin_unlock(&marks->im_lock); + outerr: + if (pages) { + kfree(array); + *pages = NULL; + } + return -ENOMEM; +} + +/* Marks sectors in [offest, offset+length) as having been written to disk. + * All lengths should be block aligned. + */ +static int mark_written_sectors(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + int status; + + dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, + (u64)offset, (u64)length); + spin_lock(&marks->im_lock); + status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); + spin_unlock(&marks->im_lock); + return status; +} + +static void print_short_extent(struct pnfs_block_short_extent *be) +{ + dprintk("PRINT SHORT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); + dprintk(" be_length %llu\n", (u64)be->bse_length); + } +} + +static void print_clist(struct list_head *list, unsigned int count) +{ + struct pnfs_block_short_extent *be; + unsigned int i = 0; + + ifdebug(FACILITY) { + printk(KERN_DEBUG "****************\n"); + printk(KERN_DEBUG "Extent list looks like:\n"); + list_for_each_entry(be, list, bse_node) { + i++; + print_short_extent(be); + } + if (i != count) + printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); + printk(KERN_DEBUG "****************\n"); + } +} + +/* Note: In theory, we should do more checking that devid's match between + * old and new, but if they don't, the lists are too corrupt to salvage anyway. + */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, + struct pnfs_block_short_extent *new) +{ + struct list_head *clist = &bl->bl_commit; + struct pnfs_block_short_extent *old, *save; + sector_t end = new->bse_f_offset + new->bse_length; + + dprintk("%s enter\n", __func__); + print_short_extent(new); + print_clist(clist, bl->bl_count); + bl->bl_count++; + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe(old, save, clist, bse_node) { + if (new->bse_f_offset < old->bse_f_offset) + break; + if (end <= old->bse_f_offset + old->bse_length) { + /* Range is already in list */ + bl->bl_count--; + kfree(new); + return; + } else if (new->bse_f_offset <= + old->bse_f_offset + old->bse_length) { + /* new overlaps or abuts existing be */ + if (new->bse_mdev == old->bse_mdev) { + /* extend new to fully replace old */ + new->bse_length += new->bse_f_offset - + old->bse_f_offset; + new->bse_f_offset = old->bse_f_offset; + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + } + /* Note that if we never hit the above break, old will not point to a + * valid extent. However, in that case &old->bse_node==list. + */ + list_add_tail(&new->bse_node, &old->bse_node); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. + */ + old = list_prepare_entry(new, clist, bse_node); + list_for_each_entry_safe_continue(old, save, clist, bse_node) { + if (end < old->bse_f_offset) + break; + /* new overlaps or abuts old */ + if (new->bse_mdev == old->bse_mdev) { + if (end < old->bse_f_offset + old->bse_length) { + /* extend new to fully cover old */ + end = old->bse_f_offset + old->bse_length; + new->bse_length = end - new->bse_f_offset; + } + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + dprintk("%s: after merging\n", __func__); + print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length) +{ + sector_t new_end, end = offset + length; + struct pnfs_block_short_extent *new; + struct pnfs_block_layout *bl = container_of(be->be_inval, + struct pnfs_block_layout, + bl_inval); + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + + mark_written_sectors(be->be_inval, offset, length); + /* We want to add the range to commit list, but it must be + * block-normalized, and verified that the normalized range has + * been entirely written to disk. + */ + new->bse_f_offset = offset; + offset = normalize(offset, bl->bl_blocksize); + if (offset < new->bse_f_offset) { + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) + new->bse_f_offset = offset; + else + new->bse_f_offset = offset + bl->bl_blocksize; + } + new_end = normalize_up(end, bl->bl_blocksize); + if (end < new_end) { + if (is_range_written(be->be_inval, end, new_end)) + end = new_end; + else + end = new_end - bl->bl_blocksize; + } + if (end <= new->bse_f_offset) { + kfree(new); + return 0; + } + new->bse_length = end - new->bse_f_offset; + new->bse_devid = be->be_devid; + new->bse_mdev = be->be_mdev; + + spin_lock(&bl->bl_ext_lock); + /* new will be freed, either by add_to_commitlist if it decides not + * to use it, or after LAYOUTCOMMIT uses it in the commitlist. + */ + add_to_commitlist(bl, new); + spin_unlock(&bl->bl_ext_lock); + return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ + dprintk("PRINT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); + dprintk(" be_length %llu\n", (u64)be->be_length); + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); + dprintk(" be_state %d\n", be->be_state); + } +} + +static void +destroy_extent(struct kref *kref) +{ + struct pnfs_block_extent *be; + + be = container_of(kref, struct pnfs_block_extent, be_refcnt); + dprintk("%s be=%p\n", __func__, be); + kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ + if (be) { + dprintk("%s enter %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_put(&be->be_refcnt, destroy_extent); + } +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ + struct pnfs_block_extent *be; + + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); + if (!be) + return NULL; + INIT_LIST_HEAD(&be->be_node); + kref_init(&be->be_refcnt); + be->be_inval = NULL; + return be; +} + +static void print_elist(struct list_head *list) +{ + struct pnfs_block_extent *be; + dprintk("****************\n"); + dprintk("Extent list looks like:\n"); + list_for_each_entry(be, list, be_node) { + print_bl_extent(be); + } + dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ + /* Note this assumes new->be_f_offset >= old->be_f_offset */ + return (new->be_state == old->be_state) && + ((new->be_state == PNFS_BLOCK_NONE_DATA) || + ((new->be_v_offset - old->be_v_offset == + new->be_f_offset - old->be_f_offset) && + new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set. If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be, *tmp; + sector_t end = new->be_f_offset + new->be_length; + struct list_head *list; + + dprintk("%s enter with be=%p\n", __func__, new); + print_bl_extent(new); + list = &bl->bl_extents[bl_choose_list(new->be_state)]; + print_elist(list); + + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { + if (new->be_f_offset >= be->be_f_offset + be->be_length) + break; + if (new->be_f_offset >= be->be_f_offset) { + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be*/ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + bl_put_extent(new); + return 0; + } else { + goto out_err; + } + } else { + /* |<-- be -->| + * |<-- new -->| */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } else if (end >= be->be_f_offset + be->be_length) { + /* new extent overlap existing be */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } else if (end > be->be_f_offset) { + /* |<-- be -->| + *|<-- new -->| */ + if (extents_consistent(new, be)) { + /* extend new to fully replace be */ + new->be_length += be->be_f_offset + be->be_length - + new->be_f_offset - new->be_length; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } + /* Note that if we never hit the above break, be will not point to a + * valid extent. However, in that case &be->be_node==list. + */ + list_add(&new->be_node, &be->be_node); + dprintk("%s: inserting new\n", __func__); + print_elist(list); + /* FIXME - The per-list consistency checks have all been done, + * should now check cross-list consistency. + */ + return 0; + + out_err: + bl_put_extent(new); + return -EIO; +} + +/* Returns extent, or NULL. If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID. Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extents that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read) +{ + struct pnfs_block_extent *be, *cow, *ret; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + cow = ret = NULL; + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + if (!ret) + ret = be; + else if (be->be_state != PNFS_BLOCK_READ_DATA) + bl_put_extent(be); + else + cow = be; + break; + } + } + if (ret && + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) + break; + } + spin_unlock(&bl->bl_ext_lock); + if (cow_read) + *cow_read = cow; + print_bl_extent(ret); + return ret; +} + +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ + struct pnfs_block_extent *be, *ret = NULL; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + ret = be; + break; + } + } + } + print_bl_extent(ret); + return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_short_extent *lce, *save; + unsigned int count = 0; + __be32 *p, *xdr_start; + + dprintk("%s enter\n", __func__); + /* BUG - creation of bl_commit is buggy - need to wait for + * entire block to be marked WRITTEN before it can be added. + */ + spin_lock(&bl->bl_ext_lock); + /* Want to adjust for possible truncate */ + /* We now want to adjust argument range */ + + /* XDR encode the ranges found */ + xdr_start = xdr_reserve_space(xdr, 8); + if (!xdr_start) + goto out; + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); + if (!p) + break; + p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + list_del(&lce->bse_node); + list_add_tail(&lce->bse_node, &bl->bl_committing); + bl->bl_count--; + count++; + } + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); + xdr_start[1] = cpu_to_be32(count); +out: + spin_unlock(&bl->bl_ext_lock); + dprintk("%s found %i ranges\n", __func__, count); + return 0; +} + +/* Helper function to set_to_rw that initialize a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, + struct pnfs_block_extent *orig, + sector_t offset, sector_t length, int state) +{ + kref_init(&new->be_refcnt); + /* don't need to INIT_LIST_HEAD(&new->be_node) */ + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); + new->be_mdev = orig->be_mdev; + new->be_f_offset = offset; + new->be_length = length; + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; + new->be_state = state; + new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, + struct pnfs_block_extent *storage) +{ + struct pnfs_block_extent *prev; + + if (!storage) + goto no_merge; + if (&be->be_node == head || be->be_node.prev == head) + goto no_merge; + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || + !extents_consistent(prev, be)) + goto no_merge; + _prep_new_extent(storage, prev, prev->be_f_offset, + prev->be_length + be->be_length, prev->be_state); + list_replace(&prev->be_node, &storage->be_node); + bl_put_extent(prev); + list_del(&be->be_node); + bl_put_extent(be); + return storage; + + no_merge: + kfree(storage); + return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ + u64 rv = offset + length; + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; + struct pnfs_block_extent *children[3]; + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; + int i = 0, j; + + dprintk("%s(%llu, %llu)\n", __func__, offset, length); + /* Create storage for up to three new extents e1, e2, e3 */ + e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); + e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); + e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); + /* BUG - we are ignoring any failure */ + if (!e1 || !e2 || !e3) + goto out_nosplit; + + spin_lock(&bl->bl_ext_lock); + be = bl_find_get_extent_locked(bl, offset); + rv = be->be_f_offset + be->be_length; + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { + spin_unlock(&bl->bl_ext_lock); + goto out_nosplit; + } + /* Add e* to children, bumping e*'s krefs */ + if (be->be_f_offset != offset) { + _prep_new_extent(e1, be, be->be_f_offset, + offset - be->be_f_offset, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e1; + print_bl_extent(e1); + } else + merge1 = e1; + _prep_new_extent(e2, be, offset, + min(length, be->be_f_offset + be->be_length - offset), + PNFS_BLOCK_READWRITE_DATA); + children[i++] = e2; + print_bl_extent(e2); + if (offset + length < be->be_f_offset + be->be_length) { + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, + be->be_f_offset + be->be_length - + offset - length, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e3; + print_bl_extent(e3); + } else + merge2 = e3; + + /* Remove be from list, and insert the e* */ + /* We don't get refs on e*, since this list is the base reference + * set when init'ed. + */ + if (i < 3) + children[i] = NULL; + new = children[0]; + list_replace(&be->be_node, &new->be_node); + bl_put_extent(be); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); + for (j = 1; j < i; j++) { + old = new; + new = children[j]; + list_add(&new->be_node, &old->be_node); + } + if (merge2) { + /* This is a HACK, should just create a _back_merge function */ + new = list_entry(new->be_node.next, + struct pnfs_block_extent, be_node); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); + } + spin_unlock(&bl->bl_ext_lock); + + /* Since we removed the base reference above, be is now scheduled for + * destruction. + */ + bl_put_extent(be); + dprintk("%s returns %llu after split\n", __func__, rv); + return rv; + + out_nosplit: + kfree(e1); + kfree(e2); + kfree(e3); + dprintk("%s returns %llu without splitting\n", __func__, rv); + return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status) +{ + struct pnfs_block_short_extent *lce, *save; + + dprintk("%s status %d\n", __func__, status); + list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { + if (likely(!status)) { + u64 offset = lce->bse_f_offset; + u64 end = offset + lce->bse_length; + + do { + offset = set_to_rw(bl, offset, end - offset); + } while (offset < end); + list_del(&lce->bse_node); + + kfree(lce); + } else { + list_del(&lce->bse_node); + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, lce); + spin_unlock(&bl->bl_ext_lock); + } + } +} diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 76f856e..7cf6caf 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -6,7 +6,7 @@ #include <linux/completion.h> #include <linux/sunrpc/cache.h> -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Deferred request handling diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b257383..07df5f1 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,6 +38,7 @@ enum nfs4_callback_opnum { struct cb_process_state { __be32 drc_status; struct nfs_client *clp; + int slotid; }; struct cb_compound_hdr_arg { @@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall( void *dummy, struct cb_process_state *cps); extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); -extern void nfs4_cb_take_slot(struct nfs_client *clp); struct cb_devicenotifyitem { uint32_t cbd_notify_type; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index d4d1954..43926ad 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -111,6 +111,7 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; struct inode *ino; bool found = false; @@ -118,21 +119,28 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); spin_lock(&clp->cl_lock); - list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { - if (nfs_compare_fh(&args->cbl_fh, - &NFS_I(lo->plh_inode)->fh)) - continue; - ino = igrab(lo->plh_inode); - if (!ino) - continue; - found = true; - /* Without this, layout can be freed as soon - * as we release cl_lock. - */ - get_layout_hdr(lo); - break; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (nfs_compare_fh(&args->cbl_fh, + &NFS_I(lo->plh_inode)->fh)) + continue; + ino = igrab(lo->plh_inode); + if (!ino) + continue; + found = true; + /* Without this, layout can be freed as soon + * as we release cl_lock. + */ + get_layout_hdr(lo); + break; + } + if (found) + break; } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); + if (!found) return NFS4ERR_NOMATCHING_LAYOUT; @@ -154,6 +162,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; struct inode *ino; u32 rv = NFS4ERR_NOMATCHING_LAYOUT; @@ -167,18 +176,24 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, }; spin_lock(&clp->cl_lock); - list_for_each_entry(lo, &clp->cl_layouts, plh_layouts) { + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&NFS_SERVER(lo->plh_inode)->fsid, - &args->cbl_fsid, sizeof(struct nfs_fsid))) - continue; - if (!igrab(lo->plh_inode)) + memcmp(&server->fsid, &args->cbl_fsid, + sizeof(struct nfs_fsid))) continue; - get_layout_hdr(lo); - BUG_ON(!list_empty(&lo->plh_bulk_recall)); - list_add(&lo->plh_bulk_recall, &recall_list); + + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!igrab(lo->plh_inode)) + continue; + get_layout_hdr(lo); + BUG_ON(!list_empty(&lo->plh_bulk_recall)); + list_add(&lo->plh_bulk_recall, &recall_list); + } } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); + list_for_each_entry_safe(lo, tmp, &recall_list, plh_bulk_recall) { ino = lo->plh_inode; @@ -333,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Normal */ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { slot->seq_nr++; - return htonl(NFS4_OK); + goto out_ok; } /* Replay */ @@ -352,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Wraparound */ if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { slot->seq_nr = 1; - return htonl(NFS4_OK); + goto out_ok; } /* Misordered request */ return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); } /* @@ -418,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res, struct cb_process_state *cps) { + struct nfs4_slot_table *tbl; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); - cps->clp = NULL; - clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); if (clp == NULL) goto out; + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { - status = NFS4ERR_DELAY; + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. + */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); goto out; } status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); if (status) goto out; + cps->slotid = args->csa_slotid; + /* * Check for pending referring calls. If a match is found, a * related callback was received before the response to the original @@ -454,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - nfs4_cb_take_slot(clp); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c6c86a7..918ad64 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * Let the state manager know callback processing done. * A single slot, so highest used slotid is either 0 or -1 */ - tbl->highest_used_slotid--; + tbl->highest_used_slotid = -1; nfs4_check_drain_bc_complete(session); spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { - if (clp && clp->cl_session) - nfs4_callback_free_slot(clp->cl_session); -} - -/* A single slot, so highest used slotid is either 0 or -1 */ -void nfs4_cb_take_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; - - spin_lock(&tbl->slot_tbl_lock); - tbl->highest_used_slotid++; - BUG_ON(tbl->highest_used_slotid != 0); - spin_unlock(&tbl->slot_tbl_lock); + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); } #else /* CONFIG_NFS_V4_1 */ @@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { } #endif /* CONFIG_NFS_V4_1 */ @@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_process_state cps = { .drc_status = 0, .clp = NULL, + .slotid = -1, }; unsigned int nops = 0; @@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); - nfs4_cb_free_slot(cps.clp); + nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b3dc2b8..5833fbb 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -105,7 +105,7 @@ struct rpc_program nfs_program = { .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .stats = &nfs_rpcstat, - .pipe_dir_name = "/nfs", + .pipe_dir_name = NFS_PIPE_DIRNAME, }; struct rpc_stat nfs_rpcstat = { @@ -188,9 +188,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; -#if defined(CONFIG_NFS_V4_1) - INIT_LIST_HEAD(&clp->cl_layouts); -#endif nfs_fscache_get_client_cookie(clp); return clp; @@ -293,6 +290,7 @@ static void nfs_free_client(struct nfs_client *clp) nfs4_deviceid_purge_client(clp); kfree(clp->cl_hostname); + kfree(clp->server_scope); kfree(clp); dprintk("<-- nfs_free_client()\n"); @@ -906,7 +904,9 @@ error: /* * Load up the server record from information gained in an fsinfo record */ -static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +static void nfs_server_set_fsinfo(struct nfs_server *server, + struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -936,7 +936,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - set_pnfs_layoutdriver(server, fsinfo->layouttype); + server->pnfs_blksize = fsinfo->blksize; + set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); @@ -982,7 +983,7 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, &fsinfo); + nfs_server_set_fsinfo(server, mntfh, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1062,6 +1063,7 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); + INIT_LIST_HEAD(&server->layouts); atomic_set(&server->active, 0); @@ -1464,7 +1466,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, dprintk("<-- %s %p\n", __func__, clp); return clp; } -EXPORT_SYMBOL(nfs4_set_ds_client); +EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dd25c2a..321a66b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -398,12 +398,11 @@ int nfs_inode_return_delegation(struct inode *inode) return err; } -static void nfs_mark_return_delegation(struct nfs_delegation *delegation) +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - struct nfs_client *clp = NFS_SERVER(delegation->inode)->nfs_client; - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } /** @@ -441,7 +440,7 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server, if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE)) continue; if (delegation->type & flags) - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); } } @@ -508,7 +507,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server) list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags)) continue; - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); } } @@ -539,7 +538,8 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { - struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *clp = server->nfs_client; struct nfs_delegation *delegation; rcu_read_lock(); @@ -549,7 +549,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, rcu_read_unlock(); return -ENOENT; } - nfs_mark_return_delegation(delegation); + nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); nfs_delegation_run_state_manager(clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 57f578e..b238d95 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -134,18 +134,19 @@ const struct inode_operations nfs4_dir_inode_operations = { #endif /* CONFIG_NFS_V4 */ -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) { struct nfs_open_dir_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->duped = 0; + ctx->attr_gencount = NFS_I(dir)->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->cred = get_rpccred(cred); - } else - ctx = ERR_PTR(-ENOMEM); - return ctx; + return ctx; + } + return ERR_PTR(-ENOMEM); } static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) @@ -173,7 +174,7 @@ nfs_opendir(struct inode *inode, struct file *filp) cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_dir_context(cred); + ctx = alloc_nfs_open_dir_context(inode, cred); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -323,7 +324,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri { loff_t diff = desc->file->f_pos - desc->current_index; unsigned int index; - struct nfs_open_dir_context *ctx = desc->file->private_data; if (diff < 0) goto out_eof; @@ -336,7 +336,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; - ctx->duped = 0; return 0; out_eof: desc->eof = 1; @@ -349,14 +348,34 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int i; loff_t new_pos; int status = -EAGAIN; - struct nfs_open_dir_context *ctx = desc->file->private_data; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { + struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode); + struct nfs_open_dir_context *ctx = desc->file->private_data; + new_pos = desc->current_index + i; - if (new_pos < desc->file->f_pos) { + if (ctx->attr_gencount != nfsi->attr_gencount + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { + ctx->duped = 0; + ctx->attr_gencount = nfsi->attr_gencount; + } else if (new_pos < desc->file->f_pos) { + if (ctx->duped > 0 + && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop." + "Please contact your server vendor. " + "The file: %s has duplicate cookie %llu\n", + desc->file->f_dentry->d_parent->d_name.name, + desc->file->f_dentry->d_name.name, + array->array[i].string.name, + *desc->dir_cookie); + } + status = -ELOOP; + goto out; + } ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = 1; + ctx->duped = -1; } desc->file->f_pos = new_pos; desc->cache_entry_index = i; @@ -368,6 +387,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (*desc->dir_cookie == array->last_cookie) desc->eof = 1; } +out: return status; } @@ -740,19 +760,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, struct nfs_cache_array *array = NULL; struct nfs_open_dir_context *ctx = file->private_data; - if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { - if (printk_ratelimit()) { - pr_notice("NFS: directory %s/%s contains a readdir loop. " - "Please contact your server vendor. " - "Offending cookie: %llu\n", - file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, - *desc->dir_cookie); - } - res = -ELOOP; - goto out; - } - array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { res = PTR_ERR(array); @@ -774,6 +781,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; + if (ctx->duped != 0) + ctx->duped = 1; } if (array->eof_index >= 0) desc->eof = 1; @@ -805,6 +814,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct page *page = NULL; int status; struct inode *inode = desc->file->f_path.dentry->d_inode; + struct nfs_open_dir_context *ctx = desc->file->private_data; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -818,6 +828,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->page_index = 0; desc->last_cookie = *desc->dir_cookie; desc->page = page; + ctx->duped = 0; status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b35d25b..1940f1a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -53,7 +53,7 @@ #include <asm/system.h> #include <asm/uaccess.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include "internal.h" #include "iostat.h" diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a55347..ab12913 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -277,6 +277,9 @@ extern void nfs_sb_deactive(struct super_block *sb); extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen); extern struct vfsmount *nfs_d_automount(struct path *path); +#ifdef CONFIG_NFS_V4 +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *); +#endif /* getroot.c */ extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *, @@ -288,12 +291,22 @@ extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *, extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif +struct nfs_pageio_descriptor; /* read.c */ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); +extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, + struct list_head *head); + +extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_readdata_release(struct nfs_read_data *rdata); /* write.c */ +extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, + struct list_head *head); +extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); +extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 1f063ba..8102391 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -119,7 +119,7 @@ Elong: } #ifdef CONFIG_NFS_V4 -static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) +rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { struct gss_api_mech *mech; struct xdr_netobj oid; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e49e731..7ef2397 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -415,7 +415,7 @@ fail: } int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, - mode_t mode) + umode_t mode) { struct posix_acl *dfacl, *acl; int error = 0; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 38053d8..85f1690 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nfs_open_context *ctx) { struct nfs3_createdata *data; - mode_t mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %s\n", dentry->d_name.name); @@ -562,7 +562,7 @@ static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs3_createdata *data; - int mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %s\n", dentry->d_name.name); @@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { struct nfs3_createdata *data; - mode_t mode = sattr->ia_mode; + umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b788f2e..1ec1a85 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -48,6 +48,7 @@ enum nfs4_client_state { NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, NFS4CLNT_LEASE_CONFIRM, + NFS4CLNT_SERVER_SCOPE_MISMATCH, }; enum nfs4_session_state { @@ -66,6 +67,8 @@ struct nfs4_minor_version_ops { int cache_reply); int (*validate_stateid)(struct nfs_delegation *, const nfs4_stateid *); + int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -315,7 +318,7 @@ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; extern const u32 nfs4_pathconf_bitmap[2]; -extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[3]; extern const u32 nfs4_fs_locations_bitmap[2]; /* nfs4renewd.c */ @@ -349,6 +352,8 @@ extern void nfs4_schedule_state_manager(struct nfs_client *); extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); +extern void nfs41_handle_server_scope(struct nfs_client *, + struct server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index f9d03ab..e8915d4 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -170,7 +170,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) pnfs_set_layoutcommit(wdata); dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, - (unsigned long) wdata->lseg->pls_end_pos); + (unsigned long) NFS_I(wdata->inode)->layout->plh_lwb); } /* @@ -334,6 +334,9 @@ filelayout_read_pagelist(struct nfs_read_data *data) __func__, data->inode->i_ino, data->args.pgbase, (size_t)data->args.count, offset); + if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) + return PNFS_NOT_ATTEMPTED; + /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); @@ -344,8 +347,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } - dprintk("%s USE DS:ip %x %hu\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + dprintk("%s USE DS: %s\n", __func__, ds->ds_remotestr); /* No multipath support. Use first DS */ data->ds_clp = ds->ds_clp; @@ -374,6 +376,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) struct nfs_fh *fh; int status; + if (test_bit(NFS_DEVICEID_INVALID, &FILELAYOUT_DEVID_NODE(lseg)->flags)) + return PNFS_NOT_ATTEMPTED; + /* Retrieve the correct rpc_client for the byte range */ j = nfs4_fl_calc_j_index(lseg, offset); idx = nfs4_fl_calc_ds_index(lseg, j); @@ -384,9 +389,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } - dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s\n", __func__, data->inode->i_ino, sync, (size_t) data->args.count, offset, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + ds->ds_remotestr); data->write_done_cb = filelayout_write_done_cb; data->ds_clp = ds->ds_clp; @@ -428,6 +433,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, dprintk("--> %s\n", __func__); + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + goto out; + } + if (fl->pattern_offset > lgr->range.offset) { dprintk("%s pattern_offset %lld too large\n", __func__, fl->pattern_offset); @@ -449,6 +462,10 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } else dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + /* Found deviceid is being reaped */ + if (test_bit(NFS_DEVICEID_INVALID, &dsaddr->id_node.flags)) + goto out_put; + fl->dsaddr = dsaddr; if (fl->first_stripe_index < 0 || @@ -659,7 +676,7 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, * return true : coalesce page * return false : don't coalesce page */ -bool +static bool filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { @@ -670,8 +687,6 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, !nfs_generic_pg_test(pgio, prev, req)) return false; - if (!pgio->pg_lseg) - return 1; p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; @@ -682,6 +697,52 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +void +filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); +} + +void +filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_write_mds(pgio); +} + +static const struct nfs_pageio_ops filelayout_pg_read_ops = { + .pg_init = filelayout_pg_init_read, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops filelayout_pg_write_ops = { + .pg_init = filelayout_pg_init_write, + .pg_test = filelayout_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) { return !FILELAYOUT_LSEG(lseg)->commit_through_mds; @@ -879,7 +940,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .owner = THIS_MODULE, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, - .pg_test = filelayout_pg_test, + .pg_read_ops = &filelayout_pg_read_ops, + .pg_write_ops = &filelayout_pg_write_ops, .mark_pnfs_commit = filelayout_mark_pnfs_commit, .choose_commit_list = filelayout_choose_commit_list, .commit_pagelist = filelayout_commit_pagelist, @@ -902,5 +964,7 @@ static void __exit nfs4filelayout_exit(void) pnfs_unregister_layoutdriver(&filelayout_type); } +MODULE_ALIAS("nfs-layouttype4-1"); + module_init(nfs4filelayout_init); module_exit(nfs4filelayout_exit); diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index cebe01e..2e42284 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -47,10 +47,17 @@ enum stripetype4 { }; /* Individual ip address */ +struct nfs4_pnfs_ds_addr { + struct sockaddr_storage da_addr; + size_t da_addrlen; + struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *da_remotestr; /* human readable addr+port */ +}; + struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ - u32 ds_ip_addr; - u32 ds_port; + char *ds_remotestr; /* comma sep list of addrs */ + struct list_head ds_addrs; struct nfs_client *ds_clp; atomic_t ds_count; }; @@ -89,6 +96,12 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } +static inline struct nfs4_deviceid_node * +FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) +{ + return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; +} + extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 3b7bf13..ed388aa 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -56,54 +56,139 @@ print_ds(struct nfs4_pnfs_ds *ds) printk("%s NULL device\n", __func__); return; } - printk(" ip_addr %x port %hu\n" + printk(" ds %s\n" " ref count %d\n" " client %p\n" " cl_exchange_flags %x\n", - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + ds->ds_remotestr, atomic_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } -/* nfs4_ds_cache_lock is held */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(u32 ip_addr, u32 port) +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) { - struct nfs4_pnfs_ds *ds; + struct sockaddr_in *a, *b; + struct sockaddr_in6 *a6, *b6; + + if (addr1->sa_family != addr2->sa_family) + return false; + + switch (addr1->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr1; + b = (struct sockaddr_in *)addr2; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return true; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr1; + b6 = (struct sockaddr_in6 *)addr2; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + return false; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return true; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr1->sa_family); + return false; + } - dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", - ntohl(ip_addr), ntohs(port)); + return false; +} - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { - if (ds->ds_ip_addr == ip_addr && - ds->ds_port == port) { - return ds; +/* + * Lookup DS by addresses. The first matching address returns true. + * nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(struct list_head *dsaddrs) +{ + struct nfs4_pnfs_ds *ds; + struct nfs4_pnfs_ds_addr *da1, *da2; + + list_for_each_entry(da1, dsaddrs, da_node) { + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + list_for_each_entry(da2, &ds->ds_addrs, da_node) { + if (same_sockaddr( + (struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return ds; + } } } return NULL; } /* + * Compare two lists of addresses. + */ +static bool +_data_server_match_all_addrs_locked(struct list_head *dsaddrs1, + struct list_head *dsaddrs2) +{ + struct nfs4_pnfs_ds_addr *da1, *da2; + size_t count1 = 0, + count2 = 0; + + list_for_each_entry(da1, dsaddrs1, da_node) + count1++; + + list_for_each_entry(da2, dsaddrs2, da_node) { + bool found = false; + count2++; + list_for_each_entry(da1, dsaddrs1, da_node) { + if (same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) { + found = true; + break; + } + } + if (!found) + return false; + } + + return (count1 == count2); +} + +/* * Create an rpc connection to the nfs4_pnfs_ds data server - * Currently only support IPv4 + * Currently only supports IPv4 and IPv6 addresses */ static int nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) { - struct nfs_client *clp; - struct sockaddr_in sin; + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = ds->ds_ip_addr; - sin.sin_port = ds->ds_port; + BUG_ON(list_empty(&ds->ds_addrs)); + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = nfs4_set_ds_client(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP); + if (!IS_ERR(clp)) + break; + } - clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, - sizeof(sin), IPPROTO_TCP); if (IS_ERR(clp)) { status = PTR_ERR(clp); goto out; @@ -115,8 +200,8 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out_put; } ds->ds_clp = clp; - dprintk("%s [existing] ip=%x, port=%hu\n", __func__, - ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + dprintk("%s [existing] server=%s\n", __func__, + ds->ds_remotestr); goto out; } @@ -135,8 +220,7 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) goto out_put; ds->ds_clp = clp; - dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), - ntohs(ds->ds_port)); + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; out_put: @@ -147,12 +231,25 @@ out_put: static void destroy_ds(struct nfs4_pnfs_ds *ds) { + struct nfs4_pnfs_ds_addr *da; + dprintk("--> %s\n", __func__); ifdebug(FACILITY) print_ds(ds); if (ds->ds_clp) nfs_put_client(ds->ds_clp); + + while (!list_empty(&ds->ds_addrs)) { + da = list_first_entry(&ds->ds_addrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds->ds_remotestr); kfree(ds); } @@ -179,31 +276,96 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) kfree(dsaddr); } +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da; + char *remotestr; + size_t len; + char *p; + + len = 3; /* '{', '}' and eol */ + list_for_each_entry(da, dsaddrs, da_node) { + len += strlen(da->da_remotestr) + 1; /* string plus comma */ + } + + remotestr = kzalloc(len, gfp_flags); + if (!remotestr) + return NULL; + + p = remotestr; + *(p++) = '{'; + len--; + list_for_each_entry(da, dsaddrs, da_node) { + size_t ll = strlen(da->da_remotestr); + + if (ll > len) + goto out_err; + + memcpy(p, da->da_remotestr, ll); + p += ll; + len -= ll; + + if (len < 1) + goto out_err; + (*p++) = ','; + len--; + } + if (len < 2) + goto out_err; + *(p++) = '}'; + *p = '\0'; + return remotestr; +out_err: + kfree(remotestr); + return NULL; +} + static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port, gfp_t gfp_flags) +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) { - struct nfs4_pnfs_ds *tmp_ds, *ds; + struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; + char *remotestr; - ds = kzalloc(sizeof(*tmp_ds), gfp_flags); + if (list_empty(dsaddrs)) { + dprintk("%s: no addresses defined\n", __func__); + goto out; + } + + ds = kzalloc(sizeof(*ds), gfp_flags); if (!ds) goto out; + /* this is only used for debugging, so it's ok if its NULL */ + remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); + spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(ip_addr, port); + tmp_ds = _data_server_lookup_locked(dsaddrs); if (tmp_ds == NULL) { - ds->ds_ip_addr = ip_addr; - ds->ds_port = port; + INIT_LIST_HEAD(&ds->ds_addrs); + list_splice_init(dsaddrs, &ds->ds_addrs); + ds->ds_remotestr = remotestr; atomic_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); - dprintk("%s add new data server ip 0x%x\n", __func__, - ds->ds_ip_addr); + dprintk("%s add new data server %s\n", __func__, + ds->ds_remotestr); } else { + if (!_data_server_match_all_addrs_locked(&tmp_ds->ds_addrs, + dsaddrs)) { + dprintk("%s: multipath address mismatch: %s != %s", + __func__, tmp_ds->ds_remotestr, remotestr); + } + kfree(remotestr); kfree(ds); atomic_inc(&tmp_ds->ds_count); - dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", - __func__, tmp_ds->ds_ip_addr, + dprintk("%s data server %s found, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_remotestr, atomic_read(&tmp_ds->ds_count)); ds = tmp_ds; } @@ -213,18 +375,22 @@ out: } /* - * Currently only support ipv4, and one multi-path address. + * Currently only supports ipv4, ipv6 and one multi-path address. */ -static struct nfs4_pnfs_ds * -decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_flags) +static struct nfs4_pnfs_ds_addr * +decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags) { - struct nfs4_pnfs_ds *ds = NULL; - char *buf; - const char *ipend, *pstr; - u32 ip_addr, port; - int nlen, rlen, i; + struct nfs4_pnfs_ds_addr *da = NULL; + char *buf, *portstr; + u32 port; + int nlen, rlen; int tmp[2]; __be32 *p; + char *netid, *match_netid; + size_t len, match_netid_len; + char *startsep = ""; + char *endsep = ""; + /* r_netid */ p = xdr_inline_decode(streamp, 4); @@ -236,64 +402,123 @@ decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode, gfp_t gfp_fla if (unlikely(!p)) goto out_err; - /* Check that netid is "tcp" */ - if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { - dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); + netid = kmalloc(nlen+1, gfp_flags); + if (unlikely(!netid)) goto out_err; - } - /* r_addr */ + netid[nlen] = '\0'; + memcpy(netid, p, nlen); + + /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ p = xdr_inline_decode(streamp, 4); if (unlikely(!p)) - goto out_err; + goto out_free_netid; rlen = be32_to_cpup(p); p = xdr_inline_decode(streamp, rlen); if (unlikely(!p)) - goto out_err; + goto out_free_netid; - /* ipv6 length plus port is legal */ - if (rlen > INET6_ADDRSTRLEN + 8) { + /* port is ".ABC.DEF", 8 chars max */ + if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { dprintk("%s: Invalid address, length %d\n", __func__, rlen); - goto out_err; + goto out_free_netid; } buf = kmalloc(rlen + 1, gfp_flags); if (!buf) { dprintk("%s: Not enough memory\n", __func__); - goto out_err; + goto out_free_netid; } buf[rlen] = '\0'; memcpy(buf, p, rlen); - /* replace the port dots with dashes for the in4_pton() delimiter*/ - for (i = 0; i < 2; i++) { - char *res = strrchr(buf, '.'); - if (!res) { - dprintk("%s: Failed finding expected dots in port\n", - __func__); - goto out_free; - } - *res = '-'; + /* replace port '.' with '-' */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot in port\n", + __func__); + goto out_free_buf; + } + *portstr = '-'; + + /* find '.' between address and port */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot between address and " + "port\n", __func__); + goto out_free_buf; } + *portstr = '\0'; - /* Currently only support ipv4 address */ - if (in4_pton(buf, rlen, (u8 *)&ip_addr, '-', &ipend) == 0) { - dprintk("%s: Only ipv4 addresses supported\n", __func__); - goto out_free; + da = kzalloc(sizeof(*da), gfp_flags); + if (unlikely(!da)) + goto out_free_buf; + + INIT_LIST_HEAD(&da->da_node); + + if (!rpc_pton(buf, portstr-buf, (struct sockaddr *)&da->da_addr, + sizeof(da->da_addr))) { + dprintk("%s: error parsing address %s\n", __func__, buf); + goto out_free_da; } - /* port */ - pstr = ipend; - sscanf(pstr, "-%d-%d", &tmp[0], &tmp[1]); + portstr++; + sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); port = htons((tmp[0] << 8) | (tmp[1])); - ds = nfs4_pnfs_ds_add(inode, ip_addr, port, gfp_flags); - dprintk("%s: Decoded address and port %s\n", __func__, buf); -out_free: + switch (da->da_addr.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&da->da_addr)->sin_port = port; + da->da_addrlen = sizeof(struct sockaddr_in); + match_netid = "tcp"; + match_netid_len = 3; + break; + + case AF_INET6: + ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; + da->da_addrlen = sizeof(struct sockaddr_in6); + match_netid = "tcp6"; + match_netid_len = 4; + startsep = "["; + endsep = "]"; + break; + + default: + dprintk("%s: unsupported address family: %u\n", + __func__, da->da_addr.ss_family); + goto out_free_da; + } + + if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { + dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", + __func__, netid, match_netid); + goto out_free_da; + } + + /* save human readable address */ + len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; + da->da_remotestr = kzalloc(len, gfp_flags); + + /* NULL is ok, only used for dprintk */ + if (da->da_remotestr) + snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, + buf, endsep, ntohs(port)); + + dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); kfree(buf); + kfree(netid); + return da; + +out_free_da: + kfree(da); +out_free_buf: + dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); + kfree(buf); +out_free_netid: + kfree(netid); out_err: - return ds; + return NULL; } /* Decode opaque device data and return the result */ @@ -310,6 +535,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -386,6 +613,8 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) NFS_SERVER(ino)->nfs_client, &pdev->dev_id); + INIT_LIST_HEAD(&dsaddrs); + for (i = 0; i < dsaddr->ds_num; i++) { int j; u32 mp_count; @@ -395,48 +624,43 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) goto out_err_free_deviceid; mp_count = be32_to_cpup(p); /* multipath count */ - if (mp_count > 1) { - printk(KERN_WARNING - "%s: Multipath count %d not supported, " - "skipping all greater than 1\n", __func__, - mp_count); - } for (j = 0; j < mp_count; j++) { - if (j == 0) { - dsaddr->ds_list[i] = decode_and_add_ds(&stream, - ino, gfp_flags); - if (dsaddr->ds_list[i] == NULL) - goto out_err_free_deviceid; - } else { - u32 len; - /* skip extra multipath */ - - /* read len, skip */ - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err_free_deviceid; - len = be32_to_cpup(p); - - p = xdr_inline_decode(&stream, len); - if (unlikely(!p)) - goto out_err_free_deviceid; - - /* read len, skip */ - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err_free_deviceid; - len = be32_to_cpup(p); - - p = xdr_inline_decode(&stream, len); - if (unlikely(!p)) - goto out_err_free_deviceid; - } + da = decode_ds_addr(&stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + goto out_err_free_deviceid; + } + + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!dsaddr->ds_list[i]) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); } } __free_page(scratch); return dsaddr; +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } out_err_free_deviceid: nfs4_fl_free_deviceid(dsaddr); /* stripe_indicies was part of dsaddr */ @@ -591,13 +815,13 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, - int err, u32 ds_addr) + int err, const char *ds_remotestr) { u32 *p = (u32 *)&dsaddr->id_node.deviceid; - printk(KERN_ERR "NFS: data server %x connection error %d." + printk(KERN_ERR "NFS: data server %s connection error %d." " Deviceid [%x%x%x%x] marked out of use.\n", - ds_addr, err, p[0], p[1], p[2], p[3]); + ds_remotestr, err, p[0], p[1], p[2], p[3]); spin_lock(&nfs4_ds_cache_lock); dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; @@ -628,7 +852,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) err = nfs4_ds_connect(s, ds); if (err) { filelayout_mark_devid_negative(dsaddr, err, - ntohl(ds->ds_ip_addr)); + ds->ds_remotestr); return NULL; } } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26bece8..8c77039 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -80,7 +80,10 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, struct nfs4_state *state); - +#ifdef CONFIG_NFS_V4_1 +static int nfs41_test_stateid(struct nfs_server *, struct nfs4_state *); +static int nfs41_free_stateid(struct nfs_server *, struct nfs4_state *); +#endif /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { @@ -137,12 +140,13 @@ const u32 nfs4_pathconf_bitmap[2] = { 0 }; -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, FATTR4_WORD1_TIME_DELTA - | FATTR4_WORD1_FS_LAYOUT_TYPES + | FATTR4_WORD1_FS_LAYOUT_TYPES, + FATTR4_WORD2_LAYOUT_BLKSIZE }; const u32 nfs4_fs_locations_bitmap[2] = { @@ -1689,6 +1693,20 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + int status; + struct nfs_server *server = NFS_SERVER(state->inode); + + status = nfs41_test_stateid(server, state); + if (status == NFS_OK) + return 0; + nfs41_free_stateid(server, state); + return nfs4_open_expired(sp, state); +} +#endif + /* * on an EXCLUSIVE create, the server should send back a bitmask with FATTR4-* * fields corresponding to attributes that were used to store the verifier. @@ -2252,13 +2270,14 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { + int minor_version = server->nfs_client->cl_minorversion; int status = nfs4_lookup_root(server, fhandle, info); if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR)) /* * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM * by nfs4_map_errors() as this function exits. */ - status = nfs4_find_root_sec(server, fhandle, info); + status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) @@ -4441,6 +4460,20 @@ out: return err; } +#if defined(CONFIG_NFS_V4_1) +static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) +{ + int status; + struct nfs_server *server = NFS_SERVER(state->inode); + + status = nfs41_test_stateid(server, state); + if (status == NFS_OK) + return 0; + nfs41_free_stateid(server, state); + return nfs4_lock_expired(state, request); +} +#endif + static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -4779,6 +4812,16 @@ out_inval: return -NFS4ERR_INVAL; } +static bool +nfs41_same_server_scope(struct server_scope *a, struct server_scope *b) +{ + if (a->server_scope_sz == b->server_scope_sz && + memcmp(a->server_scope, b->server_scope, a->server_scope_sz) == 0) + return true; + + return false; +} + /* * nfs4_proc_exchange_id() * @@ -4821,9 +4864,31 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) init_utsname()->domainname, clp->cl_rpcclient->cl_auth->au_flavor); + res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); + if (unlikely(!res.server_scope)) + return -ENOMEM; + status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags); + + if (!status) { + if (clp->server_scope && + !nfs41_same_server_scope(clp->server_scope, + res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->server_scope); + clp->server_scope = NULL; + } + + if (!clp->server_scope) + clp->server_scope = res.server_scope; + else + kfree(res.server_scope); + } + dprintk("<-- %s status= %d\n", __func__, status); return status; } @@ -5704,7 +5769,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lrp->args.layout; dprintk("--> %s\n", __func__); @@ -5733,7 +5798,7 @@ static void nfs4_layoutreturn_release(void *calldata) struct nfs4_layoutreturn *lrp = calldata; dprintk("--> %s\n", __func__); - put_layout_hdr(NFS_I(lrp->args.inode)->layout); + put_layout_hdr(lrp->args.layout); kfree(calldata); dprintk("<-- %s\n", __func__); } @@ -5770,6 +5835,54 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) return status; } +/* + * Retrieve the list of Data Server devices from the MDS. + */ +static int _nfs4_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist) +{ + struct nfs4_getdevicelist_args args = { + .fh = fh, + .layoutclass = server->pnfs_curr_ld->id, + }; + struct nfs4_getdevicelist_res res = { + .devlist = devlist, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], + .rpc_argp = &args, + .rpc_resp = &res, + }; + int status; + + dprintk("--> %s\n", __func__); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 0); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist) +{ + struct nfs4_exception exception = { }; + int err; + + do { + err = nfs4_handle_exception(server, + _nfs4_getdevicelist(server, fh, devlist), + &exception); + } while (exception.retry); + + dprintk("%s: err=%d, num_devs=%u\n", __func__, + err, devlist->num_devs); + + return err; +} +EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); + static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) { @@ -5848,9 +5961,16 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; + struct pnfs_layout_segment *lseg, *tmp; + pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ - put_lseg(data->lseg); + list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, + &lseg->pls_flags)) + put_lseg(lseg); + } put_rpccred(data->cred); kfree(data); } @@ -5901,6 +6021,143 @@ out: rpc_put_task(task); return status; } + +static int +_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs41_secinfo_no_name_args args = { + .style = SECINFO_STYLE_CURRENT_FH, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO_NO_NAME], + .rpc_argp = &args, + .rpc_resp = &res, + }; + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); +} + +static int +nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = _nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + switch (err) { + case 0: + case -NFS4ERR_WRONGSEC: + case -NFS4ERR_NOTSUPP: + break; + default: + err = nfs4_handle_exception(server, err, &exception); + } + } while (exception.retry); + return err; +} + +static int +nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int err; + struct page *page; + rpc_authflavor_t flavor; + struct nfs4_secinfo_flavors *flavors; + + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + + flavors = page_address(page); + err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors); + + /* + * Fall back on "guess and check" method if + * the server doesn't support SECINFO_NO_NAME + */ + if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + err = nfs4_find_root_sec(server, fhandle, info); + goto out_freepage; + } + if (err) + goto out_freepage; + + flavor = nfs_find_best_sec(flavors); + if (err == 0) + err = nfs4_lookup_root_sec(server, fhandle, info, flavor); + +out_freepage: + put_page(page); + if (err == -EACCES) + return -EPERM; +out: + return err; +} +static int _nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + int status; + struct nfs41_test_stateid_args args = { + .stateid = &state->stateid, + }; + struct nfs41_test_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + }; + args.seq_args.sa_session = res.seq_res.sr_session = NULL; + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + return status; +} + +static int nfs41_test_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs41_test_stateid(server, state), + &exception); + } while (exception.retry); + return err; +} + +static int _nfs4_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + int status; + struct nfs41_free_stateid_args args = { + .stateid = &state->stateid, + }; + struct nfs41_free_stateid_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + args.seq_args.sa_session = res.seq_res.sr_session = NULL; + status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 0, 1); + return status; +} + +static int nfs41_free_stateid(struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_free_stateid(server, state), + &exception); + } while (exception.retry); + return err; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5937,8 +6194,8 @@ struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { struct nfs4_state_recovery_ops nfs41_nograce_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, - .recover_open = nfs4_open_expired, - .recover_lock = nfs4_lock_expired, + .recover_open = nfs41_open_expired, + .recover_lock = nfs41_lock_expired, .establish_clid = nfs41_init_clientid, .get_clid_cred = nfs4_get_exchange_id_cred, }; @@ -5962,6 +6219,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .call_sync = _nfs4_call_sync, .validate_stateid = nfs4_validate_delegation_stateid, + .find_root_sec = nfs4_find_root_sec, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -5972,6 +6230,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .call_sync = _nfs4_call_sync_session, .validate_stateid = nfs41_validate_delegation_stateid, + .find_root_sec = nfs41_find_root_sec, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7acfe88..72ab97e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1643,7 +1643,14 @@ static void nfs4_state_manager(struct nfs_client *clp) goto out_error; } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); - set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + + if (test_and_clear_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, + &clp->cl_state)) + nfs4_state_start_reclaim_nograce(clp); + else + set_bit(NFS4CLNT_RECLAIM_REBOOT, + &clp->cl_state); + pnfs_destroy_all_layouts(clp); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e6e8f3b..1dce12f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -113,7 +113,11 @@ static int nfs4_stat_to_errno(int); #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (encode_getattr_maxsz) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) +/* The 5 accounts for the PNFS attributes, and assumes that at most three + * layout types will be returned. + */ +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -314,6 +318,17 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ + encode_verifier_maxsz) +#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ + 2 /* nfs_cookie4 gdlr_cookie */ + \ + decode_verifier_maxsz \ + /* verifier4 gdlr_verifier */ + \ + 1 /* gdlr_deviceid_list count */ + \ + XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ + NFS4_DEVICEID4_SIZE) \ + /* gdlr_deviceid_list */ + \ + 1 /* bool gdlr_eof */) #define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ @@ -343,6 +358,14 @@ static int nfs4_stat_to_errno(int); 1 /* FIXME: opaque lrf_body always empty at the moment */) #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ 1 + decode_stateid_maxsz) +#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1) +#define decode_secinfo_no_name_maxsz decode_secinfo_maxsz +#define encode_test_stateid_maxsz (op_encode_hdr_maxsz + 2 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ + XDR_QUADLEN(NFS4_STATEID_SIZE)) +#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -740,6 +763,14 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) +#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getdevicelist_maxsz) +#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getdevicelist_maxsz) #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz +\ encode_getdeviceinfo_maxsz) @@ -772,6 +803,26 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutreturn_maxsz) +#define NFS4_enc_secinfo_no_name_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putrootfh_maxsz +\ + encode_secinfo_no_name_maxsz) +#define NFS4_dec_secinfo_no_name_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putrootfh_maxsz + \ + decode_secinfo_no_name_maxsz) +#define NFS4_enc_test_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_test_stateid_maxsz) +#define NFS4_dec_test_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_test_stateid_maxsz) +#define NFS4_enc_free_stateid_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_free_stateid_maxsz) +#define NFS4_dec_free_stateid_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_free_stateid_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1076,6 +1127,35 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm hdr->replen += decode_getattr_maxsz; } +static void +encode_getattr_three(struct xdr_stream *xdr, + uint32_t bm0, uint32_t bm1, uint32_t bm2, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETATTR); + if (bm2) { + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bm0); + *p++ = cpu_to_be32(bm1); + *p = cpu_to_be32(bm2); + } else if (bm1) { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + } else { + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bm0); + } + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], @@ -1084,8 +1164,11 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); + encode_getattr_three(xdr, + bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], + bitmask[2] & nfs4_fsinfo_bitmap[2], + hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1827,6 +1910,26 @@ static void encode_sequence(struct xdr_stream *xdr, #ifdef CONFIG_NFS_V4_1 static void +encode_getdevicelist(struct xdr_stream *xdr, + const struct nfs4_getdevicelist_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + nfs4_verifier dummy = { + .data = "dummmmmy", + }; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_GETDEVICELIST); + *p++ = cpu_to_be32(args->layoutclass); + *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); + xdr_encode_hyper(p, 0ULL); /* cookie */ + encode_nfs4_verifier(xdr, &dummy); + hdr->nops++; + hdr->replen += decode_getdevicelist_maxsz; +} + +static void encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfs4_getdeviceinfo_args *args, struct compound_hdr *hdr) @@ -1888,7 +1991,7 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); /* Only whole file layouts */ p = xdr_encode_hyper(p, 0); /* offset */ - p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ + p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p++ = cpu_to_be32(0); /* reclaim */ p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(1); /* newoffset = TRUE */ @@ -1938,6 +2041,46 @@ encode_layoutreturn(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_layoutreturn_maxsz; } + +static int +encode_secinfo_no_name(struct xdr_stream *xdr, + const struct nfs41_secinfo_no_name_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_SECINFO_NO_NAME); + *p++ = cpu_to_be32(args->style); + hdr->nops++; + hdr->replen += decode_secinfo_no_name_maxsz; + return 0; +} + +static void encode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 8 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_TEST_STATEID); + *p++ = cpu_to_be32(1); + xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_test_stateid_maxsz; +} + +static void encode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + p = reserve_space(xdr, 4 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_FREE_STATEID); + xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + hdr->nops++; + hdr->replen += decode_free_stateid_maxsz; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2536,7 +2679,7 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, struct compound_hdr hdr = { .nops = 0, }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_setclientid_confirm(xdr, arg, &hdr); @@ -2680,7 +2823,7 @@ static void nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME }; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->la_seq_args, &hdr); @@ -2707,6 +2850,24 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, } /* + * Encode GETDEVICELIST request + */ +static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_getdevicelist_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_getdevicelist(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode GETDEVICEINFO request */ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, @@ -2790,6 +2951,59 @@ static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, encode_layoutreturn(xdr, args, &hdr); encode_nops(&hdr); } + +/* + * Encode SECINFO_NO_NAME request + */ +static int nfs4_xdr_enc_secinfo_no_name(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_secinfo_no_name_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putrootfh(xdr, &hdr); + encode_secinfo_no_name(xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode TEST_STATEID request + */ +static void nfs4_xdr_enc_test_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_test_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_test_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode FREE_STATEID request + */ +static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs41_free_stateid_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_free_stateid(xdr, args, &hdr); + encode_nops(&hdr); +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2890,14 +3104,17 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) goto out_overflow; bmlen = be32_to_cpup(p); - bitmap[0] = bitmap[1] = 0; + bitmap[0] = bitmap[1] = bitmap[2] = 0; p = xdr_inline_decode(xdr, (bmlen << 2)); if (unlikely(!p)) goto out_overflow; if (bmlen > 0) { bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) - bitmap[1] = be32_to_cpup(p); + if (bmlen > 1) { + bitmap[1] = be32_to_cpup(p++); + if (bmlen > 2) + bitmap[2] = be32_to_cpup(p); + } } return 0; out_overflow: @@ -2929,8 +3146,9 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 return ret; bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else - bitmask[0] = bitmask[1] = 0; - dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); return 0; } @@ -3984,7 +4202,7 @@ out_overflow: static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4010,7 +4228,7 @@ xdr_error: static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4042,7 +4260,7 @@ xdr_error: static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4182,7 +4400,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat { __be32 *savep; uint32_t attrlen, - bitmap[2] = {0}; + bitmap[3] = {0}; int status; status = decode_op_hdr(xdr, OP_GETATTR); @@ -4268,10 +4486,32 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, return status; } +/* + * The prefered block size for layout directed io + */ +static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, + uint32_t *res) +{ + __be32 *p; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + *res = 0; + if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) { + print_overflow_msg(__func__, xdr); + return -EIO; + } + *res = be32_to_cpup(p); + bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; + } + return 0; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { __be32 *savep; - uint32_t attrlen, bitmap[2]; + uint32_t attrlen, bitmap[3]; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -4299,6 +4539,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); if (status != 0) goto xdr_error; + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); + if (status) + goto xdr_error; status = verify_attr_len(xdr, savep, attrlen); xdr_error: @@ -4718,7 +4961,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, { __be32 *savep; uint32_t attrlen, - bitmap[2] = {0}; + bitmap[3] = {0}; struct kvec *iov = req->rq_rcv_buf.head; int status; @@ -4977,11 +5220,17 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (unlikely(status)) return status; - /* Throw away server_scope */ + /* Save server_scope */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + + memcpy(res->server_scope->server_scope, dummy_str, dummy); + res->server_scope->server_scope_sz = dummy; + /* Throw away Implementation id array */ status = decode_opaque_inline(xdr, &dummy, &dummy_str); if (unlikely(status)) @@ -5141,6 +5390,53 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +/* + * TODO: Need to handle case when EOF != true; + */ +static int decode_getdevicelist(struct xdr_stream *xdr, + struct pnfs_devicelist *res) +{ + __be32 *p; + int status, i; + struct nfs_writeverf verftemp; + + status = decode_op_hdr(xdr, OP_GETDEVICELIST); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + /* TODO: Skip cookie for now */ + p += 2; + + /* Read verifier */ + p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); + + res->num_devs = be32_to_cpup(p); + + dprintk("%s: num_dev %d\n", __func__, res->num_devs); + + if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { + printk(KERN_ERR "%s too many result dev_num %u\n", + __func__, res->num_devs); + return -EIO; + } + + p = xdr_inline_decode(xdr, + res->num_devs * NFS4_DEVICEID4_SIZE + 4); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < res->num_devs; i++) + p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, + NFS4_DEVICEID4_SIZE); + res->eof = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} static int decode_getdeviceinfo(struct xdr_stream *xdr, struct pnfs_device *pdev) @@ -5303,6 +5599,7 @@ static int decode_layoutcommit(struct xdr_stream *xdr, int status; status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + res->status = status; if (status) return status; @@ -5322,6 +5619,55 @@ out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } + +static int decode_test_stateid(struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + __be32 *p; + int status; + int num_res; + + status = decode_op_hdr(xdr, OP_TEST_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num_res = be32_to_cpup(p++); + if (num_res != 1) + goto out; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + return res->status; +out_overflow: + print_overflow_msg(__func__, xdr); +out: + return -EIO; +} + +static int decode_free_stateid(struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + __be32 *p; + int status; + + status = decode_op_hdr(xdr, OP_FREE_STATEID); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->status = be32_to_cpup(p++); + return res->status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6366,6 +6712,32 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, } /* + * Decode GETDEVICELIST response + */ +static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_getdevicelist_res *res) +{ + struct compound_hdr hdr; + int status; + + dprintk("encoding getdevicelist!\n"); + + status = decode_compound_hdr(xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_putfh(xdr); + if (status != 0) + goto out; + status = decode_getdevicelist(xdr, res->devlist); +out: + return status; +} + +/* * Decode GETDEVINFO response */ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, @@ -6461,6 +6833,72 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode SECINFO_NO_NAME response + */ +static int nfs4_xdr_dec_secinfo_no_name(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putrootfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); +out: + return status; +} + +/* + * Decode TEST_STATEID response + */ +static int nfs4_xdr_dec_test_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_test_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_test_stateid(xdr, res); +out: + return status; +} + +/* + * Decode FREE_STATEID response + */ +static int nfs4_xdr_dec_free_stateid(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs41_free_stateid_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_free_stateid(xdr, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6480,7 +6918,7 @@ out: int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int plus) { - uint32_t bitmap[2] = {0}; + uint32_t bitmap[3] = {0}; uint32_t len; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -6663,6 +7101,10 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(LAYOUTGET, enc_layoutget, dec_layoutget), PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), + PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), + PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8ff2ea3..d0cda12 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; struct osd_request *or = ios->per_dev[i].or; - unsigned dev; int ret; if (!or) @@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - dev = ios->per_dev[i].dev; - objlayout_io_set_result(&ios->ol_state, dev, - &ios->layout->comps[dev].oc_object_id, + objlayout_io_set_result(&ios->ol_state, i, + &ios->layout->comps[i].oc_object_id, osd_pri_2_pnfs_err(osi.osd_err_pri), ios->per_dev[i].offset, ios->per_dev[i].length, @@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, } static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, + unsigned pgbase, struct _objio_per_comp *per_dev, int len, gfp_t gfp_flags) { unsigned pg = *cur_pg; + int cur_len = len; struct request_queue *q = osd_request_queue(_io_od(ios, per_dev->dev)); - per_dev->length += cur_len; - if (per_dev->bio == NULL) { - unsigned stripes = ios->layout->num_comps / - ios->layout->mirrors_p1; - unsigned pages_in_stripe = stripes * + unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / - stripes; + ios->layout->group_width; if (BIO_MAX_PAGES_KMALLOC < bio_size) bio_size = BIO_MAX_PAGES_KMALLOC; @@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; return 0; } @@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, int ret = 0; while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev]; + struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, cur_len = stripe_unit; } - if (max_comp < dev) - max_comp = dev; + if (max_comp < dev - first_dev) + max_comp = dev - first_dev; } else { cur_len = stripe_unit; } @@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; unsigned dev = per_dev->dev; struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; + &ios->layout->comps[cur_comp]; struct osd_obj_id obj = { .partition = cred->oc_object_id.oid_partition_id, .id = cred->oc_object_id.oid_object_id, @@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct osd_request *or = NULL; struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; + &ios->layout->comps[cur_comp]; struct osd_obj_id obj = { .partition = cred->oc_object_id.oid_partition_id, .id = cred->oc_object_id.oid_object_id, @@ -1000,13 +996,22 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, if (!pnfs_generic_pg_test(pgio, prev, req)) return false; - if (pgio->pg_lseg == NULL) - return true; - return pgio->pg_count + req->wb_bytes <= OBJIO_LSEG(pgio->pg_lseg)->max_io_size; } +static const struct nfs_pageio_ops objio_pg_read_ops = { + .pg_init = pnfs_generic_pg_init_read, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_readpages, +}; + +static const struct nfs_pageio_ops objio_pg_write_ops = { + .pg_init = pnfs_generic_pg_init_write, + .pg_test = objio_pg_test, + .pg_doio = pnfs_generic_pg_writepages, +}; + static struct pnfs_layoutdriver_type objlayout_type = { .id = LAYOUT_OSD2_OBJECTS, .name = "LAYOUT_OSD2_OBJECTS", @@ -1020,7 +1025,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .read_pagelist = objlayout_read_pagelist, .write_pagelist = objlayout_write_pagelist, - .pg_test = objio_pg_test, + .pg_read_ops = &objio_pg_read_ops, + .pg_write_ops = &objio_pg_write_ops, .free_deviceid_node = objio_free_deviceid_node, @@ -1055,5 +1061,7 @@ objlayout_exit(void) __func__); } +MODULE_ALIAS("nfs-layouttype4-2"); + module_init(objlayout_init); module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index 16fc758..b3918f7 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, p = _osd_xdr_decode_data_map(p, &layout->olo_map); layout->olo_comps_index = be32_to_cpup(p++); layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + iter->total_comps = layout->olo_num_comps; return 0; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 18449f4..b60970c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(nfs_generic_pg_test); */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct nfs_pageio_descriptor *), + const struct nfs_pageio_ops *pg_ops, size_t bsize, int io_flags) { @@ -240,13 +240,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_bsize = bsize; desc->pg_base = 0; desc->pg_moreio = 0; + desc->pg_recoalesce = 0; desc->pg_inode = inode; - desc->pg_doio = doio; + desc->pg_ops = pg_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; - desc->pg_test = nfs_generic_pg_test; - pnfs_pageio_init(desc, inode); } /** @@ -276,7 +275,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return false; - return pgio->pg_test(pgio, prev, req); + return pgio->pg_ops->pg_test(pgio, prev, req); } /** @@ -297,6 +296,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else { + if (desc->pg_ops->pg_init) + desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } nfs_list_remove_request(req); @@ -311,7 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc); + int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else @@ -331,7 +332,7 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { @@ -340,17 +341,67 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_error < 0) return 0; desc->pg_moreio = 0; + if (desc->pg_recoalesce) + return 0; } return 1; } +static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + + do { + list_splice_init(&desc->pg_list, &head); + desc->pg_bytes_written -= desc->pg_count; + desc->pg_count = 0; + desc->pg_base = 0; + desc->pg_recoalesce = 0; + + while (!list_empty(&head)) { + struct nfs_page *req; + + req = list_first_entry(&head, struct nfs_page, wb_list); + nfs_list_remove_request(req); + if (__nfs_pageio_add_request(desc, req)) + continue; + if (desc->pg_error < 0) + return 0; + break; + } + } while (desc->pg_recoalesce); + return 1; +} + +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + int ret; + + do { + ret = __nfs_pageio_add_request(desc, req); + if (ret) + break; + if (desc->pg_error < 0) + break; + ret = nfs_do_recoalesce(desc); + } while (ret); + return ret; +} + /** * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { - nfs_pageio_doio(desc); + for (;;) { + nfs_pageio_doio(desc); + if (!desc->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } } /** @@ -369,7 +420,7 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) if (!list_empty(&desc->pg_list)) { struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); if (index != prev->wb_index + 1) - nfs_pageio_doio(desc); + nfs_pageio_complete(desc); } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 29c0ca7f..e550e88 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -28,6 +28,7 @@ */ #include <linux/nfs_fs.h> +#include <linux/nfs_page.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -75,8 +76,11 @@ find_pnfs_driver(u32 id) void unset_pnfs_layoutdriver(struct nfs_server *nfss) { - if (nfss->pnfs_curr_ld) + if (nfss->pnfs_curr_ld) { + if (nfss->pnfs_curr_ld->clear_layoutdriver) + nfss->pnfs_curr_ld->clear_layoutdriver(nfss); module_put(nfss->pnfs_curr_ld->owner); + } nfss->pnfs_curr_ld = NULL; } @@ -87,7 +91,8 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) * @id layout type. Zero (illegal layout type) indicates pNFS not in use. */ void -set_pnfs_layoutdriver(struct nfs_server *server, u32 id) +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, + u32 id) { struct pnfs_layoutdriver_type *ld_type = NULL; @@ -114,6 +119,13 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } server->pnfs_curr_ld = ld_type; + if (ld_type->set_layoutdriver + && ld_type->set_layoutdriver(server, mntfh)) { + printk(KERN_ERR "%s: Error initializing pNFS layout driver %u.\n", + __func__, id); + module_put(ld_type->owner); + goto out_no_driver; + } dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -189,6 +201,7 @@ static void pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld; + put_rpccred(lo->plh_lc_cred); return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo); } @@ -223,6 +236,7 @@ static void init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { INIT_LIST_HEAD(&lseg->pls_list); + INIT_LIST_HEAD(&lseg->pls_lc_list); atomic_set(&lseg->pls_refcount, 1); smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); @@ -448,11 +462,20 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) void pnfs_destroy_all_layouts(struct nfs_client *clp) { + struct nfs_server *server; struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + spin_lock(&clp->cl_lock); - list_splice_init(&clp->cl_layouts, &tmp_list); + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!list_empty(&server->layouts)) + list_splice_init(&server->layouts, &tmp_list); + } + rcu_read_unlock(); spin_unlock(&clp->cl_lock); while (!list_empty(&tmp_list)) { @@ -661,6 +684,7 @@ _pnfs_return_layout(struct inode *ino) lrp->args.stateid = stateid; lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; + lrp->args.layout = lo; lrp->clp = NFS_SERVER(ino)->nfs_client; status = nfs4_proc_layoutreturn(lrp); @@ -805,7 +829,9 @@ out: } static struct pnfs_layout_hdr * -alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) +alloc_init_layout_hdr(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) { struct pnfs_layout_hdr *lo; @@ -817,11 +843,14 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags) INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_bulk_recall); lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; } static struct pnfs_layout_hdr * -pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) +pnfs_find_alloc_layout(struct inode *ino, + struct nfs_open_context *ctx, + gfp_t gfp_flags) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *new = NULL; @@ -836,7 +865,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags) return nfsi->layout; } spin_unlock(&ino->i_lock); - new = alloc_init_layout_hdr(ino, gfp_flags); + new = alloc_init_layout_hdr(ino, ctx, gfp_flags); spin_lock(&ino->i_lock); if (likely(nfsi->layout == NULL)) /* Won the race? */ @@ -920,7 +949,8 @@ pnfs_update_layout(struct inode *ino, }; unsigned pg_offset; struct nfs_inode *nfsi = NFS_I(ino); - struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs_client *clp = server->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; bool first = false; @@ -928,7 +958,7 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; spin_lock(&ino->i_lock); - lo = pnfs_find_alloc_layout(ino, gfp_flags); + lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); goto out_unlock; @@ -964,7 +994,7 @@ pnfs_update_layout(struct inode *ino, */ spin_lock(&clp->cl_lock); BUG_ON(!list_empty(&lo->plh_layouts)); - list_add_tail(&lo->plh_layouts, &clp->cl_layouts); + list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -973,7 +1003,8 @@ pnfs_update_layout(struct inode *ino, arg.offset -= pg_offset; arg.length += pg_offset; } - arg.length = PAGE_CACHE_ALIGN(arg.length); + if (arg.length != NFS4_MAX_UINT64) + arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); if (!lseg && first) { @@ -991,6 +1022,7 @@ out_unlock: spin_unlock(&ino->i_lock); goto out; } +EXPORT_SYMBOL_GPL(pnfs_update_layout); int pnfs_layout_process(struct nfs4_layoutget *lgp) @@ -1048,35 +1080,71 @@ out_forget_reply: goto out; } +void +pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + req->wb_bytes, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_read_mds(pgio); + +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); + +void +pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) +{ + BUG_ON(pgio->pg_lseg != NULL); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + req->wb_bytes, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + nfs_pageio_reset_write_mds(pgio); +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); + bool -pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, - struct nfs_page *req) +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { - enum pnfs_iomode access_type; - gfp_t gfp_flags; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - /* We assume that pg_ioflags == 0 iff we're reading a page */ - if (pgio->pg_ioflags == 0) { - access_type = IOMODE_READ; - gfp_flags = GFP_KERNEL; - } else { - access_type = IOMODE_RW; - gfp_flags = GFP_NOFS; - } + if (ld == NULL) + return false; + nfs_pageio_init(pgio, inode, ld->pg_read_ops, server->rsize, 0); + return true; +} - if (pgio->pg_lseg == NULL) { - if (pgio->pg_count != prev->wb_bytes) - return true; - /* This is first coelesce call for a series of nfs_pages */ - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - prev->wb_context, - req_offset(prev), - pgio->pg_count, - access_type, - gfp_flags); - if (pgio->pg_lseg == NULL) - return true; - } +bool +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; + + if (ld == NULL) + return false; + nfs_pageio_init(pgio, inode, ld->pg_write_ops, server->wsize, ioflags); + return true; +} + +bool +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_lseg == NULL) + return nfs_generic_pg_test(pgio, prev, req); /* * Test if a nfs_page is fully contained in the pnfs_layout_range. @@ -1120,15 +1188,30 @@ pnfs_ld_write_done(struct nfs_write_data *data) } EXPORT_SYMBOL_GPL(pnfs_ld_write_done); -enum pnfs_try_status +static void +pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_write_data *data) +{ + list_splice_tail_init(&data->pages, &desc->pg_list); + if (data->req && list_empty(&data->req->wb_list)) + nfs_list_add_request(data->req, &desc->pg_list); + nfs_pageio_reset_write_mds(desc); + desc->pg_recoalesce = 1; + nfs_writedata_release(data); +} + +static enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *wdata, - const struct rpc_call_ops *call_ops, int how) + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg, + int how) { struct inode *inode = wdata->inode; enum pnfs_try_status trypnfs; struct nfs_server *nfss = NFS_SERVER(inode); wdata->mds_ops = call_ops; + wdata->lseg = get_lseg(lseg); dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, inode->i_ino, wdata->args.count, wdata->args.offset, how); @@ -1144,6 +1227,44 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, return trypnfs; } +static void +pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +{ + struct nfs_write_data *data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + + desc->pg_lseg = NULL; + while (!list_empty(head)) { + enum pnfs_try_status trypnfs; + + data = list_entry(head->next, struct nfs_write_data, list); + list_del_init(&data->list); + + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); + } + put_lseg(lseg); +} + +int +pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_flush(desc, &head); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + pnfs_do_multiple_writes(desc, &head, desc->pg_ioflags); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); + /* * Called by non rpc-based layout drivers */ @@ -1167,18 +1288,32 @@ pnfs_ld_read_done(struct nfs_read_data *data) } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); +static void +pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, + struct nfs_read_data *data) +{ + list_splice_tail_init(&data->pages, &desc->pg_list); + if (data->req && list_empty(&data->req->wb_list)) + nfs_list_add_request(data->req, &desc->pg_list); + nfs_pageio_reset_read_mds(desc); + desc->pg_recoalesce = 1; + nfs_readdata_release(data); +} + /* * Call the appropriate parallel I/O subsystem read function. */ -enum pnfs_try_status +static enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *rdata, - const struct rpc_call_ops *call_ops) + const struct rpc_call_ops *call_ops, + struct pnfs_layout_segment *lseg) { struct inode *inode = rdata->inode; struct nfs_server *nfss = NFS_SERVER(inode); enum pnfs_try_status trypnfs; rdata->mds_ops = call_ops; + rdata->lseg = get_lseg(lseg); dprintk("%s: Reading ino:%lu %u@%llu\n", __func__, inode->i_ino, rdata->args.count, rdata->args.offset); @@ -1194,17 +1329,56 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, return trypnfs; } +static void +pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + struct nfs_read_data *data; + const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; + struct pnfs_layout_segment *lseg = desc->pg_lseg; + + desc->pg_lseg = NULL; + while (!list_empty(head)) { + enum pnfs_try_status trypnfs; + + data = list_entry(head->next, struct nfs_read_data, list); + list_del_init(&data->list); + + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); + } + put_lseg(lseg); +} + +int +pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_pagein(desc, &head); + if (ret != 0) { + put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + return ret; + } + pnfs_do_multiple_reads(desc, &head); + return 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); + /* - * Currently there is only one (whole file) write lseg. + * There can be multiple RW segments. */ -static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) +static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) { - struct pnfs_layout_segment *lseg, *rv = NULL; + struct pnfs_layout_segment *lseg; - list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - rv = lseg; - return rv; + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { + if (lseg->pls_range.iomode == IOMODE_RW && + test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + list_add(&lseg->pls_lc_list, listp); + } } void @@ -1216,17 +1390,19 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) spin_lock(&nfsi->vfs_inode.i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - /* references matched in nfs4_layoutcommit_release */ - get_lseg(wdata->lseg); - wdata->lseg->pls_lc_cred = - get_rpccred(wdata->args.context->state->owner->so_cred); mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, wdata->inode->i_ino); } - if (end_pos > wdata->lseg->pls_end_pos) - wdata->lseg->pls_end_pos = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &wdata->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + } + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; spin_unlock(&nfsi->vfs_inode.i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, wdata->lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ @@ -1235,6 +1411,14 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata) } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutcommit) + nfss->pnfs_curr_ld->cleanup_layoutcommit(data); +} + /* * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough @@ -1248,8 +1432,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) { struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); - struct pnfs_layout_segment *lseg; - struct rpc_cred *cred; loff_t end_pos; int status = 0; @@ -1266,30 +1448,25 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) goto out; } + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { spin_unlock(&inode->i_lock); kfree(data); goto out; } - /* - * Currently only one (whole file) write lseg which is referenced - * in pnfs_set_layoutcommit and will be found. - */ - lseg = pnfs_list_write_lseg(inode); - end_pos = lseg->pls_end_pos; - cred = lseg->pls_lc_cred; - lseg->pls_end_pos = 0; - lseg->pls_lc_cred = NULL; + pnfs_list_write_lseg(inode, &data->lseg_list); + + end_pos = nfsi->layout->plh_lwb; + nfsi->layout->plh_lwb = 0; memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, sizeof(nfsi->layout->plh_stateid.data)); spin_unlock(&inode->i_lock); data->args.inode = inode; - data->lseg = lseg; - data->cred = cred; + data->cred = get_rpccred(nfsi->layout->plh_lc_cred); nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 96bf4e6..01cbfd5 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -36,16 +36,16 @@ enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ + NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ }; struct pnfs_layout_segment { struct list_head pls_list; + struct list_head pls_lc_list; struct pnfs_layout_range pls_range; atomic_t pls_refcount; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; - struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */ - loff_t pls_end_pos; /* LAYOUTCOMMIT write end */ }; enum pnfs_try_status { @@ -80,6 +80,9 @@ struct pnfs_layoutdriver_type { struct module *owner; unsigned flags; + int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); + int (*clear_layoutdriver) (struct nfs_server *); + struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags); void (*free_layout_hdr) (struct pnfs_layout_hdr *); @@ -87,7 +90,8 @@ struct pnfs_layoutdriver_type { void (*free_lseg) (struct pnfs_layout_segment *lseg); /* test for nfs page cache coalescing */ - bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + const struct nfs_pageio_ops *pg_read_ops; + const struct nfs_pageio_ops *pg_write_ops; /* Returns true if layoutdriver wants to divert this request to * driver's commit routine. @@ -109,6 +113,8 @@ struct pnfs_layoutdriver_type { struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); + void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); + void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); @@ -124,6 +130,8 @@ struct pnfs_layout_hdr { unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_flags; + loff_t plh_lwb; /* last write byte for layoutcommit */ + struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ struct inode *plh_inode; }; @@ -136,10 +144,21 @@ struct pnfs_device { unsigned int pglen; }; +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 + +struct pnfs_devicelist { + unsigned int eof; + unsigned int num_devs; + struct nfs4_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; +}; + extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); /* nfs4proc.c */ +extern int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev); extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); @@ -148,16 +167,16 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); void put_lseg(struct pnfs_layout_segment *lseg); -struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - loff_t pos, u64 count, enum pnfs_iomode access_type, - gfp_t gfp_flags); -void set_pnfs_layoutdriver(struct nfs_server *, u32 id); + +bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, int); + +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); -enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, - const struct rpc_call_ops *, int); -enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, - const struct rpc_call_ops *); +void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); +void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *, struct nfs_page *); +int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -178,10 +197,24 @@ void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier); void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_ld_write_done(struct nfs_write_data *); int pnfs_ld_read_done(struct nfs_read_data *); +struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + gfp_t gfp_flags); + +void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); + +/* nfs4_deviceid_flags */ +enum { + NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ +}; /* pnfs_dev.c */ struct nfs4_deviceid_node { @@ -189,13 +222,13 @@ struct nfs4_deviceid_node { struct hlist_node tmpnode; const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; + unsigned long flags; struct nfs4_deviceid deviceid; atomic_t ref; }; void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id); struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, const struct pnfs_layoutdriver_type *, @@ -293,15 +326,6 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, - struct inode *inode) -{ - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - - if (ld) - pgio->pg_test = ld->pg_test; -} - #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -322,28 +346,6 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg) { } -static inline struct pnfs_layout_segment * -pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, - loff_t pos, u64 count, enum pnfs_iomode access_type, - gfp_t gfp_flags) -{ - return NULL; -} - -static inline enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops) -{ - return PNFS_NOT_ATTEMPTED; -} - -static inline enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, int how) -{ - return PNFS_NOT_ATTEMPTED; -} - static inline int pnfs_return_layout(struct inode *ino) { return 0; @@ -377,7 +379,8 @@ pnfs_roc_drain(struct inode *ino, u32 *barrier) return false; } -static inline void set_pnfs_layoutdriver(struct nfs_server *s, u32 id) +static inline void set_pnfs_layoutdriver(struct nfs_server *s, + const struct nfs_fh *mntfh, u32 id) { } @@ -385,9 +388,14 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +static inline bool pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) +{ + return false; +} + +static inline bool pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { + return false; } static inline void diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index f0f8e1e..6fda522 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -100,8 +100,8 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, rcu_read_lock(); d = _lookup_deviceid(ld, clp, id, hash); - if (d && !atomic_inc_not_zero(&d->ref)) - d = NULL; + if (d != NULL) + atomic_inc(&d->ref); rcu_read_unlock(); return d; } @@ -115,15 +115,15 @@ nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); /* - * Unhash and put deviceid + * Remove a deviceid from cache * * @clp nfs_client associated with deviceid * @id the deviceid to unhash * * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise. */ -struct nfs4_deviceid_node * -nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, +void +nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, const struct nfs_client *clp, const struct nfs4_deviceid *id) { struct nfs4_deviceid_node *d; @@ -134,7 +134,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, rcu_read_unlock(); if (!d) { spin_unlock(&nfs4_deviceid_lock); - return NULL; + return; } hlist_del_init_rcu(&d->node); spin_unlock(&nfs4_deviceid_lock); @@ -142,28 +142,7 @@ nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld, /* balance the initial ref set in pnfs_insert_deviceid */ if (atomic_dec_and_test(&d->ref)) - return d; - - return NULL; -} -EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid); - -/* - * Delete a deviceid from cache - * - * @clp struct nfs_client qualifying the deviceid - * @id deviceid to delete - */ -void -nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id) -{ - struct nfs4_deviceid_node *d; - - d = nfs4_unhash_put_deviceid(ld, clp, id); - if (!d) - return; - d->ld->free_deviceid_node(d); + d->ld->free_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -177,6 +156,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, INIT_HLIST_NODE(&d->tmpnode); d->ld = ld; d->nfs_client = nfs_client; + d->flags = 0; d->deviceid = *id; atomic_set(&d->ref, 1); } @@ -221,16 +201,15 @@ EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); * * @d deviceid node to put * - * @ret true iff the node was deleted + * return true iff the node was deleted + * Note that since the test for d->ref == 0 is sufficient to establish + * that the node is no longer hashed in the global device id cache. */ bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { - if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock)) + if (!atomic_dec_and_test(&d->ref)) return false; - hlist_del_init_rcu(&d->node); - spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); d->ld->free_deviceid_node(d); return true; } @@ -275,3 +254,22 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp) for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) _deviceid_purge_client(clp, h); } + +/* + * Stop use of all deviceids associated with an nfs_client + */ +void +nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) +{ + struct nfs4_deviceid_node *d; + struct hlist_node *n; + int i; + + rcu_read_lock(); + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){ + hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node) + if (d->nfs_client == clp) + set_bit(NFS_DEVICEID_INVALID, &d->flags); + } + rcu_read_unlock(); +} diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a68679f..2171c04 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -30,8 +30,7 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); +static const struct nfs_pageio_ops nfs_pageio_read_ops; static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -68,7 +67,7 @@ void nfs_readdata_free(struct nfs_read_data *p) mempool_free(p, nfs_rdata_mempool); } -static void nfs_readdata_release(struct nfs_read_data *rdata) +void nfs_readdata_release(struct nfs_read_data *rdata) { put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); @@ -113,6 +112,27 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } +static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, + NFS_SERVER(inode)->rsize, 0); +} + +void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pageio_read_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); + +static void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + if (!pnfs_pageio_init_read(pgio, inode)) + nfs_pageio_init_read_mds(pgio, inode); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -131,14 +151,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_pageio_init(&pgio, inode, NULL, 0, 0); - nfs_list_add_request(new, &pgio.pg_list); - pgio.pg_count = len; - - if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(&pgio); - else - nfs_pagein_one(&pgio); + nfs_pageio_init_read(&pgio, inode); + nfs_pageio_add_request(&pgio, new); + nfs_pageio_complete(&pgio); return 0; } @@ -202,17 +217,14 @@ EXPORT_SYMBOL_GPL(nfs_initiate_read); /* * Set up the NFS read request struct */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - struct pnfs_layout_segment *lseg) +static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + unsigned int count, unsigned int offset) { struct inode *inode = req->wb_context->dentry->d_inode; data->req = req; data->inode = inode; data->cred = req->wb_context->cred; - data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -226,14 +238,36 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.count = count; data->res.eof = 0; nfs_fattr_init(&data->fattr); +} - if (data->lseg && - (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) - return 0; +static int nfs_do_read(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = data->args.context->dentry->d_inode; return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } +static int +nfs_do_multiple_reads(struct list_head *head, + const struct rpc_call_ops *call_ops) +{ + struct nfs_read_data *data; + int ret = 0; + + while (!list_empty(head)) { + int ret2; + + data = list_entry(head->next, struct nfs_read_data, list); + list_del_init(&data->list); + + ret2 = nfs_do_read(data, call_ops); + if (ret == 0) + ret = ret2; + } + return ret; +} + static void nfs_async_read_error(struct list_head *head) { @@ -260,20 +294,19 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_read_data *data; - size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; + size_t rsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg; - LIST_HEAD(list); nfs_list_remove_request(req); + offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); @@ -281,45 +314,21 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) data = nfs_readdata_alloc(1); if (!data) goto out_bad; - list_add(&data->pages, &list); + data->pagevec[0] = page; + nfs_read_rpcsetup(req, data, len, offset); + list_add(&data->list, res); requests++; nbytes -= len; + offset += len; } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - - BUG_ON(desc->pg_lseg != NULL); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_READ, GFP_KERNEL); ClearPageError(page); - offset = 0; - nbytes = desc->pg_count; - do { - int ret2; - - data = list_entry(list.next, struct nfs_read_data, pages); - list_del_init(&data->pages); - - data->pagevec[0] = page; - - if (nbytes < rsize) - rsize = nbytes; - ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset, lseg); - if (ret == 0) - ret = ret2; - offset += rsize; - nbytes -= rsize; - } while (nbytes != 0); - put_lseg(lseg); - desc->pg_lseg = NULL; - + desc->pg_rpc_callops = &nfs_read_partial_ops; return ret; - out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_read_data, pages); - list_del(&data->pages); + while (!list_empty(res)) { + data = list_entry(res->next, struct nfs_read_data, list); + list_del(&data->list); nfs_readdata_free(data); } SetPageError(page); @@ -327,19 +336,19 @@ out_bad: return -ENOMEM; } -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req; struct page **pages; struct nfs_read_data *data; struct list_head *head = &desc->pg_list; - struct pnfs_layout_segment *lseg = desc->pg_lseg; - int ret = -ENOMEM; + int ret = 0; data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, desc->pg_count)); if (!data) { nfs_async_read_error(head); + ret = -ENOMEM; goto out; } @@ -352,19 +361,37 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); - if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_READ, GFP_KERNEL); - ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, - 0, lseg); + nfs_read_rpcsetup(req, data, desc->pg_count, 0); + list_add(&data->list, res); + desc->pg_rpc_callops = &nfs_read_full_ops; out: - put_lseg(lseg); - desc->pg_lseg = NULL; return ret; } +int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_pagein_multi(desc, head); + return nfs_pagein_one(desc, head); +} + +static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_pagein(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_reads(&head, desc->pg_rpc_callops); + return ret; +} + +static const struct nfs_pageio_ops nfs_pageio_read_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_readpages, +}; + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). @@ -635,8 +662,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, .pgio = &pgio, }; struct inode *inode = mapping->host; - struct nfs_server *server = NFS_SERVER(inode); - size_t rsize = server->rsize; unsigned long npages; int ret = -ESTALE; @@ -664,10 +689,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - if (rsize < PAGE_CACHE_SIZE) - nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); - else - nfs_pageio_init(&pgio, inode, nfs_pagein_one, rsize, 0); + nfs_pageio_init_read(&pgio, inode); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 8d6864c..b2fbbde 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -147,7 +147,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n alias = d_lookup(parent, &data->args.name); if (alias != NULL) { - int ret = 0; + int ret; void *devname_garbage = NULL; /* @@ -155,14 +155,16 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * the sillyrename information to the aliased dentry. */ nfs_free_dname(data); + ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (alias->d_inode != NULL && + if (ret == 0 && alias->d_inode != NULL && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; alias->d_flags |= DCACHE_NFSFS_RENAMED; ret = 1; - } + } else + ret = 0; spin_unlock(&alias->d_lock); nfs_dec_sillycount(dir); dput(alias); @@ -171,8 +173,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n * point dentry is definitely not a root, so we won't need * that anymore. */ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return ret; } data->dir = igrab(dir); @@ -204,8 +205,6 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) if (parent == NULL) goto out_free; dir = parent->d_inode; - if (nfs_copy_dname(dentry, data) != 0) - goto out_dput; /* Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -366,6 +365,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) struct nfs_renamedata *data = calldata; struct inode *old_dir = data->old_dir; struct inode *new_dir = data->new_dir; + struct dentry *old_dentry = data->old_dentry; + struct dentry *new_dentry = data->new_dentry; if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) { nfs_restart_rpc(task, NFS_SERVER(old_dir)->nfs_client); @@ -373,12 +374,12 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) } if (task->tk_status != 0) { - nfs_cancel_async_unlink(data->old_dentry); + nfs_cancel_async_unlink(old_dentry); return; } - nfs_set_verifier(data->old_dentry, nfs_save_change_attribute(old_dir)); - d_move(data->old_dentry, data->new_dentry); + d_drop(old_dentry); + d_drop(new_dentry); } /** @@ -501,6 +502,14 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, * and only performs the unlink once the last reference to it is put. * * The final cleanup is done during dentry_iput. + * + * (Note: NFSv4 is stateful, and has opens, so in theory an NFSv4 server + * could take responsibility for keeping open files referenced. The server + * would also need to ensure that opened-but-deleted files were kept over + * reboots. However, we may not assume a server does so. (RFC 5661 + * does provide an OPEN4_RESULT_PRESERVE_UNLINKED flag that a server can + * use to advertise that it does this; some day we may take advantage of + * it.)) */ int nfs_sillyrename(struct inode *dir, struct dentry *dentry) @@ -560,6 +569,14 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (error) goto out_dput; + /* populate unlinkdata with the right dname */ + error = nfs_copy_dname(sdentry, + (struct nfs_unlinkdata *)dentry->d_fsdata); + if (error) { + nfs_cancel_async_unlink(dentry); + goto out_dput; + } + /* run the rename task, undo unlink if it fails */ task = nfs_async_rename(dir, dir, dentry, sdentry); if (IS_ERR(task)) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0857931..b39b37f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -97,7 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p) mempool_free(p, nfs_wdata_mempool); } -static void nfs_writedata_release(struct nfs_write_data *wdata) +void nfs_writedata_release(struct nfs_write_data *wdata) { put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); @@ -845,11 +845,9 @@ EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. */ -static int nfs_write_rpcsetup(struct nfs_page *req, +static void nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, - struct pnfs_layout_segment *lseg, int how) { struct inode *inode = req->wb_context->dentry->d_inode; @@ -860,7 +858,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->req = req; data->inode = inode = req->wb_context->dentry->d_inode; data->cred = req->wb_context->cred; - data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -872,24 +869,51 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { - data->args.stable = NFS_DATA_SYNC; - if (!nfs_need_commit(NFS_I(inode))) - data->args.stable = NFS_FILE_SYNC; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_need_commit(NFS_I(inode))) + break; + default: + data->args.stable = NFS_FILE_SYNC; } data->res.fattr = &data->fattr; data->res.count = count; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); +} - if (data->lseg && - (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) - return 0; +static int nfs_do_write(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + int how) +{ + struct inode *inode = data->args.context->dentry->d_inode; return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } +static int nfs_do_multiple_writes(struct list_head *head, + const struct rpc_call_ops *call_ops, + int how) +{ + struct nfs_write_data *data; + int ret = 0; + + while (!list_empty(head)) { + int ret2; + + data = list_entry(head->next, struct nfs_write_data, list); + list_del_init(&data->list); + + ret2 = nfs_do_write(data, call_ops, how); + if (ret == 0) + ret = ret2; + } + return ret; +} + /* If a nfs_flush_* function fails, it should remove reqs from @head and * call this on each, which will prepare them to be retried on next * writeback using standard nfs. @@ -907,17 +931,15 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; + size_t wsize = desc->pg_bsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; - struct pnfs_layout_segment *lseg; - LIST_HEAD(list); nfs_list_remove_request(req); @@ -927,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) desc->pg_ioflags &= ~FLUSH_COND_STABLE; + offset = 0; nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -934,45 +957,21 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) data = nfs_writedata_alloc(1); if (!data) goto out_bad; - list_add(&data->pages, &list); + data->pagevec[0] = page; + nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); + list_add(&data->list, res); requests++; nbytes -= len; + offset += len; } while (nbytes != 0); atomic_set(&req->wb_complete, requests); - - BUG_ON(desc->pg_lseg); - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_RW, GFP_NOFS); - ClearPageError(page); - offset = 0; - nbytes = desc->pg_count; - do { - int ret2; - - data = list_entry(list.next, struct nfs_write_data, pages); - list_del_init(&data->pages); - - data->pagevec[0] = page; - - if (nbytes < wsize) - wsize = nbytes; - ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, lseg, desc->pg_ioflags); - if (ret == 0) - ret = ret2; - offset += wsize; - nbytes -= wsize; - } while (nbytes != 0); - - put_lseg(lseg); - desc->pg_lseg = NULL; + desc->pg_rpc_callops = &nfs_write_partial_ops; return ret; out_bad: - while (!list_empty(&list)) { - data = list_entry(list.next, struct nfs_write_data, pages); - list_del(&data->pages); + while (!list_empty(res)) { + data = list_entry(res->next, struct nfs_write_data, list); + list_del(&data->list); nfs_writedata_free(data); } nfs_redirty_request(req); @@ -987,14 +986,13 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc, struct list_head *res) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; struct list_head *head = &desc->pg_list; - struct pnfs_layout_segment *lseg = desc->pg_lseg; - int ret; + int ret = 0; data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, desc->pg_count)); @@ -1016,32 +1014,62 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); - if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, - req_offset(req), desc->pg_count, - IOMODE_RW, GFP_NOFS); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); + nfs_write_rpcsetup(req, data, desc->pg_count, 0, desc->pg_ioflags); + list_add(&data->list, res); + desc->pg_rpc_callops = &nfs_write_full_ops; out: - put_lseg(lseg); /* Cleans any gotten in ->pg_test */ - desc->pg_lseg = NULL; return ret; } -static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, +int nfs_generic_flush(struct nfs_pageio_descriptor *desc, struct list_head *head) +{ + if (desc->pg_bsize < PAGE_CACHE_SIZE) + return nfs_flush_multi(desc, head); + return nfs_flush_one(desc, head); +} + +static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) +{ + LIST_HEAD(head); + int ret; + + ret = nfs_generic_flush(desc, &head); + if (ret == 0) + ret = nfs_do_multiple_writes(&head, desc->pg_rpc_callops, + desc->pg_ioflags); + return ret; +} + +static const struct nfs_pageio_ops nfs_pageio_write_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_writepages, +}; + +static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags) { - size_t wsize = NFS_SERVER(inode)->wsize; + nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, + NFS_SERVER(inode)->wsize, ioflags); +} + +void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_ops = &nfs_pageio_write_ops; + pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; +} +EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); - if (wsize < PAGE_CACHE_SIZE) - nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); - else - nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags); +static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, + struct inode *inode, int ioflags) +{ + if (!pnfs_pageio_init_write(pgio, inode, ioflags)) + nfs_pageio_init_write_mds(pgio, inode, ioflags); } /* @@ -1566,8 +1594,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int status; bool sync = true; - if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || - wbc->for_background) + if (wbc->sync_mode == WB_SYNC_NONE) sync = false; status = pnfs_layoutcommit_inode(inode, sync); diff --git a/fs/notify/group.c b/fs/notify/group.c index d309f38..63fc294 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -26,7 +26,7 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -#include <asm/atomic.h> +#include <linux/atomic.h> /* * Final freeing of a group diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 07ea8d3..b13c00a 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -23,7 +23,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 252ab1f..e14587d 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -92,7 +92,7 @@ #include <linux/spinlock.h> #include <linux/srcu.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/notification.c b/fs/notify/notification.c index f39260f..ee18815 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -43,7 +43,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index e86577d..778fe6c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -24,7 +24,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 2dabf81..fe8e7e9 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -24,7 +24,7 @@ #ifndef _LINUX_NTFS_INODE_H #define _LINUX_NTFS_INODE_H -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/list.h> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 783c58d..a721907 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; ret = posix_acl_equiv_mode(acl, &mode); if (ret < 0) return ret; @@ -351,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; int ret = 0, ret2; - mode_t mode; + umode_t mode; if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 3b8d397..98e5442 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -93,7 +93,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) memset(bh->b_data, 0, sizeof(struct omfs_inode)); - if (inode->i_mode & S_IFDIR) { + if (S_ISDIR(inode->i_mode)) { memset(&bh->b_data[OMFS_DIR_START], 0xff, sbi->s_sys_blocksize - OMFS_DIR_START); } else @@ -446,74 +446,52 @@ out: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +static int chmod_common(struct path *path, umode_t mode) { - struct inode * inode; - struct dentry * dentry; - struct file * file; - int err = -EBADF; + struct inode *inode = path->dentry->d_inode; struct iattr newattrs; + int error; - file = fget(fd); - if (!file) - goto out; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - - audit_inode(NULL, dentry); - - err = mnt_want_write_file(file); - if (err) - goto out_putf; + error = mnt_want_write(path->mnt); + if (error) + return error; mutex_lock(&inode->i_mutex); - err = security_path_chmod(dentry, file->f_vfsmnt, mode); - if (err) + error = security_path_chmod(path->dentry, path->mnt, mode); + if (error) goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs); out_unlock: mutex_unlock(&inode->i_mutex); - mnt_drop_write(file->f_path.mnt); -out_putf: - fput(file); -out: + mnt_drop_write(path->mnt); + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +{ + struct file * file; + int err = -EBADF; + + file = fget(fd); + if (file) { + audit_inode(NULL, file->f_path.dentry); + err = chmod_common(&file->f_path, mode); + fput(file); + } return err; } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) { struct path path; - struct inode *inode; int error; - struct iattr newattrs; error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); - if (error) - goto out; - inode = path.dentry->d_inode; - - error = mnt_want_write(path.mnt); - if (error) - goto dput_and_out; - mutex_lock(&inode->i_mutex); - error = security_path_chmod(path.dentry, path.mnt, mode); - if (error) - goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path.dentry, &newattrs); -out_unlock: - mutex_unlock(&inode->i_mutex); - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); -out: + if (!error) { + error = chmod_common(&path, mode); + path_put(&path); + } return error; } @@ -948,7 +948,7 @@ static const struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { - struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index a6227d2..10027b4 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -14,7 +14,7 @@ #include <linux/kernel.h> #include <linux/slab.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> @@ -149,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl) * file mode permission bits, or else 1. Returns -E... on error. */ int -posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) +posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) { const struct posix_acl_entry *pa, *pe; - mode_t mode = 0; + umode_t mode = 0; int not_equiv = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -188,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) * Create an ACL representing the file mode permission bits of an inode. */ struct posix_acl * -posix_acl_from_mode(mode_t mode, gfp_t flags) +posix_acl_from_mode(umode_t mode, gfp_t flags) { struct posix_acl *acl = posix_acl_alloc(3, flags); if (!acl) @@ -279,11 +279,11 @@ check_perm: * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. */ -static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) +static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; - mode_t mode = *mode_p; + umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ @@ -336,7 +336,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) +static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -382,7 +382,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) +posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -400,7 +400,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) EXPORT_SYMBOL(posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) +posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; diff --git a/fs/proc/base.c b/fs/proc/base.c index c9e3f65..5eb0206 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1118,7 +1118,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, * Warn that /proc/pid/oom_adj is deprecated, see * Documentation/feature-removal-schedule.txt. */ - WARN_ONCE(1, "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", current->comm, task_pid_nr(current), task_pid_nr(task), task_pid_nr(task)); task->signal->oom_adj = oom_adjust; @@ -1919,6 +1919,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + if (path) { *path = file->f_path; path_get(&file->f_path); @@ -1928,7 +1936,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) "pos:\t%lli\n" "flags:\t0%o\n", (long long) file->f_pos, - file->f_flags); + f_flags); spin_unlock(&files->file_lock); put_files_struct(files); return 0; @@ -2706,9 +2714,16 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { struct task_io_accounting acct = task->ioac; unsigned long flags; + int result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2719,7 +2734,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) unlock_task_sighand(task, &flags); } - return sprintf(buffer, + result = sprintf(buffer, "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" @@ -2734,6 +2749,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) (unsigned long long)acct.read_bytes, (unsigned long long)acct.write_bytes, (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; } static int proc_tid_io_accounting(struct task_struct *task, char *buffer) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f1637f1..9d99131 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -620,8 +620,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, if (!ent) goto out; memset(ent, 0, sizeof(struct proc_dir_entry)); - memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); - ent->name = ((char *) ent) + sizeof(*ent); + memcpy(ent->name, fn, len + 1); ent->namelen = len; ent->mode = mode; ent->nlink = nlink; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 74b48cf..7ed72d6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -319,7 +319,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); kfree(pdeo); - return -EINVAL; + return -ENOENT; } pde->pde_users++; open = pde->proc_fops->open; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index ed257d1..5861741 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -10,7 +10,7 @@ #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 9020ac1..f738024 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -197,15 +197,15 @@ static __net_init int proc_net_ns_init(struct net *net) int err; err = -ENOMEM; - netd = kzalloc(sizeof(*netd), GFP_KERNEL); + netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); if (!netd) goto out; netd->data = net; netd->nlink = 2; - netd->name = "net"; netd->namelen = 3; netd->parent = &proc_root; + memcpy(netd->name, "net", 4); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); diff --git a/fs/proc/root.c b/fs/proc/root.c index d6c3b41..9a8a2b7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -186,13 +186,13 @@ static const struct inode_operations proc_root_inode_operations = { struct proc_dir_entry proc_root = { .low_ino = PROC_ROOT_INO, .namelen = 5, - .name = "/proc", .mode = S_IFDIR | S_IRUGO | S_IXUGO, .nlink = 2, .count = ATOMIC_INIT(1), .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .name = "/proc", }; int pid_ns_prepare_proc(struct pid_namespace *ns) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 977ed27..893b961 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -39,8 +39,9 @@ #define PSTORE_NAMELEN 64 struct pstore_private { + struct pstore_info *psi; + enum pstore_type_id type; u64 id; - int (*erase)(u64); ssize_t size; char data[]; }; @@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; - p->erase(p->id); + p->psi->erase(p->type, p->id, p->psi); return simple_unlink(dir, dentry); } @@ -175,8 +176,8 @@ int pstore_is_mounted(void) * Set the mtime & ctime to the date that this record was originally stored. */ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, - char *data, size_t size, - struct timespec time, int (*erase)(u64)) + char *data, size_t size, struct timespec time, + struct pstore_info *psi) { struct dentry *root = pstore_sb->s_root; struct dentry *dentry; @@ -192,8 +193,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, private = kmalloc(sizeof *private + size, GFP_KERNEL); if (!private) goto fail_alloc; + private->type = type; private->id = id; - private->erase = erase; + private->psi = psi; switch (type) { case PSTORE_TYPE_DMESG: diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 8c9f23e..611c1b3 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(void); extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, char *data, size_t size, - struct timespec time, int (*erase)(u64)); + struct timespec time, struct pstore_info *psi); extern int pstore_is_mounted(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f2c3ff2..c5300ec 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -37,6 +37,8 @@ static DEFINE_SPINLOCK(pstore_lock); static struct pstore_info *psinfo; +static char *backend; + /* How much of the console log to snapshot */ static unsigned long kmsg_bytes = 10240; @@ -67,7 +69,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long size, total = 0; char *dst, *why; u64 id; - int hsize, part = 1; + int hsize; + unsigned int part = 1; if (reason < ARRAY_SIZE(reason_str)) why = reason_str[reason]; @@ -78,7 +81,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); + hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; @@ -94,14 +97,16 @@ static void pstore_dump(struct kmsg_dumper *dumper, memcpy(dst, s1 + s1_start, l1_cpy); memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); + id = psinfo->write(PSTORE_TYPE_DMESG, part, + hsize + l1_cpy + l2_cpy, psinfo); if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, hsize + l1_cpy + l2_cpy, - CURRENT_TIME, psinfo->erase); + CURRENT_TIME, psinfo); l1 -= l1_cpy; l2 -= l2_cpy; total += l1_cpy + l2_cpy; + part++; } mutex_unlock(&psinfo->buf_mutex); } @@ -128,6 +133,12 @@ int pstore_register(struct pstore_info *psi) spin_unlock(&pstore_lock); return -EBUSY; } + + if (backend && strcmp(backend, psi->name)) { + spin_unlock(&pstore_lock); + return -EINVAL; + } + psinfo = psi; spin_unlock(&pstore_lock); @@ -166,9 +177,9 @@ void pstore_get_records(void) if (rc) goto out; - while ((size = psi->read(&id, &type, &time)) > 0) { + while ((size = psi->read(&id, &type, &time, psi)) > 0) { if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, - time, psi->erase)) + time, psi)) failed++; } psi->close(psi); @@ -196,12 +207,15 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) mutex_lock(&psinfo->buf_mutex); memcpy(psinfo->buf, buf, size); - id = psinfo->write(type, size); + id = psinfo->write(type, 0, size, psinfo); if (pstore_is_mounted()) pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, - size, CURRENT_TIME, psinfo->erase); + size, CURRENT_TIME, psinfo); mutex_unlock(&psinfo->buf_mutex); return 0; } EXPORT_SYMBOL_GPL(pstore_write); + +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "Pstore backend to use"); diff --git a/fs/read_write.c b/fs/read_write.c index 5907b49..179f1c3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -166,8 +166,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * long as offset isn't at the end of the file then the * offset is data. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } break; case SEEK_HOLE: /* @@ -175,8 +177,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) * as long as offset isn't i_size or larger, return * i_size. */ - if (offset >= inode->i_size) - return -ENXIO; + if (offset >= inode->i_size) { + retval = -ENXIO; + goto out; + } offset = inode->i_size; break; } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 7362cf4..6da0396e 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); + error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; else { - inode->i_mode = mode; if (error == 0) acl = NULL; } @@ -354,8 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, return PTR_ERR(acl); if (acl) { - mode_t mode = inode->i_mode; - /* Copy the default ACL to the default ACL of a new directory */ if (S_ISDIR(inode->i_mode)) { err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, @@ -366,12 +362,10 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, /* Now we reconcile the new ACL and the mode, potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &mode); + err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); if (err < 0) return err; - inode->i_mode = mode; - /* If we need an ACL.. */ if (err > 0) err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); @@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * * We don't actually know what locking is used at the lower level; * but if it's a filesystem that supports quotas, it will be using - * i_lock as in inode_add_bytes(). tmpfs uses other locking, and - * its 32-bit is (just) able to exceed 2TB i_size with the aid of - * holes; but its i_blocks cannot carry into the upper long without - * almost 2TB swap - let's ignore that case. + * i_lock as in inode_add_bytes(). */ if (sizeof(i_blocks) > sizeof(long)) spin_lock(&src->i_lock); @@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->size = i_size_read(inode); - stat->blocks = inode->i_blocks; stat->blksize = (1 << inode->i_blkbits); + stat->blocks = inode->i_blocks; } EXPORT_SYMBOL(generic_fillattr); diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 45174b5..feb361e 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -335,9 +335,9 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); #define DBGKEY(key) ((char *)(key)) #define DBGKEY1(key) ((char *)(key)) -#define ubifs_dbg_msg(fmt, ...) do { \ - if (0) \ - pr_debug(fmt "\n", ##__VA_ARGS__); \ +#define ubifs_dbg_msg(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \ } while (0) #define dbg_dump_stack() diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 75bb316..427a4e8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -16,44 +16,53 @@ # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # -ccflags-y := -I$(src) -I$(src)/linux-2.6 -ccflags-$(CONFIG_XFS_DEBUG) += -g +ccflags-y += -I$(src) # needed for trace events -XFS_LINUX := linux-2.6 +ccflags-$(CONFIG_XFS_DEBUG) += -g obj-$(CONFIG_XFS_FS) += xfs.o -xfs-y += linux-2.6/xfs_trace.o - -xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ - xfs_dquot.o \ - xfs_dquot_item.o \ - xfs_trans_dquot.o \ - xfs_qm_syscalls.o \ - xfs_qm_bhv.o \ - xfs_qm.o) -xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o - -ifeq ($(CONFIG_XFS_QUOTA),y) -xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o -endif - -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o -xfs-$(CONFIG_XFS_POSIX_ACL) += $(XFS_LINUX)/xfs_acl.o -xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o -xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o -xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o +# this one should be compiled first, as the tracing macros can easily blow up +xfs-y += xfs_trace.o +# highlevel code +xfs-y += xfs_aops.o \ + xfs_bit.o \ + xfs_buf.o \ + xfs_dfrag.o \ + xfs_discard.o \ + xfs_error.o \ + xfs_export.o \ + xfs_file.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_fs_subr.o \ + xfs_globals.o \ + xfs_iget.o \ + xfs_ioctl.o \ + xfs_iomap.o \ + xfs_iops.o \ + xfs_itable.o \ + xfs_message.o \ + xfs_mru_cache.o \ + xfs_super.o \ + xfs_sync.o \ + xfs_xattr.o \ + xfs_rename.o \ + xfs_rw.o \ + xfs_utils.o \ + xfs_vnodeops.o \ + kmem.o \ + uuid.o +# code shared with libxfs xfs-y += xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ xfs_attr_leaf.o \ - xfs_bit.o \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ - xfs_buf_item.o \ xfs_da_btree.o \ xfs_dir2.o \ xfs_dir2_block.o \ @@ -61,49 +70,37 @@ xfs-y += xfs_alloc.o \ xfs_dir2_leaf.o \ xfs_dir2_node.o \ xfs_dir2_sf.o \ - xfs_error.o \ - xfs_extfree_item.o \ - xfs_filestream.o \ - xfs_fsops.o \ xfs_ialloc.o \ xfs_ialloc_btree.o \ - xfs_iget.o \ xfs_inode.o \ - xfs_inode_item.o \ - xfs_iomap.o \ - xfs_itable.o \ - xfs_dfrag.o \ - xfs_log.o \ - xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ - xfs_mru_cache.o \ - xfs_rename.o \ - xfs_trans.o \ + xfs_trans.o + +# low-level transaction/log code +xfs-y += xfs_log.o \ + xfs_log_cil.o \ + xfs_buf_item.o \ + xfs_extfree_item.o \ + xfs_inode_item.o \ xfs_trans_ail.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ - xfs_utils.o \ - xfs_vnodeops.o \ - xfs_rw.o - -# Objects in linux/ -xfs-y += $(addprefix $(XFS_LINUX)/, \ - kmem.o \ - xfs_aops.o \ - xfs_buf.o \ - xfs_discard.o \ - xfs_export.o \ - xfs_file.o \ - xfs_fs_subr.o \ - xfs_globals.o \ - xfs_ioctl.o \ - xfs_iops.o \ - xfs_message.o \ - xfs_super.o \ - xfs_sync.o \ - xfs_xattr.o) -# Objects in support/ -xfs-y += support/uuid.o +# optional features +xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o \ + xfs_quotaops.o +ifeq ($(CONFIG_XFS_QUOTA),y) +xfs-$(CONFIG_PROC_FS) += xfs_qm_stats.o +endif +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += xfs_stats.o +xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/kmem.c index a907de5..a907de5 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/kmem.c diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h index 292eff1..292eff1 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/kmem.h diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h index ff6a198..ff6a198 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/mrlock.h diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h index 387e695..387e695 100644 --- a/fs/xfs/linux-2.6/time.h +++ b/fs/xfs/time.h diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c index b83f76b..b83f76b 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/uuid.c diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h index 4732d71..4732d71 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/uuid.h diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 53ec3ea..d8b11b7 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,5 +24,6 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif -#include <linux-2.6/xfs_linux.h> +#include "xfs_linux.h" + #endif /* __XFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/xfs_acl.c index 44ce516..b6c4b37 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -221,7 +221,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } static int -xfs_set_mode(struct inode *inode, mode_t mode) +xfs_set_mode(struct inode *inode, umode_t mode) { int error = 0; @@ -267,7 +267,7 @@ posix_acl_default_exists(struct inode *inode) int xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; int error = 0, inherit = 0; if (S_ISDIR(inode->i_mode)) { @@ -381,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; if (type == ACL_TYPE_ACCESS) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 2c656ef..39632d9 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -51,7 +51,10 @@ extern int posix_acl_default_exists(struct inode *inode); extern const struct xattr_handler xfs_xattr_acl_access_handler; extern const struct xattr_handler xfs_xattr_acl_default_handler; #else -# define xfs_get_acl(inode, type) NULL +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} # define xfs_inherit_acl(inode, default_acl) 0 # define xfs_acl_chmod(inode) 0 # define posix_acl_access_exists(inode) 0 diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 6530769..4805f00 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -103,7 +103,7 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); @@ -156,7 +156,7 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); @@ -168,7 +168,7 @@ extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) #define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) typedef struct xfs_agfl { __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 1e00b3e..bdd9cb5 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -451,8 +451,7 @@ xfs_alloc_read_agfl( XFS_FSS_TO_BB(mp, 1), 0, &bp); if (error) return error; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); *bpp = bp; return 0; @@ -2116,7 +2115,7 @@ xfs_read_agf( if (!*bpp) return 0; - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); /* @@ -2168,7 +2167,7 @@ xfs_alloc_read_agf( return error; if (!*bpp) return 0; - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!(*bpp)->b_error); agf = XFS_BUF_TO_AGF(*bpp); pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/xfs_aops.c index 63e971e..8c37dde 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1300,6 +1300,7 @@ xfs_end_io_direct_write( bool is_async) { struct xfs_ioend *ioend = iocb->private; + struct inode *inode = ioend->io_inode; /* * blockdev_direct_IO can return an error even after the I/O @@ -1331,7 +1332,7 @@ xfs_end_io_direct_write( } /* XXX: probably should move into the real I/O completion handler */ - inode_dio_done(ioend->io_inode); + inode_dio_done(inode); } STATIC ssize_t diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h index 71f721e..71f721e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/xfs_aops.h diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index cbae424..160bcdc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2121,8 +2121,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, XBF_LOCK | XBF_DONT_BLOCK); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : XFS_BUF_SIZE(bp); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c51a3f9..452a291 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -414,7 +414,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; @@ -3344,8 +3344,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. * So sending the data fork of a regular inode is invalid. */ - ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && - whichfork == XFS_DATA_FORK)); + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); flags = 0; @@ -3384,8 +3383,7 @@ xfs_bmap_local_to_extents( ASSERT(args.len == 1); *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, - ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); xfs_bmap_forkoff_reset(args.mp, ip, whichfork); xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); @@ -4052,7 +4050,7 @@ xfs_bmap_one_block( #ifndef DEBUG if (whichfork == XFS_DATA_FORK) { - return ((ip->i_d.di_mode & S_IFMT) == S_IFREG) ? + return S_ISREG(ip->i_d.di_mode) ? (ip->i_size == ip->i_mount->m_sb.sb_blocksize) : (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); } diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index cabf4b5..2b9fd38 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -275,8 +275,7 @@ xfs_btree_dup_cursor( return error; } new->bc_bufs[i] = bp; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); } else new->bc_bufs[i] = NULL; } @@ -467,8 +466,7 @@ xfs_btree_get_bufl( ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); return bp; } @@ -491,8 +489,7 @@ xfs_btree_get_bufs( ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); return bp; } @@ -632,7 +629,7 @@ xfs_btree_read_bufl( mp->m_bsize, lock, &bp))) { return error; } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); if (bp) XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); *bpp = bp; @@ -973,8 +970,7 @@ xfs_btree_get_buf_block( *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags); - ASSERT(*bpp); - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; @@ -1006,8 +1002,7 @@ xfs_btree_read_buf_block( if (error) return error; - ASSERT(*bpp != NULL); - ASSERT(!XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 8d05a6a..5b240de 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -262,7 +262,7 @@ typedef struct xfs_btree_cur /* * Convert from buffer to btree block header. */ -#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) /* diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/xfs_buf.c index b2b4119..c57836d 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -596,7 +596,7 @@ _xfs_buf_read( bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); status = xfs_buf_iorequest(bp); - if (status || XFS_BUF_ISERROR(bp) || (flags & XBF_ASYNC)) + if (status || bp->b_error || (flags & XBF_ASYNC)) return status; return xfs_buf_iowait(bp); } @@ -679,7 +679,6 @@ xfs_buf_read_uncached( /* set up the buffer for a read IO */ XFS_BUF_SET_ADDR(bp, daddr); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); xfsbdstrat(mp, bp); error = xfs_buf_iowait(bp); @@ -1069,7 +1068,7 @@ xfs_bioerror( /* * No need to wait until the buffer is unpinned, we aren't flushing it. */ - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); /* * We're calling xfs_buf_ioend, so delete XBF_DONE flag. @@ -1094,7 +1093,7 @@ STATIC int xfs_bioerror_relse( struct xfs_buf *bp) { - int64_t fl = XFS_BUF_BFLAGS(bp); + int64_t fl = bp->b_flags; /* * No need to wait until the buffer is unpinned. * We aren't flushing it. @@ -1115,7 +1114,7 @@ xfs_bioerror_relse( * There's no reason to mark error for * ASYNC buffers. */ - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); XFS_BUF_FINISH_IOWAIT(bp); } else { xfs_buf_relse(bp); @@ -1224,6 +1223,9 @@ _xfs_buf_ioapply( rw = READ; } + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + next_chunk: atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); @@ -1321,7 +1323,7 @@ xfs_buf_offset( struct page *page; if (bp->b_flags & XBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; + return bp->b_addr + offset; offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; @@ -1481,7 +1483,7 @@ xfs_setsize_buftarg_flags( if (set_blocksize(btp->bt_bdev, sectorsize)) { xfs_warn(btp->bt_mount, "Cannot set_blocksize to %u on device %s\n", - sectorsize, XFS_BUFTARG_NAME(btp)); + sectorsize, xfs_buf_target_name(btp)); return EINVAL; } @@ -1678,7 +1680,7 @@ xfs_buf_delwri_split( list_for_each_entry_safe(bp, n, dwq, b_list) { ASSERT(bp->b_flags & XBF_DELWRI); - if (!XFS_BUF_ISPINNED(bp) && xfs_buf_trylock(bp)) { + if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) { if (!force && time_before(jiffies, bp->b_queuetime + age)) { xfs_buf_unlock(bp); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/xfs_buf.h index 6a83b46..620972b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -228,11 +228,15 @@ extern void xfs_buf_delwri_promote(xfs_buf_t *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -#define xfs_buf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) +static inline const char * +xfs_buf_target_name(struct xfs_buftarg *target) +{ + static char __b[BDEVNAME_SIZE]; + + return bdevname(target->bt_bdev, __b); +} -#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) @@ -251,23 +255,14 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) -#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) -#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) -#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) - #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) -#define XFS_BUF_BUSY(bp) do { } while (0) -#define XFS_BUF_UNBUSY(bp) do { } while (0) -#define XFS_BUF_ISBUSY(bp) (1) - #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) -#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) @@ -276,10 +271,6 @@ void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_SET_START(bp) do { } while (0) - -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) -#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) #define XFS_BUF_ADDR(bp) ((bp)->b_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) @@ -299,14 +290,13 @@ xfs_buf_set_ref( #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) -#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) +static inline int xfs_buf_ispinned(struct xfs_buf *bp) +{ + return atomic_read(&bp->b_pin_count); +} #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); -#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) -#define XFS_BUF_TARGET(bp) ((bp)->b_target) -#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) - static inline void xfs_buf_relse(xfs_buf_t *bp) { xfs_buf_unlock(bp); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8849291..cac2ecf 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -124,9 +124,9 @@ xfs_buf_item_log_check( bp = bip->bli_buf; ASSERT(XFS_BUF_COUNT(bp) > 0); - ASSERT(XFS_BUF_PTR(bp) != NULL); + ASSERT(bp->b_addr != NULL); orig = bip->bli_orig; - buffer = XFS_BUF_PTR(bp); + buffer = bp->b_addr; for (x = 0; x < XFS_BUF_COUNT(bp); x++) { if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { xfs_emerg(bp->b_mount, @@ -371,7 +371,6 @@ xfs_buf_item_pin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); - ASSERT(XFS_BUF_ISBUSY(bip->bli_buf)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); @@ -479,13 +478,13 @@ xfs_buf_item_trylock( struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; /* take a reference to the buffer. */ - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); trace_xfs_buf_item_trylock(bip); @@ -726,7 +725,7 @@ xfs_buf_item_init( * to have logged. */ bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); + memcpy(bip->bli_orig, bp->b_addr, XFS_BUF_COUNT(bp)); bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); #endif @@ -895,7 +894,6 @@ xfs_buf_attach_iodone( { xfs_log_item_t *head_lip; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; @@ -960,7 +958,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!XFS_BUF_GETERROR(bp))) + if (likely(!xfs_buf_geterror(bp))) goto do_callbacks; /* @@ -973,14 +971,14 @@ xfs_buf_iodone_callbacks( goto do_callbacks; } - if (XFS_BUF_TARGET(bp) != lasttarg || + if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { lasttime = jiffies; xfs_alert(mp, "Device %s: metadata write error block 0x%llx", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + xfs_buf_target_name(bp->b_target), (__uint64_t)XFS_BUF_ADDR(bp)); } - lasttarg = XFS_BUF_TARGET(bp); + lasttarg = bp->b_target; /* * If the write was asynchronous then no one will be looking for the @@ -991,12 +989,11 @@ xfs_buf_iodone_callbacks( * around. */ if (XFS_BUF_ISASYNC(bp)) { - XFS_BUF_ERROR(bp, 0); /* errno of 0 unsets the flag */ + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ if (!XFS_BUF_ISSTALE(bp)) { XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); } ASSERT(bp->b_iodone != NULL); trace_xfs_buf_item_iodone_async(bp, _RET_IP_); @@ -1013,7 +1010,6 @@ xfs_buf_iodone_callbacks( XFS_BUF_UNDELAYWRITE(bp); trace_xfs_buf_error_relse(bp, _RET_IP_); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); do_callbacks: xfs_buf_do_callbacks(bp); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 2925726..ee9d542 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -692,6 +692,24 @@ xfs_da_join(xfs_da_state_t *state) return(error); } +#ifdef DEBUG +static void +xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) +{ + __be16 magic = blkinfo->magic; + + if (level == 1) { + ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + } else + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); +} +#else /* !DEBUG */ +#define xfs_da_blkinfo_onlychild_validate(blkinfo, level) +#endif /* !DEBUG */ + /* * We have only one entry in the root. Copy the only remaining child of * the old root to block 0 as the new root node. @@ -700,8 +718,6 @@ STATIC int xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) { xfs_da_intnode_t *oldroot; - /* REFERENCED */ - xfs_da_blkinfo_t *blkinfo; xfs_da_args_t *args; xfs_dablk_t child; xfs_dabuf_t *bp; @@ -732,15 +748,9 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) if (error) return(error); ASSERT(bp != NULL); - blkinfo = bp->data; - if (be16_to_cpu(oldroot->hdr.level) == 1) { - ASSERT(blkinfo->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || - blkinfo->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); - } else { - ASSERT(blkinfo->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); - } - ASSERT(!blkinfo->forw); - ASSERT(!blkinfo->back); + xfs_da_blkinfo_onlychild_validate(bp->data, + be16_to_cpu(oldroot->hdr.level)); + memcpy(root_blk->bp->data, bp->data, state->blocksize); xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); error = xfs_da_shrink_inode(args, child, bp); @@ -2040,7 +2050,7 @@ xfs_da_do_buf( case 0: bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, mappedbno, nmapped, 0); - error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); + error = bp ? bp->b_error : XFS_ERROR(EIO); break; case 1: case 2: @@ -2258,7 +2268,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) dabuf->nbuf = 1; bp = bps[0]; dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); - dabuf->data = XFS_BUF_PTR(bp); + dabuf->data = bp->b_addr; dabuf->bps[0] = bp; } else { dabuf->nbuf = nbuf; @@ -2269,7 +2279,7 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps) dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { bp = bps[i]; - memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), + memcpy((char *)dabuf->data + off, bp->b_addr, XFS_BUF_COUNT(bp)); } } @@ -2292,8 +2302,8 @@ xfs_da_buf_clean(xfs_dabuf_t *dabuf) for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { bp = dabuf->bps[i]; - memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, - XFS_BUF_COUNT(bp)); + memcpy(bp->b_addr, dabuf->data + off, + XFS_BUF_COUNT(bp)); } } } @@ -2330,7 +2340,7 @@ xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); + ASSERT(dabuf->data == dabuf->bps[0]->b_addr); xfs_trans_log_buf(tp, dabuf->bps[0], first, last); return; } diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index dffba9b..a372163 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -148,7 +148,7 @@ typedef enum xfs_dinode_fmt { be32_to_cpu((dip)->di_nextents) : \ be16_to_cpu((dip)->di_anextents)) -#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) /* * For block and character special files the 32bit dev_t is stored at the diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 4580ce0..a2e2701 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -121,7 +121,7 @@ xfs_dir_isempty( { xfs_dir2_sf_hdr_t *sfp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -179,7 +179,7 @@ xfs_dir_init( memset((char *)&args, 0, sizeof(args)); args.dp = dp; args.trans = tp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) return error; return xfs_dir2_sf_create(&args, pdp->i_ino); @@ -202,7 +202,7 @@ xfs_dir_createname( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; XFS_STATS_INC(xs_dir_create); @@ -278,7 +278,7 @@ xfs_dir_lookup( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -333,7 +333,7 @@ xfs_dir_removename( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); memset(&args, 0, sizeof(xfs_da_args_t)); @@ -382,7 +382,7 @@ xfs_readdir( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) @@ -414,7 +414,7 @@ xfs_dir_replace( int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) return rval; @@ -464,7 +464,7 @@ xfs_dir_canenter( if (resblks) return 0; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); memset(&args, 0, sizeof(xfs_da_args_t)); args.name = name->name; diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/xfs_discard.c index 244e797..244e797 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/xfs_discard.c diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/xfs_discard.h index 344879a..344879a 100644 --- a/fs/xfs/linux-2.6/xfs_discard.h +++ b/fs/xfs/xfs_discard.h diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 837f311..db62959 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -318,10 +318,9 @@ xfs_qm_init_dquot_blk( int curid, i; ASSERT(tp); - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); - d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); + d = bp->b_addr; /* * ID of the first dquot in the block - id's are zero based. @@ -403,7 +402,7 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp || (error = XFS_BUF_GETERROR(bp))) + if (!bp || (error = xfs_buf_geterror(bp))) goto error1; /* * Make a chunk of dquots out of this buffer and log @@ -534,13 +533,12 @@ xfs_qm_dqtobp( return XFS_ERROR(error); } - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); /* * calculate the location of the dquot inside the buffer. */ - ddq = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddq = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot... @@ -553,7 +551,6 @@ xfs_qm_dqtobp( xfs_trans_brelse(tp, bp); return XFS_ERROR(EIO); } - XFS_BUF_BUSY(bp); /* We dirtied this */ } *O_bpp = bp; @@ -622,7 +619,6 @@ xfs_qm_dqread( * this particular dquot was repaired. We still aren't afraid to * brelse it because we have the changes incore. */ - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); xfs_trans_brelse(tp, bp); @@ -1204,7 +1200,7 @@ xfs_qm_dqflush( /* * Calculate the location of the dquot inside the buffer. */ - ddqp = (struct xfs_disk_dquot *)(XFS_BUF_PTR(bp) + dqp->q_bufoffset); + ddqp = bp->b_addr + dqp->q_bufoffset; /* * A simple sanity check in case we got a corrupted dquot.. @@ -1240,7 +1236,7 @@ xfs_qm_dqflush( * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)) { + if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } @@ -1447,7 +1443,7 @@ xfs_qm_dqflock_pushbuf_wait( goto out_lock; if (XFS_BUF_ISDELAYWRITE(bp)) { - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); xfs_buf_delwri_promote(bp); wake_up_process(bp->b_target->bt_task); diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 34b7e94..34b7e94 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 9e0e2fa..9e0e2fa 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2a..5acae2a 100644 --- a/fs/xfs/quota/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/xfs_export.c index 75e5d32..75e5d32 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/xfs_export.c diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h index 3272b6a..3272b6a 100644 --- a/fs/xfs/linux-2.6/xfs_export.h +++ b/fs/xfs/xfs_export.h diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/xfs_file.c index 825390e..7f7b424 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -149,7 +149,9 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); xfs_ioend_wait(ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (mp->m_flags & XFS_MOUNT_BARRIER) { /* diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9124425..3ff3d9e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -344,9 +344,9 @@ _xfs_filestream_update_ag( * Either ip is a regular file and pip is a directory, or ip is a * directory and pip is NULL. */ - ASSERT(ip && (((ip->i_d.di_mode & S_IFREG) && pip && - (pip->i_d.di_mode & S_IFDIR)) || - ((ip->i_d.di_mode & S_IFDIR) && !pip))); + ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && + S_ISDIR(pip->i_d.di_mode)) || + (S_ISDIR(ip->i_d.di_mode) && !pip))); mp = ip->i_mount; cache = mp->m_filestream; @@ -537,7 +537,7 @@ xfs_filestream_lookup_ag( xfs_agnumber_t ag; int ref; - if (!(ip->i_d.di_mode & (S_IFREG | S_IFDIR))) { + if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { ASSERT(0); return NULLAGNUMBER; } @@ -579,9 +579,9 @@ xfs_filestream_associate( xfs_agnumber_t ag, rotorstep, startag; int err = 0; - ASSERT(pip->i_d.di_mode & S_IFDIR); - ASSERT(ip->i_d.di_mode & S_IFREG); - if (!(pip->i_d.di_mode & S_IFDIR) || !(ip->i_d.di_mode & S_IFREG)) + ASSERT(S_ISDIR(pip->i_d.di_mode)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) return -EINVAL; mp = pip->i_mount; diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c index ed88ed1..ed88ed1 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/xfs_fs_subr.c diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cf..76e81cf 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/xfs_globals.c diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index dd5628b..9f24ec2 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -202,8 +202,7 @@ xfs_ialloc_inode_init( fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * blks_per_cluster, XBF_LOCK); - ASSERT(fbuf); - ASSERT(!XFS_BUF_GETERROR(fbuf)); + ASSERT(!xfs_buf_geterror(fbuf)); /* * Initialize all inodes in this buffer and then log them. @@ -1486,7 +1485,7 @@ xfs_read_agi( if (error) return error; - ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp)); + ASSERT(!xfs_buf_geterror(*bpp)); agi = XFS_BUF_TO_AGI(*bpp); /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cc21dd..0239a7c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -368,7 +368,7 @@ xfs_iformat( /* * no local regular files yet */ - if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { xfs_warn(ip->i_mount, "corrupt inode %Lu (local format for regular file).", (unsigned long long) ip->i_ino); @@ -1040,7 +1040,7 @@ xfs_ialloc( if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { ip->i_d.di_mode |= S_ISGID; } } @@ -1097,14 +1097,14 @@ xfs_ialloc( if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; - if ((mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } - } else if ((mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { @@ -1188,7 +1188,7 @@ xfs_isize_check( int nimaps; xfs_bmbt_irec_t imaps[2]; - if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) + if (!S_ISREG(ip->i_d.di_mode)) return; if (XFS_IS_REALTIME_INODE(ip)) @@ -1828,7 +1828,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || - ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + (!S_ISREG(ip->i_d.di_mode))); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -2473,7 +2473,7 @@ cluster_corrupt_out: if (bp->b_iodone) { XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - XFS_BUF_ERROR(bp,EIO); + xfs_buf_ioerror(bp, EIO); xfs_buf_ioend(bp, 0); } else { XFS_BUF_STALE(bp); @@ -2585,7 +2585,7 @@ xfs_iflush( * If the buffer is pinned then push on the log now so we won't * get stuck waiting in the write for too long. */ - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); /* @@ -2671,7 +2671,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip, ip->i_d.di_magic); goto corrupt_out; } - if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (S_ISREG(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -2681,7 +2681,7 @@ xfs_iflush_int( __func__, ip->i_ino, ip); goto corrupt_out; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + } else if (S_ISDIR(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a97644a..2380a4b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -263,7 +263,7 @@ typedef struct xfs_inode { struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ +#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \ (ip)->i_size : (ip)->i_d.di_size; /* Convert from vfs inode to xfs inode */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index acca2c5..f7ce7de 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -265,7 +265,7 @@ xfs_open_by_handle( return PTR_ERR(filp); } - if (inode->i_mode & S_IFREG) { + if (S_ISREG(inode->i_mode)) { filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } @@ -850,14 +850,14 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & XFS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (xflags & XFS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & XFS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + } else if (S_ISREG(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; if (xflags & XFS_XFLAG_EXTSIZE) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index d56173b..d56173b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 54e623b..54e623b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 80f4060..80f4060 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/xfs_iops.c index 6544c32..673704f 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -70,9 +70,8 @@ xfs_synchronize_times( } /* - * If the linux inode is valid, mark it dirty. - * Used when committing a dirty inode into a transaction so that - * the inode will get written back by the linux code + * If the linux inode is valid, mark it dirty, else mark the dirty state + * in the XFS inode to make sure we pick it up when reclaiming the inode. */ void xfs_mark_inode_dirty_sync( @@ -82,6 +81,10 @@ xfs_mark_inode_dirty_sync( if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) mark_inode_dirty_sync(inode); + else { + barrier(); + ip->i_update_core = 1; + } } void @@ -92,6 +95,11 @@ xfs_mark_inode_dirty( if (!(inode->i_state & (I_WILL_FREE|I_FREEING))) mark_inode_dirty(inode); + else { + barrier(); + ip->i_update_core = 1; + } + } /* @@ -1194,9 +1202,14 @@ xfs_setup_inode( break; } - /* if there is no attribute fork no ACL can exist on this inode */ - if (!XFS_IFORK_Q(ip)) + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!XFS_IFORK_Q(ip)) { + inode_has_no_xattr(inode); cache_no_acl(inode); + } xfs_iflags_clear(ip, XFS_INEW); barrier(); diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h index ef41c92..ef41c92 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/xfs_iops.h diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h index d42f814..1e8a45e 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,13 +32,12 @@ # define XFS_BIG_INUMS 0 #endif -#include <xfs_types.h> +#include "xfs_types.h" -#include <kmem.h> -#include <mrlock.h> -#include <time.h> - -#include <support/uuid.h> +#include "kmem.h" +#include "mrlock.h" +#include "time.h" +#include "uuid.h" #include <linux/semaphore.h> #include <linux/mm.h> @@ -78,14 +77,14 @@ #include <asm/byteorder.h> #include <asm/unaligned.h> -#include <xfs_vnode.h> -#include <xfs_stats.h> -#include <xfs_sysctl.h> -#include <xfs_iops.h> -#include <xfs_aops.h> -#include <xfs_super.h> -#include <xfs_buf.h> -#include <xfs_message.h> +#include "xfs_vnode.h" +#include "xfs_stats.h" +#include "xfs_sysctl.h" +#include "xfs_iops.h" +#include "xfs_aops.h" +#include "xfs_super.h" +#include "xfs_buf.h" +#include "xfs_message.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 06ff843..3a8d4f6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -878,7 +878,7 @@ xlog_iodone(xfs_buf_t *bp) /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); XFS_BUF_STALE(bp); @@ -1051,7 +1051,6 @@ xlog_alloc_log(xfs_mount_t *mp, if (!bp) goto out_free_log; bp->b_iodone = xlog_iodone; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(xfs_buf_islocked(bp)); log->l_xbuf = bp; @@ -1108,7 +1107,6 @@ xlog_alloc_log(xfs_mount_t *mp, iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1248,7 +1246,7 @@ xlog_bdstrat( struct xlog_in_core *iclog = bp->b_fspriv; if (iclog->ic_state & XLOG_STATE_IOERROR) { - XFS_BUF_ERROR(bp, EIO); + xfs_buf_ioerror(bp, EIO); XFS_BUF_STALE(bp); xfs_buf_ioend(bp, 0); /* @@ -1355,7 +1353,6 @@ xlog_sync(xlog_t *log, XFS_BUF_SET_COUNT(bp, count); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; @@ -1398,16 +1395,15 @@ xlog_sync(xlog_t *log, if (split) { bp = iclog->ic_log->l_xbuf; XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ - XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ - (__psint_t)count), split); + xfs_buf_associate_memory(bp, + (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) bp->b_flags |= XBF_FUA; - dptr = XFS_BUF_PTR(bp); + dptr = bp->b_addr; /* * Bump the cycle numbers at the start of each block * since this part of the buffer is at the start of diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8fe4206..a199dbc 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_align( xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); - return XFS_BUF_PTR(bp) + BBTOB(offset); + return bp->b_addr + BBTOB(offset); } @@ -178,9 +178,7 @@ xlog_bread_noalign( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); xfsbdstrat(log->l_mp, bp); error = xfs_buf_iowait(bp); @@ -220,18 +218,18 @@ xlog_bread_offset( xfs_buf_t *bp, xfs_caddr_t offset) { - xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + xfs_caddr_t orig_offset = bp->b_addr; int orig_len = bp->b_buffer_length; int error, error2; - error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); if (error) return error; error = xlog_bread_noalign(log, blk_no, nbblks, bp); /* must reset buffer pointer even on error */ - error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); if (error) return error; return error2; @@ -266,11 +264,9 @@ xlog_bwrite( XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); xfs_buf_lock(bp); XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); if ((error = xfs_bwrite(log->l_mp, bp))) xfs_ioerror_alert("xlog_bwrite", log->l_mp, @@ -360,7 +356,7 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - if (XFS_BUF_GETERROR(bp)) { + if (bp->b_error) { /* * We're not going to bother about retrying * this during recovery. One strike! @@ -1262,7 +1258,7 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + offset = bp->b_addr + BBTOB(ealign - start_block); error = xlog_bread_offset(log, ealign, sectbb, bp, offset); if (error) @@ -2135,15 +2131,16 @@ xlog_recover_buffer_pass2( bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, buf_flags); - if (XFS_BUF_ISERROR(bp)) { + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { xfs_ioerror_alert("xlog_recover_do..(read#1)", mp, bp, buf_f->blf_blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } - error = 0; if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & @@ -2227,14 +2224,17 @@ xlog_recover_inode_pass2( bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, XBF_LOCK); - if (XFS_BUF_ISERROR(bp)) { + if (!bp) { + error = ENOMEM; + goto error; + } + error = bp->b_error; + if (error) { xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, bp, in_f->ilf_blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); goto error; } - error = 0; ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2283,7 +2283,7 @@ xlog_recover_inode_pass2( /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", @@ -2296,7 +2296,7 @@ xlog_recover_inode_pass2( error = EFSCORRUPTED; goto error; } - } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { @@ -3437,7 +3437,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = XFS_BUF_PTR(hbp); + offset = hbp->b_addr; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3497,7 +3497,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = XFS_BUF_PTR(dbp); + offset = dbp->b_addr; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/xfs_message.c index bd672de..bd672de 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/xfs_message.c diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/xfs_message.h index 7fb7ea0..7fb7ea0 100644 --- a/fs/xfs/linux-2.6/xfs_message.h +++ b/fs/xfs/xfs_message.h diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f25245..0081657 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1331,7 +1331,7 @@ xfs_mountfs( ASSERT(rip != NULL); - if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); @@ -1615,7 +1615,7 @@ xfs_unmountfs_writesb(xfs_mount_t *mp) XFS_BUF_UNDELAYWRITE(sbp); XFS_BUF_WRITE(sbp); XFS_BUF_UNASYNC(sbp); - ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); + ASSERT(sbp->b_target == mp->m_ddev_targp); xfsbdstrat(mp, sbp); error = xfs_buf_iowait(sbp); if (error) @@ -1938,7 +1938,7 @@ xfs_getsb( xfs_buf_lock(bp); } - XFS_BUF_HOLD(bp); + xfs_buf_hold(bp); ASSERT(XFS_BUF_ISDONE(bp)); return bp; } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/xfs_qm.c index 46e54ad..9a0aa76 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1240,7 +1240,7 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); + ddq = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/xfs_qm.h index 43b9abe..43b9abe 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/xfs_qm.h diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index a0a829a..a0a829a 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c index 8671a0b..8671a0b 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/xfs_qm_stats.c diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h index 5b964fc..5b964fc 100644 --- a/fs/xfs/quota/xfs_qm_stats.h +++ b/fs/xfs/xfs_qm_stats.h diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 609246f..609246f 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h index 94a3d92..94a3d92 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/xfs_quota_priv.h diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 29b9d64..7e76f53 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -25,7 +25,7 @@ #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" -#include "quota/xfs_qm.h" +#include "xfs_qm.h" #include <linux/quota.h> diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 77a5989..df78c29 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -116,7 +116,7 @@ xfs_rename( trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); if (src_is_directory) { /* @@ -226,7 +226,7 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(target_ip->i_d.di_mode)) { /* * Make sure target dir is empty. */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8f76fdf..35561a5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -168,7 +168,7 @@ error_cancel: xfs_trans_cancel(tp, cancelflags); goto error; } - memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* * Commit the transaction. @@ -883,7 +883,7 @@ xfs_rtbuf_get( if (error) { return error; } - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; } @@ -943,7 +943,7 @@ xfs_rtcheck_range( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Compute the starting word's address, and starting bit. */ @@ -994,7 +994,7 @@ xfs_rtcheck_range( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1040,7 +1040,7 @@ xfs_rtcheck_range( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1158,7 +1158,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Get the first word's index & point to it. */ @@ -1210,7 +1210,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; word = XFS_BLOCKWMASK(mp); b = &bufp[word]; } else { @@ -1256,7 +1256,7 @@ xfs_rtfind_back( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; word = XFS_BLOCKWMASK(mp); b = &bufp[word]; } else { @@ -1333,7 +1333,7 @@ xfs_rtfind_forw( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Get the first word's index & point to it. */ @@ -1384,7 +1384,7 @@ xfs_rtfind_forw( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1429,7 +1429,7 @@ xfs_rtfind_forw( if (error) { return error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + b = bufp = bp->b_addr; word = 0; } else { /* @@ -1649,7 +1649,7 @@ xfs_rtmodify_range( if (error) { return error; } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + bufp = bp->b_addr; /* * Compute the starting word's address, and starting bit. */ @@ -1694,7 +1694,7 @@ xfs_rtmodify_range( if (error) { return error; } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + first = b = bufp = bp->b_addr; word = 0; } else { /* @@ -1734,7 +1734,7 @@ xfs_rtmodify_range( if (error) { return error; } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); + first = b = bufp = bp->b_addr; word = 0; } else { /* @@ -1832,8 +1832,8 @@ xfs_rtmodify_summary( */ sp = XFS_SUMPTR(mp, bp, so); *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), - (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), + (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); return 0; } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 09e1f4f..f7f3a35 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -47,7 +47,7 @@ struct xfs_trans; #define XFS_SUMOFFSTOBLOCK(mp,s) \ (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) #define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ + ((xfs_suminfo_t *)((bp)->b_addr + \ (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) #define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index d6d6fdf..c96a8a0 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -104,9 +104,9 @@ xfs_ioerror_alert( xfs_alert(mp, "I/O error occurred: meta-data dev %s block 0x%llx" " (\"%s\") error %d buf count %zd", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + xfs_buf_target_name(bp->b_target), (__uint64_t)blkno, func, - XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); + bp->b_error, XFS_BUF_COUNT(bp)); } /* @@ -137,8 +137,8 @@ xfs_read_buf( bp = xfs_buf_read(target, blkno, len, flags); if (!bp) return XFS_ERROR(EIO); - error = XFS_BUF_GETERROR(bp); - if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { + error = bp->b_error; + if (!error && !XFS_FORCED_SHUTDOWN(mp)) { *bpp = bp; } else { *bpp = NULL; diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1eb2ba5..cb6ae71 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -509,7 +509,7 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/xfs_stats.c index 76fdc58..76fdc58 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/xfs_stats.c diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h index 736854b..736854b 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/xfs_stats.h diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/xfs_super.c index 9a72dda..2366c54 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -356,6 +356,8 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; + xfs_warn(mp, + "nodelaylog is deprecated and will be removed in Linux 3.3"); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { @@ -877,33 +879,17 @@ xfs_log_inode( struct xfs_trans *tp; int error; - xfs_iunlock(ip, XFS_ILOCK_SHARED); tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { xfs_trans_cancel(tp, 0); - /* we need to return with the lock hold shared */ - xfs_ilock(ip, XFS_ILOCK_SHARED); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out of the - * way during trans_reserve which would flush the inode. But there's - * no guarantee that the inode buffer has actually gone out yet (it's - * delwri). Plus the buffer could be pinned anyway if it's part of - * an inode in another recent transaction. So we play it safe and - * fire off the transaction anyway. - */ - xfs_trans_ijoin(tp, ip); + xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - xfs_ilock_demote(ip, XFS_ILOCK_EXCL); - - return error; + return xfs_trans_commit(tp, 0); } STATIC int @@ -918,7 +904,9 @@ xfs_fs_write_inode( trace_xfs_write_inode(ip); if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); + return -XFS_ERROR(EIO); + if (!ip->i_update_core) + return 0; if (wbc->sync_mode == WB_SYNC_ALL) { /* @@ -929,12 +917,10 @@ xfs_fs_write_inode( * of synchronous log foces dramatically. */ xfs_ioend_wait(ip); - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (ip->i_update_core) { - error = xfs_log_inode(ip); - if (error) - goto out_unlock; - } + error = xfs_log_inode(ip); + if (error) + goto out; + return 0; } else { /* * We make this non-blocking if the inode is contended, return diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h index 50a3266..50a3266 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/xfs_super.h diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/xfs_sync.c index e4c938a..4604f90 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -332,7 +332,7 @@ xfs_sync_fsdata( * between there and here. */ bp = xfs_getsb(mp, 0); - if (XFS_BUF_ISPINNED(bp)) + if (xfs_buf_ispinned(bp)) xfs_log_force(mp, 0); return xfs_bwrite(mp, bp); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e..941202e 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/xfs_sync.h diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2ad..ee2d2ad 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d4..b9937d4 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/xfs_trace.c index 88d25d4..9010ce8 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -43,8 +43,8 @@ #include "xfs_quota.h" #include "xfs_iomap.h" #include "xfs_aops.h" -#include "quota/xfs_dquot_item.h" -#include "quota/xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" #include "xfs_log_recover.h" #include "xfs_inode_item.h" diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/xfs_trace.h index 690fc7a..690fc7a 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/xfs_trace.h diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 43233e9..c15aa29 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -299,7 +299,7 @@ xfs_trans_ail_cursor_last( * Splice the log item list into the AIL at the given LSN. We splice to the * tail of the given LSN to maintain insert order for push traversals. The * cursor is optional, allowing repeated updates to the same LSN to avoid - * repeated traversals. + * repeated traversals. This should not be called with an empty list. */ static void xfs_ail_splice( @@ -308,50 +308,39 @@ xfs_ail_splice( struct list_head *list, xfs_lsn_t lsn) { - struct xfs_log_item *lip = cur ? cur->item : NULL; - struct xfs_log_item *next_lip; + struct xfs_log_item *lip; + + ASSERT(!list_empty(list)); /* - * Get a new cursor if we don't have a placeholder or the existing one - * has been invalidated. + * Use the cursor to determine the insertion point if one is + * provided. If not, or if the one we got is not valid, + * find the place in the AIL where the items belong. */ - if (!lip || (__psint_t)lip & 1) { + lip = cur ? cur->item : NULL; + if (!lip || (__psint_t) lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); - if (!lip) { - /* The list is empty, so just splice and return. */ - if (cur) - cur->item = NULL; - list_splice(list, &ailp->xa_ail); - return; - } - } + /* + * If a cursor is provided, we know we're processing the AIL + * in lsn order, and future items to be spliced in will + * follow the last one being inserted now. Update the + * cursor to point to that last item, now while we have a + * reliable pointer to it. + */ + if (cur) + cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); /* - * Our cursor points to the item we want to insert _after_, so we have - * to update the cursor to point to the end of the list we are splicing - * in so that it points to the correct location for the next splice. - * i.e. before the splice - * - * lsn -> lsn -> lsn + x -> lsn + x ... - * ^ - * | cursor points here - * - * After the splice we have: - * - * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... - * ^ ^ - * | cursor points here | needs to move here - * - * So we set the cursor to the last item in the list to be spliced - * before we execute the splice, resulting in the cursor pointing to - * the correct item after the splice occurs. + * Finally perform the splice. Unless the AIL was empty, + * lip points to the item in the AIL _after_ which the new + * items should go. If lip is null the AIL was empty, so + * the new items go at the head of the AIL. */ - if (cur) { - next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); - cur->item = next_lip; - } - list_splice(list, &lip->li_ail); + if (lip) + list_splice(list, &lip->li_ail); + else + list_splice(list, &ailp->xa_ail); } /* @@ -682,6 +671,7 @@ xfs_trans_ail_update_bulk( int i; LIST_HEAD(tmp); + ASSERT(nr_items > 0); /* Not required, but true. */ mlip = xfs_ail_min(ailp); for (i = 0; i < nr_items; i++) { @@ -701,7 +691,8 @@ xfs_trans_ail_update_bulk( list_add(&lip->li_ail, &tmp); } - xfs_ail_splice(ailp, cur, &tmp, lsn); + if (!list_empty(&tmp)) + xfs_ail_splice(ailp, cur, &tmp, lsn); if (!mlip_changed) { spin_unlock(&ailp->xa_lock); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15584fc..137e2b9 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -54,7 +54,7 @@ xfs_trans_buf_item_match( list_for_each_entry(lidp, &tp->t_items, lid_trans) { blip = (struct xfs_buf_log_item *)lidp->lid_item; if (blip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_TARGET(blip->bli_buf) == target && + blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == blkno && XFS_BUF_COUNT(blip->bli_buf) == len) return blip->bli_buf; @@ -80,7 +80,6 @@ _xfs_trans_bjoin( { struct xfs_buf_log_item *bip; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == NULL); /* @@ -194,7 +193,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, return NULL; } - ASSERT(!XFS_BUF_GETERROR(bp)); + ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_fspriv); @@ -293,10 +292,10 @@ xfs_trans_read_buf( return (flags & XBF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); - if (XFS_BUF_GETERROR(bp) != 0) { + if (bp->b_error) { + error = bp->b_error; xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); - error = XFS_BUF_GETERROR(bp); xfs_buf_relse(bp); return error; } @@ -330,7 +329,7 @@ xfs_trans_read_buf( ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_fspriv != NULL); - ASSERT((XFS_BUF_ISERROR(bp)) == 0); + ASSERT(!bp->b_error); if (!(XFS_BUF_ISDONE(bp))) { trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); @@ -386,10 +385,9 @@ xfs_trans_read_buf( return (flags & XBF_TRYLOCK) ? 0 : XFS_ERROR(ENOMEM); } - if (XFS_BUF_GETERROR(bp) != 0) { - XFS_BUF_SUPER_STALE(bp); - error = XFS_BUF_GETERROR(bp); - + if (bp->b_error) { + error = bp->b_error; + XFS_BUF_SUPER_STALE(bp); xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); if (tp->t_flags & XFS_TRANS_DIRTY) @@ -430,7 +428,7 @@ shutdown_abort: if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) xfs_notice(mp, "about to pop assert, bp == 0x%p", bp); #endif - ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != + ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) != (XBF_STALE|XBF_DELWRI)); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); @@ -581,7 +579,6 @@ xfs_trans_bhold(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -602,7 +599,6 @@ xfs_trans_bhold_release(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); @@ -631,7 +627,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); @@ -702,7 +697,6 @@ xfs_trans_binval( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -774,7 +768,6 @@ xfs_trans_inode_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -798,7 +791,6 @@ xfs_trans_stale_inode_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -823,7 +815,6 @@ xfs_trans_inode_alloc_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); @@ -851,7 +842,6 @@ xfs_trans_dquot_buf( { xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(type == XFS_BLF_UDQUOT_BUF || diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 4d00ee6..4d00ee6 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 7c220b4..7c220b4 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 88d1214..51fc429 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -83,7 +83,9 @@ xfs_readlink_bmap( bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), XBF_LOCK | XBF_MAPPED | XBF_DONT_BLOCK); - error = XFS_BUF_GETERROR(bp); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; if (error) { xfs_ioerror_alert("xfs_readlink", ip->i_mount, bp, XFS_BUF_ADDR(bp)); @@ -94,7 +96,7 @@ xfs_readlink_bmap( byte_cnt = pathlen; pathlen -= byte_cnt; - memcpy(link, XFS_BUF_PTR(bp), byte_cnt); + memcpy(link, bp->b_addr, byte_cnt); xfs_buf_relse(bp); } @@ -121,7 +123,7 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); + ASSERT(S_ISLNK(ip->i_d.di_mode)); ASSERT(ip->i_d.di_size <= MAXPATHLEN); pathlen = ip->i_d.di_size; @@ -529,7 +531,7 @@ xfs_release( if (ip->i_d.di_nlink == 0) return 0; - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && @@ -610,7 +612,7 @@ xfs_inactive( truncate = ((ip->i_d.di_nlink == 0) && ((ip->i_d.di_size != 0) || (ip->i_size != 0) || (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) && - ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); + S_ISREG(ip->i_d.di_mode)); mp = ip->i_mount; @@ -621,7 +623,7 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && + if ((S_ISREG(ip->i_d.di_mode) && ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS) && @@ -669,7 +671,7 @@ xfs_inactive( xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return VN_INACTIVE_CACHE; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { + } else if (S_ISLNK(ip->i_d.di_mode)) { /* * If we get an error while cleaning up a @@ -1648,13 +1650,13 @@ xfs_symlink( byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); - ASSERT(bp && !XFS_BUF_GETERROR(bp)); + ASSERT(!xfs_buf_geterror(bp)); if (pathlen < byte_cnt) { byte_cnt = pathlen; } pathlen -= byte_cnt; - memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); + memcpy(bp->b_addr, cur_chunk, byte_cnt); cur_chunk += byte_cnt; xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); @@ -1999,7 +2001,7 @@ xfs_zero_remaining_bytes( mp, bp, XFS_BUF_ADDR(bp)); break; } - memset(XFS_BUF_PTR(bp) + + memset(bp->b_addr + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 0, lastoffset - offset + 1); XFS_BUF_UNDONE(bp); diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 87d3e03..87d3e03 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c |