diff options
228 files changed, 6650 insertions, 3116 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index d449e63..8e2da1e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -61,6 +61,7 @@ ata *); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); + void (*update_time)(struct inode *, struct timespec *, int); locking rules: all may block @@ -87,6 +88,8 @@ getxattr: no listxattr: no removexattr: yes fiemap: no +update_time: no + Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ef19f91..efd23f4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -363,6 +363,7 @@ struct inode_operations { ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); + void (*update_time)(struct inode *, struct timespec *, int); }; Again, all methods are called without any locks being held, unless @@ -471,6 +472,9 @@ otherwise noted. removexattr: called by the VFS to remove an extended attribute from a file. This method is called by removexattr(2) system call. + update_time: called by the VFS to update a specific time or the i_version of + an inode. If this is not defined the VFS will update the inode itself + and call mark_inode_dirty_sync. The Address Space Object ======================== diff --git a/arch/alpha/include/asm/posix_types.h b/arch/alpha/include/asm/posix_types.h index 24779fc..5a8a483 100644 --- a/arch/alpha/include/asm/posix_types.h +++ b/arch/alpha/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned int __kernel_ino_t; #define __kernel_ino_t __kernel_ino_t -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ #include <asm-generic/posix_types.h> diff --git a/arch/arm/include/asm/posix_types.h b/arch/arm/include/asm/posix_types.h index efdf990..d2de9cb 100644 --- a/arch/arm/include/asm/posix_types.h +++ b/arch/arm/include/asm/posix_types.h @@ -22,9 +22,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/avr32/include/asm/posix_types.h b/arch/avr32/include/asm/posix_types.h index 74667bf..9ba9e74 100644 --- a/arch/avr32/include/asm/posix_types.h +++ b/arch/avr32/include/asm/posix_types.h @@ -17,9 +17,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/blackfin/include/asm/posix_types.h b/arch/blackfin/include/asm/posix_types.h index 41bc187..1bd3436 100644 --- a/arch/blackfin/include/asm/posix_types.h +++ b/arch/blackfin/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned int __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/cris/include/asm/posix_types.h b/arch/cris/include/asm/posix_types.h index 234891c..ce4e517 100644 --- a/arch/cris/include/asm/posix_types.h +++ b/arch/cris/include/asm/posix_types.h @@ -15,9 +15,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/frv/include/asm/posix_types.h b/arch/frv/include/asm/posix_types.h index 3f34cb4..fe512af 100644 --- a/arch/frv/include/asm/posix_types.h +++ b/arch/frv/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/h8300/include/asm/posix_types.h b/arch/h8300/include/asm/posix_types.h index bc4c34e..91e62ba 100644 --- a/arch/h8300/include/asm/posix_types.h +++ b/arch/h8300/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/ia64/include/asm/posix_types.h b/arch/ia64/include/asm/posix_types.h index 7323ab9..99ee1d6 100644 --- a/arch/ia64/include/asm/posix_types.h +++ b/arch/ia64/include/asm/posix_types.h @@ -1,9 +1,6 @@ #ifndef _ASM_IA64_POSIX_TYPES_H #define _ASM_IA64_POSIX_TYPES_H -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ #include <asm-generic/posix_types.h> diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f00ba02..d7f558c 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -604,12 +604,6 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f) spin_unlock(&(x)->ctx_lock); } -static inline unsigned long -pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec) -{ - return get_unmapped_area(file, addr, len, pgoff, flags); -} - /* forward declaration */ static const struct dentry_operations pfmfs_dentry_operations; @@ -2333,8 +2327,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t down_write(&task->mm->mmap_sem); /* find some free area in address space, must have mmap sem held */ - vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); - if (vma->vm_start == 0UL) { + vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS); + if (IS_ERR_VALUE(vma->vm_start)) { DPRINT(("Cannot find unmapped area for size %ld\n", size)); up_write(&task->mm->mmap_sem); goto error; diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 609d500..d9439ef 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -171,22 +171,9 @@ asmlinkage unsigned long ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { - extern unsigned long do_mremap (unsigned long addr, - unsigned long old_len, - unsigned long new_len, - unsigned long flags, - unsigned long new_addr); - - down_write(¤t->mm->mmap_sem); - { - addr = do_mremap(addr, old_len, new_len, flags, new_addr); - } - up_write(¤t->mm->mmap_sem); - - if (IS_ERR((void *) addr)) - return addr; - - force_successful_syscall_return(); + addr = sys_mremap(addr, old_len, new_len, flags, new_addr); + if (!IS_ERR((void *) addr)) + force_successful_syscall_return(); return addr; } diff --git a/arch/m32r/include/asm/posix_types.h b/arch/m32r/include/asm/posix_types.h index 0195850..236de26 100644 --- a/arch/m32r/include/asm/posix_types.h +++ b/arch/m32r/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/m68k/include/asm/posix_types.h b/arch/m68k/include/asm/posix_types.h index 6373093..cf4dbf7 100644 --- a/arch/m68k/include/asm/posix_types.h +++ b/arch/m68k/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/mips/include/asm/posix_types.h b/arch/mips/include/asm/posix_types.h index e0308dc..fa03ec3 100644 --- a/arch/mips/include/asm/posix_types.h +++ b/arch/mips/include/asm/posix_types.h @@ -17,11 +17,6 @@ * assume GCC is being used. */ -#if (_MIPS_SZLONG == 64) -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t -#endif - typedef long __kernel_daddr_t; #define __kernel_daddr_t __kernel_daddr_t diff --git a/arch/mips/include/asm/stat.h b/arch/mips/include/asm/stat.h index 6e00f75..fe9a4c3 100644 --- a/arch/mips/include/asm/stat.h +++ b/arch/mips/include/asm/stat.h @@ -20,7 +20,7 @@ struct stat { long st_pad1[3]; /* Reserved for network id */ ino_t st_ino; mode_t st_mode; - nlink_t st_nlink; + __u32 st_nlink; uid_t st_uid; gid_t st_gid; unsigned st_rdev; @@ -55,7 +55,7 @@ struct stat64 { unsigned long long st_ino; mode_t st_mode; - nlink_t st_nlink; + __u32 st_nlink; uid_t st_uid; gid_t st_gid; @@ -96,7 +96,7 @@ struct stat { unsigned long st_ino; mode_t st_mode; - nlink_t st_nlink; + __u32 st_nlink; uid_t st_uid; gid_t st_gid; diff --git a/arch/mn10300/include/asm/posix_types.h b/arch/mn10300/include/asm/posix_types.h index ab50618..d31eeea 100644 --- a/arch/mn10300/include/asm/posix_types.h +++ b/arch/mn10300/include/asm/posix_types.h @@ -20,9 +20,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/parisc/include/asm/posix_types.h b/arch/parisc/include/asm/posix_types.h index 5212b03..b934425 100644 --- a/arch/parisc/include/asm/posix_types.h +++ b/arch/parisc/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/parisc/include/asm/stat.h b/arch/parisc/include/asm/stat.h index 9d5fbbc..d76fbda 100644 --- a/arch/parisc/include/asm/stat.h +++ b/arch/parisc/include/asm/stat.h @@ -7,7 +7,7 @@ struct stat { unsigned int st_dev; /* dev_t is 32 bits on parisc */ ino_t st_ino; /* 32 bits */ mode_t st_mode; /* 16 bits */ - nlink_t st_nlink; /* 16 bits */ + unsigned short st_nlink; /* 16 bits */ unsigned short st_reserved1; /* old st_uid */ unsigned short st_reserved2; /* old st_gid */ unsigned int st_rdev; @@ -42,7 +42,7 @@ struct hpux_stat64 { unsigned int st_dev; /* dev_t is 32 bits on parisc */ ino_t st_ino; /* 32 bits */ mode_t st_mode; /* 16 bits */ - nlink_t st_nlink; /* 16 bits */ + unsigned short st_nlink; /* 16 bits */ unsigned short st_reserved1; /* old st_uid */ unsigned short st_reserved2; /* old st_gid */ unsigned int st_rdev; diff --git a/arch/powerpc/include/asm/posix_types.h b/arch/powerpc/include/asm/posix_types.h index f139325..2958c5b 100644 --- a/arch/powerpc/include/asm/posix_types.h +++ b/arch/powerpc/include/asm/posix_types.h @@ -16,9 +16,6 @@ typedef int __kernel_ssize_t; typedef long __kernel_ptrdiff_t; #define __kernel_size_t __kernel_size_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t #endif diff --git a/arch/powerpc/include/asm/stat.h b/arch/powerpc/include/asm/stat.h index e4edc51..10cfb55 100644 --- a/arch/powerpc/include/asm/stat.h +++ b/arch/powerpc/include/asm/stat.h @@ -30,11 +30,11 @@ struct stat { unsigned long st_dev; ino_t st_ino; #ifdef __powerpc64__ - nlink_t st_nlink; + unsigned short st_nlink; mode_t st_mode; #else mode_t st_mode; - nlink_t st_nlink; + unsigned short st_nlink; #endif uid_t st_uid; gid_t st_gid; diff --git a/arch/s390/include/asm/posix_types.h b/arch/s390/include/asm/posix_types.h index edf8527..7be104c 100644 --- a/arch/s390/include/asm/posix_types.h +++ b/arch/s390/include/asm/posix_types.h @@ -24,7 +24,6 @@ typedef unsigned short __kernel_old_dev_t; typedef unsigned long __kernel_ino_t; typedef unsigned short __kernel_mode_t; -typedef unsigned short __kernel_nlink_t; typedef unsigned short __kernel_ipc_pid_t; typedef unsigned short __kernel_uid_t; typedef unsigned short __kernel_gid_t; @@ -35,7 +34,6 @@ typedef int __kernel_ptrdiff_t; typedef unsigned int __kernel_ino_t; typedef unsigned int __kernel_mode_t; -typedef unsigned int __kernel_nlink_t; typedef int __kernel_ipc_pid_t; typedef unsigned int __kernel_uid_t; typedef unsigned int __kernel_gid_t; @@ -47,7 +45,6 @@ typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ #define __kernel_ino_t __kernel_ino_t #define __kernel_mode_t __kernel_mode_t -#define __kernel_nlink_t __kernel_nlink_t #define __kernel_ipc_pid_t __kernel_ipc_pid_t #define __kernel_uid_t __kernel_uid_t #define __kernel_gid_t __kernel_gid_t diff --git a/arch/sh/include/asm/posix_types_32.h b/arch/sh/include/asm/posix_types_32.h index abda584..ba0bdc4 100644 --- a/arch/sh/include/asm/posix_types_32.h +++ b/arch/sh/include/asm/posix_types_32.h @@ -3,8 +3,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t typedef unsigned short __kernel_uid_t; diff --git a/arch/sh/include/asm/posix_types_64.h b/arch/sh/include/asm/posix_types_64.h index fcda07b..244f7e9 100644 --- a/arch/sh/include/asm/posix_types_64.h +++ b/arch/sh/include/asm/posix_types_64.h @@ -3,8 +3,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t typedef unsigned short __kernel_uid_t; diff --git a/arch/sparc/include/asm/posix_types.h b/arch/sparc/include/asm/posix_types.h index 3070f25..156220e 100644 --- a/arch/sparc/include/asm/posix_types.h +++ b/arch/sparc/include/asm/posix_types.h @@ -9,8 +9,6 @@ #if defined(__sparc__) && defined(__arch64__) /* sparc 64 bit */ -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t typedef unsigned short __kernel_old_uid_t; typedef unsigned short __kernel_old_gid_t; @@ -38,9 +36,6 @@ typedef unsigned short __kernel_gid_t; typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef long __kernel_daddr_t; #define __kernel_daddr_t __kernel_daddr_t diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 3ee51f1..275f74f 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -580,16 +580,9 @@ SYSCALL_DEFINE5(64_mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, unsigned long, new_addr) { - unsigned long ret = -EINVAL; - if (test_thread_flag(TIF_32BIT)) - goto out; - - down_write(¤t->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); -out: - return ret; + return -EINVAL; + return sys_mremap(addr, old_len, new_len, flags, new_addr); } /* we come to here via sys_nis_syscall so it can setup the regs argument */ diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h index 69adc08..6e74450 100644 --- a/arch/tile/include/asm/compat.h +++ b/arch/tile/include/asm/compat.h @@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t; typedef __kernel_mode_t compat_mode_t; typedef __kernel_dev_t compat_dev_t; typedef __kernel_loff_t compat_loff_t; -typedef __kernel_nlink_t compat_nlink_t; typedef __kernel_ipc_pid_t compat_ipc_pid_t; typedef __kernel_daddr_t compat_daddr_t; typedef __kernel_fsid_t compat_fsid_t; diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h index 99f262e..8e52505 100644 --- a/arch/x86/include/asm/posix_types_32.h +++ b/arch/x86/include/asm/posix_types_32.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/drivers/base/soc.c b/drivers/base/soc.c index ba29b2e..72b5e72 100644 --- a/drivers/base/soc.c +++ b/drivers/base/soc.c @@ -42,7 +42,7 @@ struct device *soc_device_to_device(struct soc_device *soc_dev) return &soc_dev->dev; } -static mode_t soc_attribute_mode(struct kobject *kobj, +static umode_t soc_attribute_mode(struct kobject *kobj, struct attribute *attr, int index) { diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c index f920fb5..fa94391 100644 --- a/drivers/gpu/drm/i810/i810_dma.c +++ b/drivers/gpu/drm/i810/i810_dma.c @@ -130,11 +130,10 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv) return -EINVAL; /* This is all entirely broken */ - down_write(¤t->mm->mmap_sem); old_fops = file_priv->filp->f_op; file_priv->filp->f_op = &i810_buffer_fops; dev_priv->mmap_buffer = buf; - buf_priv->virtual = (void *)do_mmap(file_priv->filp, 0, buf->total, + buf_priv->virtual = (void *)vm_mmap(file_priv->filp, 0, buf->total, PROT_READ | PROT_WRITE, MAP_SHARED, buf->bus_address); dev_priv->mmap_buffer = NULL; @@ -145,7 +144,6 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv) retcode = PTR_ERR(buf_priv->virtual); buf_priv->virtual = NULL; } - up_write(¤t->mm->mmap_sem); return retcode; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index a1e6c99..e3dd2a1 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) return current_fsgid(); } -/** - * v9fs_dentry_from_dir_inode - helper function to get the dentry from - * dir inode. - * - */ - -static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) -{ - struct dentry *dentry; - - spin_lock(&inode->i_lock); - /* Directory should have only one entry. */ - BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry)); - dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias); - spin_unlock(&inode->i_lock); - return dentry; -} - static int v9fs_test_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); @@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, if (dir->i_mode & S_ISGID) omode |= S_ISGID; - dir_dentry = v9fs_dentry_from_dir_inode(dir); + dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); @@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); + dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) return PTR_ERR(dfid); @@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, return -EINVAL; v9ses = v9fs_inode2v9ses(dir); - dir_dentry = v9fs_dentry_from_dir_inode(dir); + dir_dentry = dentry->d_parent; dfid = v9fs_fid_lookup(dir_dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 45a0ce4..1fceb32 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -18,14 +18,6 @@ #define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey]) #define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) -#ifdef __LITTLE_ENDIAN -#define BO_EXBITS 0x18UL -#elif defined(__BIG_ENDIAN) -#define BO_EXBITS 0x00UL -#else -#error Endianness must be known for affs to work. -#endif - #define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) #define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) #define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) @@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx) info->mmap_size = nr_pages * PAGE_SIZE; dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, - 0); + info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); info->mmap_size = 0; @@ -176,6 +176,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return -EPERM; } + if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { + if (attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e658dd1..1b52956 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, if (!size) return addr; - down_write(¤t->mm->mmap_sem); /* * total_size is the size of the ELF (interpreter) image. * The _first_ mmap needs to know the full size, otherwise @@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); - map_addr = do_mmap(filep, addr, total_size, prot, type, off); + map_addr = vm_mmap(filep, addr, total_size, prot, type, off); if (!BAD_ADDR(map_addr)) - do_munmap(current->mm, map_addr+size, total_size-size); + vm_munmap(map_addr+size, total_size-size); } else - map_addr = do_mmap(filep, addr, size, prot, type, off); + map_addr = vm_mmap(filep, addr, size, prot, type, off); - up_write(¤t->mm->mmap_sem); return(map_addr); } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 6b2daf9..178cb70 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm, realdatastart = (unsigned long) -ENOMEM; printk("Unable to allocate RAM for process data, errno %d\n", (int)-realdatastart); - do_munmap(current->mm, textpos, text_len); + vm_munmap(textpos, text_len); ret = realdatastart; goto err; } @@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm, } if (IS_ERR_VALUE(result)) { printk("Unable to read data+bss, errno %d\n", (int)-result); - do_munmap(current->mm, textpos, text_len); - do_munmap(current->mm, realdatastart, len); + vm_munmap(textpos, text_len); + vm_munmap(realdatastart, len); ret = result; goto err; } @@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm, } if (IS_ERR_VALUE(result)) { printk("Unable to read code+data+bss, errno %d\n",(int)-result); - do_munmap(current->mm, textpos, text_len + data_len + extra + + vm_munmap(textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); ret = result; goto err; diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 89b156d..761e2cd 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); + } else { + cache_no_acl(inode); } + } else { + cache_no_acl(inode); } failed: posix_acl_release(acl); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bcec067..3f75895 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -24,22 +24,135 @@ #include "delayed-ref.h" #include "locking.h" +struct extent_inode_elem { + u64 inum; + u64 offset; + struct extent_inode_elem *next; +}; + +static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, + struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 data_offset; + u64 data_len; + struct extent_inode_elem *e; + + data_offset = btrfs_file_extent_offset(eb, fi); + data_len = btrfs_file_extent_num_bytes(eb, fi); + + if (extent_item_pos < data_offset || + extent_item_pos >= data_offset + data_len) + return 1; + + e = kmalloc(sizeof(*e), GFP_NOFS); + if (!e) + return -ENOMEM; + + e->next = *eie; + e->inum = key->objectid; + e->offset = key->offset + (extent_item_pos - data_offset); + *eie = e; + + return 0; +} + +static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, + u64 extent_item_pos, + struct extent_inode_elem **eie) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int slot; + int nritems; + int extent_type; + int ret; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != wanted_disk_byte) + continue; + + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + if (ret < 0) + return ret; + } + + return 0; +} + /* * this structure records all encountered refs on the way up to the root */ struct __prelim_ref { struct list_head list; u64 root_id; - struct btrfs_key key; + struct btrfs_key key_for_search; int level; int count; + struct extent_inode_elem *inode_list; u64 parent; u64 wanted_disk_byte; }; +/* + * the rules for all callers of this function are: + * - obtaining the parent is the goal + * - if you add a key, you must know that it is a correct key + * - if you cannot add the parent or a correct key, then we will look into the + * block later to set a correct key + * + * delayed refs + * ============ + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | - | - + * key to resolve | - | y | y | y + * tree block logical | - | - | - | - + * root for resolving | y | y | y | y + * + * - column 1: we've the parent -> done + * - column 2, 3, 4: we use the key to find the parent + * + * on disk refs (inline or keyed) + * ============================== + * backref type | shared | indirect | shared | indirect + * information | tree | tree | data | data + * --------------------+--------+----------+--------+---------- + * parent logical | y | - | y | - + * key to resolve | - | - | - | y + * tree block logical | y | y | y | y + * root for resolving | - | y | y | y + * + * - column 1, 3: we've the parent -> done + * - column 2: we take the first key from the block to find the parent + * (see __add_missing_keys) + * - column 4: we use the key to find the parent + * + * additional information that's available but not required to find the parent + * block might help in merging entries to gain some speed. + */ + static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, u64 parent, - u64 wanted_disk_byte, int count) + struct btrfs_key *key, int level, + u64 parent, u64 wanted_disk_byte, int count) { struct __prelim_ref *ref; @@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, ref->root_id = root_id; if (key) - ref->key = *key; + ref->key_for_search = *key; else - memset(&ref->key, 0, sizeof(ref->key)); + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + ref->inode_list = NULL; ref->level = level; ref->count = count; ref->parent = parent; @@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, - struct extent_buffer *eb, int level, - u64 wanted_objectid, u64 wanted_disk_byte) + struct ulist *parents, int level, + struct btrfs_key *key, u64 wanted_disk_byte, + const u64 *extent_item_pos) { int ret; - int slot; + int slot = path->slots[level]; + struct extent_buffer *eb = path->nodes[level]; struct btrfs_file_extent_item *fi; - struct btrfs_key key; + struct extent_inode_elem *eie = NULL; u64 disk_byte; + u64 wanted_objectid = key->objectid; add_parent: - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); + if (level == 0 && extent_item_pos) { + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); + if (ret < 0) + return ret; + } + ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); if (ret < 0) return ret; @@ -89,6 +211,7 @@ add_parent: * repeat this until we don't find any additional EXTENT_DATA items. */ while (1) { + eie = NULL; ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; @@ -97,9 +220,9 @@ add_parent: eb = path->nodes[0]; for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != wanted_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) + btrfs_item_key_to_cpu(eb, key, slot); + if (key->objectid != wanted_objectid || + key->type != BTRFS_EXTENT_DATA_KEY) return 0; fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -118,8 +241,10 @@ add_parent: */ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, int search_commit_root, + u64 time_seq, struct __prelim_ref *ref, - struct ulist *parents) + struct ulist *parents, + const u64 *extent_item_pos) { struct btrfs_path *path; struct btrfs_root *root; @@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key.objectid, ref->key.type, - (unsigned long long)ref->key.offset); + (unsigned long long)ref->key_for_search.objectid, + ref->key_for_search.type, + (unsigned long long)ref->key_for_search.offset); if (ret < 0) goto out; @@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, btrfs_item_key_to_cpu(eb, &key, path->slots[0]); } - /* the last two parameters will only be used for level == 0 */ - ret = add_all_parents(root, path, parents, eb, level, key.objectid, - ref->wanted_disk_byte); + ret = add_all_parents(root, path, parents, level, &key, + ref->wanted_disk_byte, extent_item_pos); out: btrfs_free_path(path); return ret; @@ -191,8 +316,9 @@ out: * resolve all indirect backrefs from the list */ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, - struct list_head *head) + int search_commit_root, u64 time_seq, + struct list_head *head, + const u64 *extent_item_pos) { int err; int ret = 0; @@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct __prelim_ref *new_ref; struct ulist *parents; struct ulist_node *node; + struct ulist_iterator uiter; parents = ulist_alloc(GFP_NOFS); if (!parents) @@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, search_commit_root, - ref, parents); + time_seq, ref, parents, + extent_item_pos); if (err) { if (ret == 0) ret = err; @@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* we put the first parent into the ref at hand */ - node = ulist_next(parents, NULL); + ULIST_ITER_INIT(&uiter); + node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; + ref->inode_list = + node ? (struct extent_inode_elem *)node->aux : 0; /* additional parents require new refs being added here */ - while ((node = ulist_next(parents, node))) { + while ((node = ulist_next(parents, &uiter))) { new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); if (!new_ref) { ret = -ENOMEM; @@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; + new_ref->inode_list = + (struct extent_inode_elem *)node->aux; list_add(&new_ref->list, &ref->list); } ulist_reinit(parents); @@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, return ret; } +static inline int ref_for_same_block(struct __prelim_ref *ref1, + struct __prelim_ref *ref2) +{ + if (ref1->level != ref2->level) + return 0; + if (ref1->root_id != ref2->root_id) + return 0; + if (ref1->key_for_search.type != ref2->key_for_search.type) + return 0; + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) + return 0; + if (ref1->key_for_search.offset != ref2->key_for_search.offset) + return 0; + if (ref1->parent != ref2->parent) + return 0; + + return 1; +} + +/* + * read tree blocks and add keys where required. + */ +static int __add_missing_keys(struct btrfs_fs_info *fs_info, + struct list_head *head) +{ + struct list_head *pos; + struct extent_buffer *eb; + + list_for_each(pos, head) { + struct __prelim_ref *ref; + ref = list_entry(pos, struct __prelim_ref, list); + + if (ref->parent) + continue; + if (ref->key_for_search.type) + continue; + BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, + fs_info->tree_root->leafsize, 0); + BUG_ON(!eb); + btrfs_tree_read_lock(eb); + if (btrfs_header_level(eb) == 0) + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); + else + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + } + return 0; +} + /* * merge two lists of backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. + * additionally, we could even add a key range for the blocks we + * looked into to merge even more (-> replace unresolved refs by those + * having a parent). * mode = 2: merge identical parents */ static int __merge_refs(struct list_head *head, int mode) @@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode) ref1 = list_entry(pos1, struct __prelim_ref, list); - if (mode == 1 && ref1->key.type == 0) - continue; for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; pos2 = n2, n2 = pos2->next) { struct __prelim_ref *ref2; + struct __prelim_ref *xchg; ref2 = list_entry(pos2, struct __prelim_ref, list); if (mode == 1) { - if (memcmp(&ref1->key, &ref2->key, - sizeof(ref1->key)) || - ref1->level != ref2->level || - ref1->root_id != ref2->root_id) + if (!ref_for_same_block(ref1, ref2)) continue; + if (!ref1->parent && ref2->parent) { + xchg = ref1; + ref1 = ref2; + ref2 = xchg; + } ref1->count += ref2->count; } else { if (ref1->parent != ref2->parent) @@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct btrfs_key *info_key, struct list_head *prefs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; + struct btrfs_key key; + struct btrfs_key op_key = {0}; int sgn; int ret = 0; if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(info_key, &extent_op->key); + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); while ((n = rb_prev(n))) { struct btrfs_delayed_ref_node *node; @@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, + ret = __add_prelim_ref(prefs, ref->root, &op_key, ref->level + 1, 0, node->bytenr, node->ref_mod * sgn); break; @@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, + ret = __add_prelim_ref(prefs, ref->root, NULL, ref->level + 1, ref->parent, node->bytenr, node->ref_mod * sgn); @@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, } case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - ref = btrfs_delayed_node_to_data_ref(node); key.objectid = ref->objectid; @@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, } case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; ref = btrfs_delayed_node_to_data_ref(node); @@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int *info_level, - struct list_head *prefs) + int *info_level, struct list_head *prefs) { int ret = 0; int slot; @@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, * enumerate all inline refs */ leaf = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; item_size = btrfs_item_size_nr(leaf, slot); BUG_ON(item_size < sizeof(*ei)); @@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; - struct btrfs_disk_key disk_key; info = (struct btrfs_tree_block_info *)ptr; *info_level = btrfs_tree_block_level(leaf, info); - btrfs_tree_block_key(leaf, info, &disk_key); - btrfs_disk_key_to_cpu(info_key, &disk_key); ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); } else { @@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, + ret = __add_prelim_ref(prefs, 0, NULL, *info_level + 1, offset, bytenr, 1); break; @@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, info_key, - *info_level + 1, 0, bytenr, 1); + ret = __add_prelim_ref(prefs, offset, NULL, + *info_level + 1, 0, + bytenr, 1); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, - count); + ret = __add_prelim_ref(prefs, root, &key, 0, 0, + bytenr, count); break; } default: @@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, */ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int info_level, - struct list_head *prefs) + int info_level, struct list_head *prefs) { struct btrfs_root *extent_root = fs_info->extent_root; int ret; @@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, + ret = __add_prelim_ref(prefs, 0, NULL, info_level + 1, key.offset, bytenr, 1); break; @@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, info_key, - info_level + 1, 0, bytenr, 1); + ret = __add_prelim_ref(prefs, key.offset, NULL, + info_level + 1, 0, + bytenr, 1); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count); break; } default: @@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots) + u64 delayed_ref_seq, u64 time_seq, + struct ulist *refs, struct ulist *roots, + const u64 *extent_item_pos) { struct btrfs_key key; struct btrfs_path *path; - struct btrfs_key info_key = { 0 }; struct btrfs_delayed_ref_root *delayed_refs = NULL; struct btrfs_delayed_ref_head *head; int info_level = 0; @@ -645,7 +830,7 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, seq, &info_key, + ret = __add_delayed_refs(head, delayed_ref_seq, &prefs_delayed); if (ret) { spin_unlock(&delayed_refs->lock); @@ -659,16 +844,17 @@ again: struct extent_buffer *leaf; int slot; + path->slots[0]--; leaf = path->nodes[0]; - slot = path->slots[0] - 1; + slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_key, &info_level, &prefs); + &info_level, &prefs); if (ret) goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, + ret = __add_keyed_refs(fs_info, path, bytenr, info_level, &prefs); if (ret) goto out; @@ -676,21 +862,18 @@ again: } btrfs_release_path(path); - /* - * when adding the delayed refs above, the info_key might not have - * been known yet. Go over the list and replace the missing keys - */ - list_for_each_entry(ref, &prefs_delayed, list) { - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) - memcpy(&ref->key, &info_key, sizeof(ref->key)); - } list_splice_init(&prefs_delayed, &prefs); + ret = __add_missing_keys(fs_info, &prefs); + if (ret) + goto out; + ret = __merge_refs(&prefs, 1); if (ret) goto out; - ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); + ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, + &prefs, extent_item_pos); if (ret) goto out; @@ -709,7 +892,33 @@ again: BUG_ON(ret < 0); } if (ref->count && ref->parent) { - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); + struct extent_inode_elem *eie = NULL; + if (extent_item_pos && !ref->inode_list) { + u32 bsz; + struct extent_buffer *eb; + bsz = btrfs_level_size(fs_info->extent_root, + info_level); + eb = read_tree_block(fs_info->extent_root, + ref->parent, bsz, 0); + BUG_ON(!eb); + ret = find_extent_in_eb(eb, bytenr, + *extent_item_pos, &eie); + ref->inode_list = eie; + free_extent_buffer(eb); + } + ret = ulist_add_merge(refs, ref->parent, + (unsigned long)ref->inode_list, + (unsigned long *)&eie, GFP_NOFS); + if (!ret && extent_item_pos) { + /* + * we've recorded that parent, so we must extend + * its inode list here + */ + BUG_ON(!eie); + while (eie->next) + eie = eie->next; + eie->next = ref->inode_list; + } BUG_ON(ret < 0); } kfree(ref); @@ -734,6 +943,28 @@ out: return ret; } +static void free_leaf_list(struct ulist *blocks) +{ + struct ulist_node *node = NULL; + struct extent_inode_elem *eie; + struct extent_inode_elem *eie_next; + struct ulist_iterator uiter; + + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(blocks, &uiter))) { + if (!node->aux) + continue; + eie = (struct extent_inode_elem *)node->aux; + for (; eie; eie = eie_next) { + eie_next = eie->next; + kfree(eie); + } + node->aux = 0; + } + + ulist_free(blocks); +} + /* * Finds all leafs with a reference to the specified combination of bytenr and * offset. key_list_head will point to a list of corresponding keys (caller must @@ -744,7 +975,9 @@ out: */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **leafs) + u64 delayed_ref_seq, u64 time_seq, + struct ulist **leafs, + const u64 *extent_item_pos) { struct ulist *tmp; int ret; @@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, *leafs, tmp, extent_item_pos); ulist_free(tmp); if (ret < 0 && ret != -ENOENT) { - ulist_free(*leafs); + free_leaf_list(*leafs); return ret; } @@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots) + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; + struct ulist_iterator uiter; int ret; tmp = ulist_alloc(GFP_NOFS); @@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return -ENOMEM; } + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots); + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + time_seq, tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); return ret; } - node = ulist_next(tmp, node); + node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; @@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 0; } -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, - u64 orig_extent_item_objectid, - u64 extent_item_pos, u64 root, +static int iterate_leaf_refs(struct extent_inode_elem *inode_list, + u64 root, u64 extent_item_objectid, iterate_extent_inodes_t *iterate, void *ctx) { - u64 disk_byte; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *eb; - int slot; - int nritems; + struct extent_inode_elem *eie; int ret = 0; - int extent_type; - u64 data_offset; - u64 data_len; - - eb = read_tree_block(fs_info->tree_root, logical, - fs_info->tree_root->leafsize, 0); - if (!eb) - return -EIO; - - /* - * from the shared data ref, we only have the leaf but we need - * the key. thus, we must look into all items and see that we - * find one (some) with a reference to our extent item. - */ - nritems = btrfs_header_nritems(eb); - for (slot = 0; slot < nritems; ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) - continue; - - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - continue; + for (eie = inode_list; eie; eie = eie->next) { pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " - "root %llu\n", orig_extent_item_objectid, - key.objectid, key.offset, root); - ret = iterate(key.objectid, - key.offset + (extent_item_pos - data_offset), - root, ctx); + "root %llu\n", extent_item_objectid, + eie->inum, eie->offset, root); + ret = iterate(eie->inum, eie->offset, root, ctx); if (ret) { - pr_debug("stopping iteration because ret=%d\n", ret); + pr_debug("stopping iteration for %llu due to ret=%d\n", + extent_item_objectid, ret); break; } } - free_extent_buffer(eb); - return ret; } @@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list seq_elem; + struct seq_list seq_elem = {}; + struct seq_list tree_mod_seq_elem = {}; + struct ulist_iterator ref_uiter; + struct ulist_iterator root_uiter; struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", @@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); btrfs_get_delayed_seq(delayed_refs, &seq_elem); spin_unlock(&delayed_refs->lock); + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - extent_item_pos, seq_elem.seq, - &refs); - + seq_elem.seq, tree_mod_seq_elem.seq, &refs, + &extent_item_pos); if (ret) goto out; - while (!ret && (ref_node = ulist_next(refs, ref_node))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, - seq_elem.seq, &roots); + ULIST_ITER_INIT(&ref_uiter); + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, + seq_elem.seq, + tree_mod_seq_elem.seq, &roots); if (ret) break; - while (!ret && (root_node = ulist_next(roots, root_node))) { - pr_debug("root %llu references leaf %llu\n", - root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, ref_node->val, - extent_item_objectid, - extent_item_pos, root_node->val, - iterate, ctx); + ULIST_ITER_INIT(&root_uiter); + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { + pr_debug("root %llu references leaf %llu, data list " + "%#lx\n", root_node->val, ref_node->val, + ref_node->aux); + ret = iterate_leaf_refs( + (struct extent_inode_elem *)ref_node->aux, + root_node->val, extent_item_objectid, + iterate, ctx); } + ulist_free(roots); + roots = NULL; } - ulist_free(refs); + free_leaf_list(refs); ulist_free(roots); out: if (!search_commit_root) { + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_put_delayed_seq(delayed_refs, &seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 57ea2e9..c18d8ac 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots); + u64 delayed_ref_seq, u64 time_seq, + struct ulist **roots); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9b9b15f..e616f887 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -24,6 +24,20 @@ #include "ordered-data.h" #include "delayed-inode.h" +/* + * ordered_data_close is set by truncate when a file that used + * to have good data has been truncated to zero. When it is set + * the btrfs file release call will add this inode to the + * ordered operations list so that we make sure to flush out any + * new data the application may have written before commit. + */ +#define BTRFS_INODE_ORDERED_DATA_CLOSE 0 +#define BTRFS_INODE_ORPHAN_META_RESERVED 1 +#define BTRFS_INODE_DUMMY 2 +#define BTRFS_INODE_IN_DEFRAG 3 +#define BTRFS_INODE_DELALLOC_META_RESERVED 4 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 + /* in memory btrfs inode */ struct btrfs_inode { /* which subvolume this inode belongs to */ @@ -57,9 +71,6 @@ struct btrfs_inode { /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; - /* for keeping track of orphaned inodes */ - struct list_head i_orphan; - /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. @@ -78,14 +89,13 @@ struct btrfs_inode { /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; + unsigned long runtime_flags; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ u64 generation; - /* sequence number for NFS changes */ - u64 sequence; - /* * transid of the trans_handle that last modified this inode */ @@ -145,22 +155,9 @@ struct btrfs_inode { unsigned reserved_extents; /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - */ - unsigned ordered_data_close:1; - unsigned orphan_meta_reserved:1; - unsigned dummy_inode:1; - unsigned in_defrag:1; - unsigned delalloc_meta_reserved:1; - - /* * always compress this one file */ - unsigned force_compress:4; + unsigned force_compress; struct btrfs_delayed_node *delayed_node; @@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, return false; } +static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == generation && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index c053e90..9cebb1f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -103,8 +103,6 @@ #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE - #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) /* @@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx { u64 dev_bytenr; /* physical bytenr on device */ u32 len; struct btrfsic_dev_state *dev; - char *data; - struct buffer_head *bh; /* do not use if set to NULL */ + char **datav; + struct page **pagev; + void *mem_to_free; }; /* This structure is used to implement recursion without occupying @@ -243,6 +242,8 @@ struct btrfsic_state { struct btrfs_root *root; u64 max_superblock_generation; struct btrfsic_block *latest_superblock; + u32 metablock_size; + u32 datablock_size; }; static void btrfsic_block_init(struct btrfsic_block *b); @@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, static int btrfsic_process_metablock(struct btrfsic_state *state, struct btrfsic_block *block, struct btrfsic_block_data_ctx *block_ctx, - struct btrfs_header *hdr, int limit_nesting, int force_iodone_flag); +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dst, u32 offset, size_t len); static int btrfsic_create_link_to_next_block( struct btrfsic_state *state, struct btrfsic_block *block, @@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); +static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size); + char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, u8 *mapped_data, - unsigned int len, struct bio *bio, - int *bio_is_patched, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, struct buffer_head *bh, int submit_bio_bh_rw); static int btrfsic_process_written_superblock( @@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup( static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data); + u64 dev_bytenr); static struct mutex btrfsic_mutex; static int btrfsic_is_initialized; @@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int pass; BUG_ON(NULL == state); - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); return -1; @@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfsic_block *next_block; struct btrfsic_block_data_ctx tmp_next_block_ctx; struct btrfsic_block_link *l; - struct btrfs_header *hdr; - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, next_bytenr, + state->metablock_size, &tmp_next_block_ctx, mirror_num); if (ret) { @@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, BUG_ON(NULL == l); ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { + if (ret < (int)PAGE_CACHE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", (unsigned long long) @@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, return -1; } - hdr = (struct btrfs_header *)tmp_next_block_ctx.data; ret = btrfsic_process_metablock(state, next_block, &tmp_next_block_ctx, - hdr, BTRFS_MAX_LEVEL + 3, 1); btrfsic_release_block_ctx(&tmp_next_block_ctx); } @@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + return -1; + bh = __bread(superblock_bdev, dev_bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); if (NULL == bh) return -1; super_tmp = (struct btrfs_super_block *) @@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror( if (btrfs_super_bytenr(super_tmp) != dev_bytenr || strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, sizeof(super_tmp->magic)) || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || + btrfs_super_nodesize(super_tmp) != state->metablock_size || + btrfs_super_leafsize(super_tmp) != state->metablock_size || + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { brelse(bh); return 0; } @@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror( num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfsic_block_data_ctx tmp_next_block_ctx; struct btrfsic_block_link *l; - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + if (btrfsic_map_block(state, next_bytenr, + state->metablock_size, &tmp_next_block_ctx, mirror_num)) { printk(KERN_INFO "btrfsic: btrfsic_map_block(" @@ -966,13 +975,15 @@ static int btrfsic_process_metablock( struct btrfsic_state *state, struct btrfsic_block *const first_block, struct btrfsic_block_data_ctx *const first_block_ctx, - struct btrfs_header *const first_hdr, int first_limit_nesting, int force_iodone_flag) { struct btrfsic_stack_frame initial_stack_frame = { 0 }; struct btrfsic_stack_frame *sf; struct btrfsic_stack_frame *next_stack; + struct btrfs_header *const first_hdr = + (struct btrfs_header *)first_block_ctx->datav[0]; + BUG_ON(!first_hdr); sf = &initial_stack_frame; sf->error = 0; sf->i = -1; @@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame: } if (sf->i < sf->nr) { - struct btrfs_item *disk_item = leafhdr->items + sf->i; - struct btrfs_disk_key *disk_key = &disk_item->key; + struct btrfs_item disk_item; + u32 disk_item_offset = + (uintptr_t)(leafhdr->items + sf->i) - + (uintptr_t)leafhdr; + struct btrfs_disk_key *disk_key; u8 type; - const u32 item_offset = le32_to_cpu(disk_item->offset); + u32 item_offset; + if (disk_item_offset + sizeof(struct btrfs_item) > + sf->block_ctx->len) { +leaf_item_out_of_bounce_error: + printk(KERN_INFO + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data(sf->block_ctx, + &disk_item, + disk_item_offset, + sizeof(struct btrfs_item)); + item_offset = le32_to_cpu(disk_item.offset); + disk_key = &disk_item.key; type = disk_key->type; if (BTRFS_ROOT_ITEM_KEY == type) { - const struct btrfs_root_item *const root_item = - (struct btrfs_root_item *) - (sf->block_ctx->data + - offsetof(struct btrfs_leaf, items) + - item_offset); - const u64 next_bytenr = - le64_to_cpu(root_item->bytenr); + struct btrfs_root_item root_item; + u32 root_item_offset; + u64 next_bytenr; + + root_item_offset = item_offset + + offsetof(struct btrfs_leaf, items); + if (root_item_offset + + sizeof(struct btrfs_root_item) > + sf->block_ctx->len) + goto leaf_item_out_of_bounce_error; + btrfsic_read_from_block_data( + sf->block_ctx, &root_item, + root_item_offset, + sizeof(struct btrfs_root_item)); + next_bytenr = le64_to_cpu(root_item.bytenr); sf->error = btrfsic_create_link_to_next_block( @@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame: &sf->num_copies, &sf->mirror_num, disk_key, - le64_to_cpu(root_item-> + le64_to_cpu(root_item. generation)); if (sf->error) goto one_stack_frame_backwards; @@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame: if (NULL != sf->next_block) { struct btrfs_header *const next_hdr = (struct btrfs_header *) - sf->next_block_ctx.data; + sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); @@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame: } if (sf->i < sf->nr) { - struct btrfs_key_ptr *disk_key_ptr = - nodehdr->ptrs + sf->i; - const u64 next_bytenr = - le64_to_cpu(disk_key_ptr->blockptr); + struct btrfs_key_ptr key_ptr; + u32 key_ptr_offset; + u64 next_bytenr; + + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - + (uintptr_t)nodehdr; + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > + sf->block_ctx->len) { + printk(KERN_INFO + "btrfsic: node item out of bounce at logical %llu, dev %s\n", + sf->block_ctx->start, + sf->block_ctx->dev->name); + goto one_stack_frame_backwards; + } + btrfsic_read_from_block_data( + sf->block_ctx, &key_ptr, key_ptr_offset, + sizeof(struct btrfs_key_ptr)); + next_bytenr = le64_to_cpu(key_ptr.blockptr); sf->error = btrfsic_create_link_to_next_block( state, @@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame: force_iodone_flag, &sf->num_copies, &sf->mirror_num, - &disk_key_ptr->key, - le64_to_cpu(disk_key_ptr->generation)); + &key_ptr.key, + le64_to_cpu(key_ptr.generation)); if (sf->error) goto one_stack_frame_backwards; if (NULL != sf->next_block) { struct btrfs_header *const next_hdr = (struct btrfs_header *) - sf->next_block_ctx.data; + sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) @@ -1181,6 +1232,35 @@ one_stack_frame_backwards: return sf->error; } +static void btrfsic_read_from_block_data( + struct btrfsic_block_data_ctx *block_ctx, + void *dstv, u32 offset, size_t len) +{ + size_t cur; + size_t offset_in_page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(offset + len > block_ctx->len); + offset_in_page = (start_offset + offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); + BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT); + kaddr = block_ctx->datav[i]; + memcpy(dst, kaddr + offset_in_page, cur); + + dst += cur; + len -= cur; + offset_in_page = 0; + i++; + } +} + static int btrfsic_create_link_to_next_block( struct btrfsic_state *state, struct btrfsic_block *block, @@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block( if (0 == *num_copiesp) { *num_copiesp = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, *num_copiesp); @@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block( "btrfsic_create_link_to_next_block(mirror_num=%d)\n", *mirror_nump); ret = btrfsic_map_block(state, next_bytenr, - BTRFSIC_BLOCK_SIZE, + state->metablock_size, next_block_ctx, *mirror_nump); if (ret) { printk(KERN_INFO @@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block( if (limit_nesting > 0 && did_alloc_block_link) { ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { + if (ret < (int)next_block_ctx->len) { printk(KERN_INFO "btrfsic: read block @logical %llu failed!\n", (unsigned long long)next_bytenr); @@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data( u32 item_offset, int force_iodone_flag) { int ret; - struct btrfs_file_extent_item *file_extent_item = - (struct btrfs_file_extent_item *)(block_ctx->data + - offsetof(struct btrfs_leaf, - items) + item_offset); - u64 next_bytenr = - le64_to_cpu(file_extent_item->disk_bytenr) + - le64_to_cpu(file_extent_item->offset); - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); - u64 generation = le64_to_cpu(file_extent_item->generation); + struct btrfs_file_extent_item file_extent_item; + u64 file_extent_item_offset; + u64 next_bytenr; + u64 num_bytes; + u64 generation; struct btrfsic_block_link *l; + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + + item_offset; + if (file_extent_item_offset + + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + offsetof(struct btrfs_file_extent_item, disk_num_bytes)); + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", + file_extent_item.type, + (unsigned long long) + le64_to_cpu(file_extent_item.disk_bytenr)); + return 0; + } + + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > + block_ctx->len) { + printk(KERN_INFO + "btrfsic: file item out of bounce at logical %llu, dev %s\n", + block_ctx->start, block_ctx->dev->name); + return -1; + } + btrfsic_read_from_block_data(block_ctx, &file_extent_item, + file_extent_item_offset, + sizeof(struct btrfs_file_extent_item)); + next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + + le64_to_cpu(file_extent_item.offset); + generation = le64_to_cpu(file_extent_item.generation); + num_bytes = le64_to_cpu(file_extent_item.num_bytes); + generation = le64_to_cpu(file_extent_item.generation); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," " offset = %llu, num_bytes = %llu\n", - file_extent_item->type, + file_extent_item.type, (unsigned long long) - le64_to_cpu(file_extent_item->disk_bytenr), - (unsigned long long) - le64_to_cpu(file_extent_item->offset), - (unsigned long long) - le64_to_cpu(file_extent_item->num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) - return 0; + le64_to_cpu(file_extent_item.disk_bytenr), + (unsigned long long)le64_to_cpu(file_extent_item.offset), + (unsigned long long)num_bytes); while (num_bytes > 0) { u32 chunk_len; int num_copies; int mirror_num; - if (num_bytes > BTRFSIC_BLOCK_SIZE) - chunk_len = BTRFSIC_BLOCK_SIZE; + if (num_bytes > state->datablock_size) + chunk_len = state->datablock_size; else chunk_len = num_bytes; num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; if (0 == ret) kfree(multi); @@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, block_ctx_out->dev_bytenr = bytenr; block_ctx_out->start = bytenr; block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; + block_ctx_out->datav = NULL; + block_ctx_out->pagev = NULL; + block_ctx_out->mem_to_free = NULL; if (NULL != block_ctx_out->dev) { return 0; } else { @@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) { - if (NULL != block_ctx->bh) { - brelse(block_ctx->bh); - block_ctx->bh = NULL; + if (block_ctx->mem_to_free) { + unsigned int num_pages; + + BUG_ON(!block_ctx->datav); + BUG_ON(!block_ctx->pagev); + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + while (num_pages > 0) { + num_pages--; + if (block_ctx->datav[num_pages]) { + kunmap(block_ctx->pagev[num_pages]); + block_ctx->datav[num_pages] = NULL; + } + if (block_ctx->pagev[num_pages]) { + __free_page(block_ctx->pagev[num_pages]); + block_ctx->pagev[num_pages] = NULL; + } + } + + kfree(block_ctx->mem_to_free); + block_ctx->mem_to_free = NULL; + block_ctx->pagev = NULL; + block_ctx->datav = NULL; } } static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx) { - block_ctx->bh = NULL; - if (block_ctx->dev_bytenr & 4095) { + unsigned int num_pages; + unsigned int i; + u64 dev_bytenr; + int ret; + + BUG_ON(block_ctx->datav); + BUG_ON(block_ctx->pagev); + BUG_ON(block_ctx->mem_to_free); + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", (unsigned long long)block_ctx->dev_bytenr); return -1; } - if (block_ctx->len > 4096) { - printk(KERN_INFO - "btrfsic: read_block() with too huge size %d\n", - block_ctx->len); + + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + + sizeof(*block_ctx->pagev)) * + num_pages, GFP_NOFS); + if (!block_ctx->mem_to_free) return -1; + block_ctx->datav = block_ctx->mem_to_free; + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); + for (i = 0; i < num_pages; i++) { + block_ctx->pagev[i] = alloc_page(GFP_NOFS); + if (!block_ctx->pagev[i]) + return -1; } - block_ctx->bh = __bread(block_ctx->dev->bdev, - block_ctx->dev_bytenr >> 12, 4096); - if (NULL == block_ctx->bh) - return -1; - block_ctx->data = block_ctx->bh->b_data; + dev_bytenr = block_ctx->dev_bytenr; + for (i = 0; i < num_pages;) { + struct bio *bio; + unsigned int j; + DECLARE_COMPLETION_ONSTACK(complete); + + bio = bio_alloc(GFP_NOFS, num_pages - i); + if (!bio) { + printk(KERN_INFO + "btrfsic: bio_alloc() for %u pages failed!\n", + num_pages - i); + return -1; + } + bio->bi_bdev = block_ctx->dev->bdev; + bio->bi_sector = dev_bytenr >> 9; + bio->bi_end_io = btrfsic_complete_bio_end_io; + bio->bi_private = &complete; + + for (j = i; j < num_pages; j++) { + ret = bio_add_page(bio, block_ctx->pagev[j], + PAGE_CACHE_SIZE, 0); + if (PAGE_CACHE_SIZE != ret) + break; + } + if (j == i) { + printk(KERN_INFO + "btrfsic: error, failed to add a single page!\n"); + return -1; + } + submit_bio(READ, bio); + + /* this will also unplug the queue */ + wait_for_completion(&complete); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + printk(KERN_INFO + "btrfsic: read error at logical %llu dev %s!\n", + block_ctx->start, block_ctx->dev->name); + bio_put(bio); + return -1; + } + bio_put(bio); + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; + i = j; + } + for (i = 0; i < num_pages; i++) { + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + if (!block_ctx->datav[i]) { + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", + block_ctx->dev->name); + return -1; + } + } return block_ctx->len; } +static void btrfsic_complete_bio_end_io(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + static void btrfsic_dump_database(struct btrfsic_state *state) { struct list_head *elem_all; @@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state) * (note that this test fails for the super block) */ static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size) + char **datav, unsigned int num_pages) { struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; + unsigned int i; - h = (struct btrfs_header *)data; + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) + return 1; /* not metadata */ + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; + h = (struct btrfs_header *)datav[0]; if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) - fail++; + return 1; + + for (i = 0; i < num_pages; i++) { + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); + size_t sublen = i ? PAGE_CACHE_SIZE : + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); + crc = crc32c(crc, data, sublen); + } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) - crc_fail++; + return 1; - return fail || crc_fail; + return 0; /* is metadata */ } static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, - u8 *mapped_data, unsigned int len, - struct bio *bio, - int *bio_is_patched, + u64 dev_bytenr, char **mapped_datav, + unsigned int num_pages, + struct bio *bio, int *bio_is_patched, struct buffer_head *bh, int submit_bio_bh_rw) { @@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, int ret; struct btrfsic_state *state = dev_state->state; struct block_device *bdev = dev_state->bdev; + unsigned int processed_len; - WARN_ON(len > PAGE_SIZE); - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); if (NULL != bio_is_patched) *bio_is_patched = 0; +again: + if (num_pages == 0) + return; + + processed_len = 0; + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, + num_pages)); + block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, &state->block_hashtable); if (NULL != block) { @@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (block->is_superblock) { bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); + if (num_pages * PAGE_CACHE_SIZE < + BTRFS_SUPER_INFO_SIZE) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } is_metadata = 1; + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); + processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { printk(KERN_INFO @@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } if (is_metadata) { if (!block->is_superblock) { + if (num_pages * PAGE_CACHE_SIZE < + state->metablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->metablock_size; bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); + dev_bytenr); } if (block->logical_bytenr != bytenr) { printk(KERN_INFO @@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block->mirror_num, btrfsic_get_block_type(state, block)); } else { + if (num_pages * PAGE_CACHE_SIZE < + state->datablock_size) { + printk(KERN_INFO + "btrfsic: cannot work with too short bios!\n"); + return; + } + processed_len = state->datablock_size; bytenr = block->logical_bytenr; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO @@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, le64_to_cpu(block->disk_key.offset), (unsigned long long) le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation), + mapped_datav[0])->generation), (unsigned long long) state->max_superblock_generation); btrfsic_dump_tree(state); @@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, (unsigned long long)block->generation, (unsigned long long) le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation)); + mapped_datav[0])->generation)); /* it would not be safe to go on */ btrfsic_dump_tree(state); - return; + goto continue_loop; } /* @@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, len, + ret = btrfsic_map_superblock(state, bytenr, + processed_len, bdev, &block_ctx); else - ret = btrfsic_map_block(state, bytenr, len, + ret = btrfsic_map_block(state, bytenr, processed_len, &block_ctx, 0); if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", (unsigned long long)bytenr); - return; + goto continue_loop; } - block_ctx.data = mapped_data; + block_ctx.datav = mapped_datav; /* the following is required in case of writes to mirrors, * use the same that was used for the lookup */ block_ctx.dev = dev_state; @@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block->logical_bytenr = bytenr; block->is_metadata = 1; if (block->is_superblock) { + BUG_ON(PAGE_CACHE_SIZE != + BTRFS_SUPER_INFO_SIZE); ret = btrfsic_process_written_superblock( state, block, (struct btrfs_super_block *) - mapped_data); + mapped_datav[0]); if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { printk(KERN_INFO @@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, state, block, &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); } if (ret) @@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, u64 bytenr; if (!is_metadata) { + processed_len = state->datablock_size; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block (%s/%llu/?)" " !found in hash table, D.\n", dev_state->name, (unsigned long long)dev_bytenr); - if (!state->include_extent_data) - return; /* ignore that written D block */ + if (!state->include_extent_data) { + /* ignore that written D block */ + goto continue_loop; + } /* this is getting ugly for the * include_extent_data case... */ bytenr = 0; /* unknown */ block_ctx.start = bytenr; - block_ctx.len = len; - block_ctx.bh = NULL; + block_ctx.len = processed_len; + block_ctx.mem_to_free = NULL; + block_ctx.pagev = NULL; } else { + processed_len = state->metablock_size; bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); + mapped_datav[0])->bytenr); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); + dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/?)" @@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, dev_state->name, (unsigned long long)dev_bytenr); - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, - 0); + ret = btrfsic_map_block(state, bytenr, processed_len, + &block_ctx, 0); if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", (unsigned long long)dev_bytenr); - return; + goto continue_loop; } } - block_ctx.data = mapped_data; + block_ctx.datav = mapped_datav; /* the following is required in case of writes to mirrors, * use the same that was used for the lookup */ block_ctx.dev = dev_state; @@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (NULL == block) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&block_ctx); - return; + goto continue_loop; } block->dev_state = dev_state; block->dev_bytenr = dev_bytenr; @@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, if (is_metadata) { ret = btrfsic_process_metablock(state, block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); + &block_ctx, 0, 0); if (ret) printk(KERN_INFO "btrfsic: process_metablock(root @%llu)" @@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, } btrfsic_release_block_ctx(&block_ctx); } + +continue_loop: + BUG_ON(!processed_len); + dev_bytenr += processed_len; + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; + num_pages -= processed_len >> PAGE_CACHE_SHIFT; + goto again; } static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) @@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock( num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); + next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", (unsigned long long)next_bytenr, num_copies); @@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic_process_written_superblock(" "mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, next_bytenr, + BTRFS_SUPER_INFO_SIZE, &tmp_next_block_ctx, mirror_num); if (ret) { @@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data) + u64 dev_bytenr) { int num_copies; int mirror_num; @@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, int match = 0; num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - bytenr, PAGE_SIZE); + bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, bytenr, state->metablock_size, &block_ctx, mirror_num); if (ret) { printk(KERN_INFO "btrfsic:" @@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, (unsigned long long)bytenr, dev_state->name, (unsigned long long)dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, + ret = btrfsic_map_block(state, bytenr, + state->metablock_size, &block_ctx, mirror_num); if (ret) continue; @@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) (unsigned long)bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, - bh->b_data, bh->b_size, NULL, + &bh->b_data, 1, NULL, NULL, bh, rw); } else if (NULL != dev_state && (rw & REQ_FLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", rw, bh->b_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & @@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) unsigned int i; u64 dev_bytenr; int bio_is_patched; + char **mapped_datav; dev_bytenr = 512 * bio->bi_sector; bio_is_patched = 0; @@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)dev_bytenr, bio->bi_bdev); + mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, + GFP_NOFS); + if (!mapped_datav) + goto leave; for (i = 0; i < bio->bi_vcnt; i++) { - u8 *mapped_data; - - mapped_data = kmap(bio->bi_io_vec[i].bv_page); + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); + if (!mapped_datav[i]) { + while (i > 0) { + i--; + kunmap(bio->bi_io_vec[i].bv_page); + } + kfree(mapped_datav); + goto leave; + } if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE) == (dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE))) printk(KERN_INFO - "#%u: page=%p, mapped=%p, len=%u," - " offset=%u\n", + "#%u: page=%p, len=%u, offset=%u\n", i, bio->bi_io_vec[i].bv_page, - mapped_data, bio->bi_io_vec[i].bv_len, bio->bi_io_vec[i].bv_offset); - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio, &bio_is_patched, - NULL, rw); + } + btrfsic_process_written_block(dev_state, dev_bytenr, + mapped_datav, bio->bi_vcnt, + bio, &bio_is_patched, + NULL, rw); + while (i > 0) { + i--; kunmap(bio->bi_io_vec[i].bv_page); - dev_bytenr += bio->bi_io_vec[i].bv_len; } + kfree(mapped_datav); } else if (NULL != dev_state && (rw & REQ_FLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", rw, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & @@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) bio->bi_end_io = btrfsic_bio_end_io; } } +leave: mutex_unlock(&btrfsic_mutex); submit_bio(rw, bio); @@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; + if (root->nodesize != root->leafsize) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d != leafsize %d!\n", + root->nodesize, root->leafsize); + return -1; + } + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->nodesize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->leafsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { + printk(KERN_INFO + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", + root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); + return -1; + } state = kzalloc(sizeof(*state), GFP_NOFS); if (NULL == state) { printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); @@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root, state->print_mask = print_mask; state->include_extent_data = including_extent_data; state->csum_size = 0; + state->metablock_size = root->nodesize; + state->datablock_size = root->sectorsize; INIT_LIST_HEAD(&state->all_blocks_list); btrfsic_block_hashtable_init(&state->block_hashtable); btrfsic_block_link_hashtable_init(&state->block_link_hashtable); @@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root, btrfsic_block_link_free(l); } - if (b_all->is_iodone) + if (b_all->is_iodone || b_all->never_written) btrfsic_block_free(b_all); else printk(KERN_INFO "btrfs: attempt to free %c-block" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4106264..d7a96cf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/rbtree.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); + struct btrfs_path *path, int level, int slot, + int tree_mod_log); +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb); +struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid, + u64 time_seq); +struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 time_seq); struct btrfs_path *btrfs_alloc_path(void) { @@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, 0, new_root_objectid, &disk_key, level, - buf->start, 0, 1); + buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +enum mod_log_op { + MOD_LOG_KEY_REPLACE, + MOD_LOG_KEY_ADD, + MOD_LOG_KEY_REMOVE, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, + MOD_LOG_MOVE_KEYS, + MOD_LOG_ROOT_REPLACE, +}; + +struct tree_mod_move { + int dst_slot; + int nr_items; +}; + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 index; /* shifted logical */ + struct seq_list elem; + enum mod_log_op op; + + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ + int slot; + + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ + u64 generation; + + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ + struct btrfs_disk_key key; + u64 blockptr; + + /* this is used for op == MOD_LOG_MOVE_KEYS */ + struct tree_mod_move move; + + /* this is used for op == MOD_LOG_ROOT_REPLACE */ + struct tree_mod_root old_root; +}; + +static inline void +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +{ + elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); +} + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + elem->flags = 1; + spin_lock(&fs_info->tree_mod_seq_lock); + __get_tree_mod_seq(fs_info, elem); + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct seq_list *cur_elem; + struct tree_mod_elem *tm; + u64 min_seq = (u64)-1; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + BUG_ON(!(elem->flags & 1)); + spin_lock(&fs_info->tree_mod_seq_lock); + list_del(&elem->list); + + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { + if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (seq_putting > cur_elem->seq) { + /* + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ + goto out; + } + min_seq = cur_elem->seq; + } + } + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = container_of(node, struct tree_mod_elem, node); + if (tm->elem.seq > min_seq) + continue; + rb_erase(node, tm_root); + list_del(&tm->elem.list); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +out: + spin_unlock(&fs_info->tree_mod_seq_lock); +} + +/* + * key order of the log: + * index -> sequence + * + * the index is the shifted logical of the *new* root node for root replace + * operations, or the shifted logical of the affected block for all other + * operations. + */ +static noinline int +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + int ret = 0; + + BUG_ON(!tm || !tm->elem.seq); + + write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = container_of(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->index < tm->index) + new = &((*new)->rb_left); + else if (cur->index > tm->index) + new = &((*new)->rb_right); + else if (cur->elem.seq < tm->elem.seq) + new = &((*new)->rb_left); + else if (cur->elem.seq > tm->elem.seq) + new = &((*new)->rb_right); + else { + kfree(tm); + ret = -EEXIST; + goto unlock; + } + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); +unlock: + write_unlock(&fs_info->tree_mod_log_lock); + return ret; +} + +static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) { + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 1; + if (!eb) + return 0; + if (btrfs_header_level(eb) == 0) + return 1; + return 0; +} + +static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, + struct tree_mod_elem **tm_ret) +{ + struct tree_mod_elem *tm; + int seq; + + if (tree_mod_dont_log(fs_info, NULL)) + return 0; + + tm = *tm_ret = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; + + tm->elem.flags = 0; + spin_lock(&fs_info->tree_mod_seq_lock); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list, because no blocker exists. items + * are removed from the list only when the existing blocker is + * removed from the list. + */ + kfree(tm); + seq = 0; + } else { + __get_tree_mod_seq(fs_info, &tm->elem); + seq = tm->elem.seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + return seq; +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + if (op != MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + int slot, enum mod_log_op op) +{ + return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); +} + +static noinline int +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int dst_slot, int src_slot, + int nr_items, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + int i; + + if (tree_mod_dont_log(fs_info, eb)) + return 0; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING); + BUG_ON(ret < 0); + } + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *old_root, + struct extent_buffer *new_root, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + ret = tree_mod_alloc(fs_info, flags, &tm); + if (ret <= 0) + return ret; + + tm->index = new_root->start >> PAGE_CACHE_SHIFT; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = MOD_LOG_ROOT_REPLACE; + + return __tree_mod_log_insert(fs_info, tm); +} + +static struct tree_mod_elem * +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, + int smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + u64 index = start >> PAGE_CACHE_SHIFT; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = container_of(node, struct tree_mod_elem, node); + if (cur->index < index) { + node = node->rb_left; + } else if (cur->index > index) { + node = node->rb_right; + } else if (cur->elem.seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* we want the node with the highest seq */ + if (found) + BUG_ON(found->elem.seq > cur->elem.seq); + found = cur; + node = node->rb_left; + } else if (cur->elem.seq > min_seq) { + /* we want the node with the smallest seq */ + if (found) + BUG_ON(found->elem.seq < cur->elem.seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * this returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, + u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 1); +} + +/* + * this returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem * +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, 0); +} + +static inline void +tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + struct extent_buffer *src, unsigned long dst_offset, + unsigned long src_offset, int nr_items) +{ + int ret; + int i; + + if (tree_mod_dont_log(fs_info, NULL)) + return; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return; + + /* speed this up by single seq for all operations? */ + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, + int dst_offset, int src_offset, int nr_items) +{ + int ret; + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, + nr_items, GFP_NOFS); + BUG_ON(ret < 0); +} + +static inline void +tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int slot, int atomic) +{ + int ret; + + ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); + BUG_ON(ret < 0); +} + +static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + int i; + int ret; + u32 nritems; + + if (tree_mod_dont_log(fs_info, eb)) + return; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + +static inline void +tree_mod_log_set_root_pointer(struct btrfs_root *root, + struct extent_buffer *new_root_node) +{ + int ret; + tree_mod_log_free_eb(root->fs_info, root->node); + ret = tree_mod_log_insert_root(root->fs_info, root->node, + new_root_node, GFP_NOFS); + BUG_ON(ret < 0); +} + /* * check if the tree block can be shared by multiple trees */ @@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } + /* + * don't log freeing in case we're freeing the root node, this + * is done by tree_mod_log_set_root_pointer later + */ + if (buf != root->node && btrfs_header_level(buf) != 0) + tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, root->root_key.objectid, &disk_key, - level, search_start, empty_size, 1); + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; extent_buffer_get(cow); + tree_mod_log_set_root_pointer(root, cow); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; WARN_ON(trans->transid != btrfs_header_generation(parent)); + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +/* + * returns the logical address of the oldest predecessor of the given root. + * entries older than time_seq are ignored. + */ +static struct tree_mod_elem * +__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = root->node->start; + int looped = 0; + + if (!time_seq) + return 0; + + /* + * the very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). this has the index of the *new* + * root, making it the very first operation that's logged for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(fs_info, root_logical, + time_seq); + if (!looped && !tm) + return 0; + /* + * we must have key remove operations in the log before the + * replace operation. + */ + BUG_ON(!tm); + + if (tm->op != MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + BUG_ON(root_logical == root->node->start); + looped = 1; + } + + return found; +} + +/* + * tm is a pointer to the first operation to rewind within eb. then, all + * previous operations will be rewinded (until we reach something older than + * time_seq). + */ +static void +__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + while (tm && tm->elem.seq >= time_seq) { + /* + * all the operations are recorded with the operator used for + * the modification. as we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case MOD_LOG_KEY_ADD: + if (tm->slot != n - 1) { + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->slot + 1); + memmove_extent_buffer(eb, o_dst, o_src, p_size); + } + n--; + break; + case MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case MOD_LOG_ROOT_REPLACE: + /* + * this operation is special. for roots, this must be + * handled explicitly before rewinding. + * for non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. in the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. we simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = container_of(next, struct tree_mod_elem, node); + if (tm->index != first_tm->index) + break; + } + btrfs_set_header_nritems(eb, n); +} + +static struct extent_buffer * +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(eb->start, + fs_info->tree_root->nodesize); + BUG_ON(!eb_rewin); + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + BUG_ON(!eb_rewin); + } + + extent_buffer_get(eb_rewin); + free_extent_buffer(eb); + + __tree_mod_log_rewind(eb_rewin, time_seq, tm); + + return eb_rewin; +} + +static inline struct extent_buffer * +get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + struct extent_buffer *eb; + struct tree_mod_root *old_root; + u64 old_generation; + + tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + if (!tm) + return root->node; + + old_root = &tm->old_root; + old_generation = tm->generation; + + tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); + /* + * there was an item in the log when __tree_mod_log_oldest_root + * returned. this one must not go away, because the time_seq passed to + * us must be blocking its removal. + */ + BUG_ON(!tm); + + if (old_root->logical == root->node->start) { + /* there are logged operations for the current root */ + eb = btrfs_clone_extent_buffer(root->node); + } else { + /* there's a root replace operation for the current root */ + eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, + root->nodesize); + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, root->root_key.objectid); + } + if (!eb) + return NULL; + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + __tree_mod_log_rewind(eb, time_seq, tm); + + return eb; +} + static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) return -EIO; } else if (!uptodate) { - btrfs_read_buffer(cur, gen); + err = btrfs_read_buffer(cur, gen); + if (err) { + free_extent_buffer(cur); + return err; + } } } if (search_start == 0) @@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb, static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot) { - if (level == 0) { + if (level == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), key, btrfs_header_nritems(eb), slot); - } else { + else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), sizeof(struct btrfs_key_ptr), key, btrfs_header_nritems(eb), slot); - } - return -1; } int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, @@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } + tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1); + del_ptr(trans, root, path, level + 1, pslot + 1, 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1, 0); + btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &right_key, pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot); + del_ptr(trans, root, path, level + 1, pslot, 1); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); + btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, + pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); + tree_mod_log_set_node_key(root->fs_info, parent, + &disk_key, pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1496,7 +2158,7 @@ static int read_block_for_search(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *p, struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key) + struct btrfs_key *key, u64 time_seq) { u64 blocknr; u64 gen; @@ -1850,7 +2512,7 @@ cow_done: } err = read_block_for_search(trans, root, p, - &b, level, slot, key); + &b, level, slot, key, 0); if (err == -EAGAIN) goto again; if (err) { @@ -1922,6 +2584,115 @@ done: } /* + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the + * current state of the tree together with the operations recorded in the tree + * modification log to search for the key in a previous version of this tree, as + * denoted by the time_seq parameter. + * + * Naturally, there is no support for insert, delete or cow operations. + * + * The resulting path and return value will be set up as if we called + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. + */ +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq) +{ + struct extent_buffer *b; + int slot; + int ret; + int err; + int level; + int lowest_unlock = 1; + u8 lowest_level = 0; + + lowest_level = p->lowest_level; + WARN_ON(p->nodes[0] != NULL); + + if (p->search_commit_root) { + BUG_ON(time_seq); + return btrfs_search_slot(NULL, root, key, p, 0, 0); + } + +again: + b = get_old_root(root, time_seq); + extent_buffer_get(b); + level = btrfs_header_level(b); + btrfs_tree_read_lock(b); + p->locks[level] = BTRFS_READ_LOCK; + + while (b) { + level = btrfs_header_level(b); + p->nodes[level] = b; + btrfs_clear_path_blocking(p, NULL, 0); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + */ + btrfs_unlock_up_safe(p, level + 1); + + ret = bin_search(b, key, level, &slot); + + if (level != 0) { + int dec = 0; + if (ret && slot > 0) { + dec = 1; + slot -= 1; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(NULL, root, p, &b, level, + slot, key, time_seq); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + err = btrfs_try_tree_read_lock(b); + if (!err) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + btrfs_clear_path_blocking(p, b, + BTRFS_READ_LOCK); + } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; + b = tree_mod_log_rewind(root->fs_info, b, time_seq); + if (b != p->nodes[level]) { + btrfs_tree_unlock_rw(p->nodes[level], + p->locks[level]); + p->locks[level] = 0; + p->nodes[level] = b; + } + } else { + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } + } + ret = 1; +done: + if (!p->leave_spinning) + btrfs_set_path_blocking(p); + if (ret < 0) + btrfs_release_path(p); + + return ret; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops @@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, if (!path->nodes[i]) break; t = path->nodes[i]; + tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { + tree_mod_log_eb_move(root->fs_info, src, 0, push_items, + src_nritems - push_items); memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), (src_nritems - push_items) * @@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); + tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, &lower_key, - level, root->node->start, 0, 0); + level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; + tree_mod_log_set_root_pointer(root, c); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, static void insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) + int slot, int level, int tree_mod_log) { struct extent_buffer *lower; int nritems; + int ret; BUG_ON(!path->nodes[level]); btrfs_assert_tree_locked(path->nodes[level]); @@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); if (slot != nritems) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + slot, nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } + if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + MOD_LOG_KEY_ADD); + BUG_ON(ret < 0); + } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); WARN_ON(trans->transid == 0); @@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - &disk_key, level, c->start, 0, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(split); insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + path->slots[level + 1] + 1, level + 1, 1); if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + path->slots[1] + 1, 1, 0); btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3004,7 +3793,7 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); @@ -3028,7 +3817,7 @@ again: if (mid <= slot) { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + path->slots[1] + 1, 1, 0); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3037,7 +3826,7 @@ again: } else { btrfs_set_header_nritems(right, 0); insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1); + path->slots[1], 1, 0); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * empty a node. */ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) + struct btrfs_path *path, int level, int slot, + int tree_mod_log) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; + int ret; nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { + if (tree_mod_log && level) + tree_mod_log_eb_move(root->fs_info, parent, slot, + slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); + } else if (tree_mod_log && level) { + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); } + nritems--; btrfs_set_header_nritems(parent, nritems); if (nritems == 0 && parent == root->node) { @@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1]); + del_ptr(trans, root, path, 1, path->slots[1], 1); /* * btrfs_free_extent is expensive, we want to make sure we @@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); extent_buffer_get(leaf); - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + btrfs_free_tree_block(trans, root, leaf, 0, 1); free_extent_buffer_stale(leaf); } /* @@ -4271,7 +5070,7 @@ again: next = c; next_rw_lock = path->locks[level]; ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key); + slot, &key, 0); if (ret == -EAGAIN) goto again; @@ -4308,7 +5107,7 @@ again: break; ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key); + 0, &key, 0); if (ret == -EAGAIN) goto again; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd7233..0236d03 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* ioprio of readahead is set to idle */ +#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -823,6 +826,14 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) @@ -1129,6 +1140,15 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + /* this protects tree_mod_seq_list */ + spinlock_t tree_mod_seq_lock; + atomic_t tree_mod_seq; + struct list_head tree_mod_seq_list; + + /* this protects tree_mod_log */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; @@ -1375,7 +1395,7 @@ struct btrfs_root { struct list_head root_list; spinlock_t orphan_lock; - struct list_head orphan_list; + atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; int orphan_item_inserted; int orphan_cleanup_state; @@ -1508,6 +1528,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_BALANCE_ITEM_KEY 248 /* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). + */ +#define BTRFS_DEV_STATS_KEY 249 + +/* * string items are for debugging. They just store a short string of * data in the FS */ @@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* btrfs_dev_stats_item */ +static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index) +{ + u64 val; + + read_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); + return val; +} + +static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, + struct btrfs_dev_stats_item *ptr, + int index, u64 val) +{ + write_extent_buffer(eb, &val, + offsetof(struct btrfs_dev_stats_item, values) + + ((unsigned long)ptr) + (index * sizeof(u64)), + sizeof(val)); +} + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; @@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow); + u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow); + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); +int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *p, u64 time_seq); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, @@ -2922,7 +2974,6 @@ int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); int btrfs_dirty_inode(struct inode *inode); -int btrfs_update_time(struct file *file); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); @@ -3098,4 +3149,23 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; + u32 flags; +}; + +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); + +static inline int is_fstree(u64 rootid) +{ + if (rootid == BTRFS_FS_TREE_OBJECTID || + (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 03e3748..c18d044 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { spin_lock(&BTRFS_I(inode)->lock); - if (BTRFS_I(inode)->delalloc_meta_reserved) { - BTRFS_I(inode)->delalloc_meta_reserved = 0; + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { spin_unlock(&BTRFS_I(inode)->lock); release = true; goto migrate; @@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); btrfs_set_stack_inode_generation(inode_item, BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); + btrfs_set_stack_inode_sequence(inode_item, inode->i_version); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); @@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 69f22e3..13ae7b0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) + if (is_fstree(ref_root)) seq = inc_delayed_seq(delayed_refs); ref->seq = seq; @@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (need_ref_seq(for_cow, ref_root)) + if (is_fstree(ref_root)) seq = inc_delayed_seq(delayed_refs); ref->seq = seq; @@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && + if (!is_fstree(ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + return 0; } @@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && + if (!is_fstree(ref_root) && waitqueue_active(&delayed_refs->seq_wait)) wake_up(&delayed_refs->seq_wait); spin_unlock(&delayed_refs->lock); + return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d8f244d..413927f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); -struct seq_list { - struct list_head list; - u64 seq; -}; - static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) { assert_spin_locked(&delayed_refs->lock); @@ -230,25 +225,6 @@ int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, u64 seq); /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - -/* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e1fe74a..7ae51de 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); - INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); @@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); + atomic_set(&root->orphan_inodes, 0); root->log_batch = 0; root->log_transid = 0; root->last_log_commit = 0; @@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0, 0); + 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv); btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); @@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; fs_info->free_chunk_space = 0; + fs_info->tree_mod_log = RB_ROOT; /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); @@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb, BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); - BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); @@ -2353,6 +2359,13 @@ retry_root_backup: fs_info->generation = generation; fs_info->last_trans_committed = generation; + ret = btrfs_init_dev_stats(fs_info); + if (ret) { + printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", + ret); + goto fail_block_groups; + } + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); @@ -2556,18 +2569,19 @@ recovery_tree_root: static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { + struct btrfs_device *device = (struct btrfs_device *) + bh->b_private; + printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); + "I/O error on %s\n", device->name); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); } unlock_buffer(bh); put_bh(bh); @@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device, set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; } /* @@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait) } if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; + if (!bio_flagged(bio, BIO_EOPNOTSUPP)) + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_FLUSH_ERRS); } /* drop the reference from the wait == 0 run */ @@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, return ret; } -/* Kill all outstanding I/O */ -void btrfs_abort_devices(struct btrfs_root *root) -{ - struct list_head *head; - struct btrfs_device *dev; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - head = &root->fs_info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - blk_abort_queue(dev->bdev->bd_disk->queue); - } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -} - void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); @@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state) -{ - struct super_block *sb = page->mapping->host->i_sb; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - btrfs_error(fs_info, -EIO, - "Error occured while writing out btree at %llu", start); - return -EIO; -} - static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, @@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = { .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, - .writepage_io_failed_hook = btree_writepage_io_failed_hook, }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index ab1830a..05b3fab 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e887ee6..614f34a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -13,15 +13,14 @@ parent_root_objectid) / 4) #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) -static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, - int connectable) +static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { struct btrfs_fid *fid = (struct btrfs_fid *)fh; - struct inode *inode = dentry->d_inode; int len = *max_len; int type; - if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { @@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, fid->root_objectid = BTRFS_I(inode)->root->objectid; fid->gen = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; + if (parent) { u64 parent_root_id; - spin_lock(&dentry->d_lock); - - parent = dentry->d_parent->d_inode; fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; parent_root_id = BTRFS_I(parent)->root->objectid; - spin_unlock(&dentry->d_lock); - if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49fd7b6..4b5a1e1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3578,7 +3578,7 @@ again: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); return ret; } @@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode) BTRFS_I(inode)->outstanding_extents--; if (BTRFS_I(inode)->outstanding_extents == 0 && - BTRFS_I(inode)->delalloc_meta_reserved) { + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) drop_inode_space = 1; - BTRFS_I(inode)->delalloc_meta_reserved = 0; - } /* * If we have more or the same amount of outsanding extents than we have @@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * Add an item to reserve for updating the inode when we complete the * delalloc io. */ - if (!BTRFS_I(inode)->delalloc_meta_reserved) { + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { nr_extents++; extra_reserve = 1; } @@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); if (extra_reserve) { - BTRFS_I(inode)->delalloc_meta_reserved = 1; + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); nr_extents--; } BTRFS_I(inode)->reserved_extents += nr_extents; @@ -5217,7 +5218,7 @@ out: void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) + u64 parent, int last_ref) { struct btrfs_block_group_cache *cache = NULL; int ret; @@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, 0); BUG_ON(ret); /* -ENOMEM */ } @@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow) + u64 hint, u64 empty_size) { struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; @@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); + extent_op, 0); BUG_ON(ret); /* -ENOMEM */ } return buf; @@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9018a0..2c8f7b2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return parent; } - entry = rb_entry(node, struct tree_entry, rb_node); rb_link_node(node, parent, p); rb_insert_color(node, root); return NULL; @@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1) + * it will optionally wake up any one waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree @@ -570,10 +569,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - clear_state_bit(tree, state, &bits, wake); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; + state = clear_state_bit(tree, state, &bits, wake); + goto next; } goto search_again; } @@ -781,7 +778,6 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; @@ -789,20 +785,15 @@ hit_next: } set_state_bits(tree, state, &bits); - cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -845,6 +836,10 @@ hit_next: if (last_end == (u64)-1) goto out; start = last_end + 1; + state = next_state(state); + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -994,21 +989,14 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - struct rb_node *next_node; - set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; goto search_again; } @@ -1042,10 +1030,13 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start < end && state && state->start == start && + !need_resched()) + goto hit_next; } goto search_again; } @@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, cached_state, mask); } -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, - gfp_t mask) +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, cached_state, mask); @@ -1293,7 +1283,7 @@ out: * returned if we find something, and *start_ret and *end_ret are * set to reflect the state struct that was found. * - * If nothing was found, 1 is returned, < 0 on error + * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, int bits) @@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { /* try to remap that extent elsewhere? */ bio_put(bio); + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } @@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) uptodate = 0; } - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(NULL, page, - start, end, NULL); - /* Writeback already completed */ - if (ret == 0) - return 1; - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } @@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror); - if (ret) + if (ret) { + /* no IO indicated but software detected errors + * in the block, either checksum errors or + * issues with the contents */ + struct btrfs_root *root = + BTRFS_I(page->mapping->host)->root; + struct btrfs_device *device; + uptodate = 0; - else + device = btrfs_find_device_for_logical( + root, start, mirror); + if (device) + btrfs_dev_stat_inc_and_print(device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + } else { clean_io_failure(start, page); + } } if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { @@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; int rw = (epd->sync_io ? WRITE_SYNC : WRITE); - int ret; + int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); @@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, eb->start = start; eb->len = len; eb->tree = tree; + eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->write_locks, 0); atomic_set(&eb->read_locks, 0); @@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, return eb; } +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +{ + unsigned long i; + struct page *p; + struct extent_buffer *new; + unsigned long num_pages = num_extent_pages(src->start, src->len); + + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); + if (new == NULL) + return NULL; + + for (i = 0; i < num_pages; i++) { + p = alloc_page(GFP_ATOMIC); + BUG_ON(!p); + attach_extent_buffer_page(new, p); + WARN_ON(PageDirty(p)); + SetPageUptodate(p); + new->pages[i] = p; + } + + copy_extent_buffer(new, src, 0, 0, src->len); + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); + + return new; +} + +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +{ + struct extent_buffer *eb; + unsigned long num_pages = num_extent_pages(0, len); + unsigned long i; + + eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); + if (!eb) + return NULL; + + for (i = 0; i < num_pages; i++) { + eb->pages[i] = alloc_page(GFP_ATOMIC); + if (!eb->pages[i]) + goto err; + } + set_extent_buffer_uptodate(eb); + btrfs_set_header_nritems(eb, 0); + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + return eb; +err: + for (i--; i > 0; i--) + __free_page(eb->pages[i]); + __free_extent_buffer(eb); + return NULL; +} + static int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || @@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, unsigned long start_idx) { unsigned long index; + unsigned long num_pages; struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); BUG_ON(extent_buffer_under_io(eb)); - index = num_extent_pages(eb->start, eb->len); + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; if (start_idx >= index) return; do { index--; page = extent_buffer_page(eb, index); - if (page) { + if (page && mapped) { spin_lock(&page->mapping->private_lock); /* * We do this since we'll remove the pages after we've @@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, } spin_unlock(&page->mapping->private_lock); + } + if (page) { /* One for when we alloced the page */ page_cache_release(page); } @@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - struct extent_io_tree *tree = eb->tree; + if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + } else { + struct extent_io_tree *tree = eb->tree; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, - eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + } /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb, 0); @@ -4260,6 +4318,10 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) + atomic_dec(&eb->refs); + + if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b516c3b..25900af 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,6 +39,7 @@ #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 #define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_DUMMY 9 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -75,9 +76,6 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); +int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); +struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); +struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 53bf2d7..70dc8ca 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -65,6 +65,21 @@ struct inode_defrag { int cycled; }; +static int __compare_inode_defrag(struct inode_defrag *defrag1, + struct inode_defrag *defrag2) +{ + if (defrag1->root > defrag2->root) + return 1; + else if (defrag1->root < defrag2->root) + return -1; + else if (defrag1->ino > defrag2->ino) + return 1; + else if (defrag1->ino < defrag2->ino) + return -1; + else + return 0; +} + /* pop a record for an inode into the defrag tree. The lock * must be held already * @@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; + int ret; p = &root->fs_info->defrag_inodes.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - if (defrag->ino < entry->ino) + ret = __compare_inode_defrag(defrag, entry); + if (ret < 0) p = &parent->rb_left; - else if (defrag->ino > entry->ino) + else if (ret > 0) p = &parent->rb_right; else { /* if we're reinserting an entry for @@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode, goto exists; } } - BTRFS_I(inode)->in_defrag = 1; + set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); return; @@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (btrfs_fs_closing(root->fs_info)) return 0; - if (BTRFS_I(inode)->in_defrag) + if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) return 0; if (trans) @@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->root = root->root_key.objectid; spin_lock(&root->fs_info->defrag_inodes_lock); - if (!BTRFS_I(inode)->in_defrag) + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) __btrfs_add_inode_defrag(inode, defrag); else kfree(defrag); @@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, /* * must be called with the defrag_inodes lock held */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, +struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, + u64 root, u64 ino, struct rb_node **next) { struct inode_defrag *entry = NULL; + struct inode_defrag tmp; struct rb_node *p; struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; p = info->defrag_inodes.rb_node; while (p) { parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - if (ino < entry->ino) + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) p = parent->rb_left; - else if (ino > entry->ino) + else if (ret > 0) p = parent->rb_right; else return entry; } if (next) { - while (parent && ino > entry->ino) { + while (parent && __compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); entry = rb_entry(parent, struct inode_defrag, rb_node); } @@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; u64 first_ino = 0; + u64 root_objectid = 0; int num_defrag; int defrag_batch = 1024; @@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) n = NULL; /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); + defrag = btrfs_find_defrag_inode(fs_info, root_objectid, + first_ino, &n); if (!defrag) { - if (n) - defrag = rb_entry(n, struct inode_defrag, rb_node); - else if (first_ino) { + if (n) { + defrag = rb_entry(n, struct inode_defrag, + rb_node); + } else if (root_objectid || first_ino) { + root_objectid = 0; first_ino = 0; continue; } else { @@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* remove it from the rbtree */ first_ino = defrag->ino + 1; + root_objectid = defrag->root; rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); if (btrfs_fs_closing(fs_info)) @@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) goto next; /* do a chunk of defrag */ - BTRFS_I(inode)->in_defrag = 0; + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); range.start = defrag->last_offset; num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, defrag_batch); @@ -1404,12 +1433,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - err = btrfs_update_time(file); + err = file_update_time(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; } - BTRFS_I(inode)->sequence++; start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { @@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) { btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); @@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_btrfs_sync_file(file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; mutex_lock(&inode->i_mutex); - /* we wait first, since the writeback may change the inode */ + /* + * we wait first, since the writeback may change the inode, also wait + * ordered range does a filemape_write_and_wait_range which is why we + * don't do it above like other file systems. + */ root->log_batch++; - btrfs_wait_ordered_range(inode, 0, (u64)-1); + btrfs_wait_ordered_range(inode, start, end); root->log_batch++; /* @@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * syncing */ smp_mb(); - if (BTRFS_I(inode)->last_trans <= + if (btrfs_inode_in_log(inode, root->fs_info->generation) || + BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 202008e..81296c5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -33,6 +33,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); +static void unlink_free_space(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, return ERR_PTR(-ENOENT); } - inode->i_mapping->flags &= ~__GFP_FS; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); return inode; } @@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) { - u64 *val; + __le64 *val; io_ctl_map_page(io_ctl, 1); @@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) { - u64 *gen; + __le64 *gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit @@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, return 0; } +/* + * Since we attach pinned extents after the fact we can have contiguous sections + * of free space that are split up in entries. This poses a problem with the + * tree logging stuff since it could have allocated across what appears to be 2 + * entries since we would have merged the entries when adding the pinned extents + * back to the free space cache. So run through the space cache that we just + * loaded and merge contiguous entries. This will make the log replay stuff not + * blow up and it will make for nicer allocator behavior. + */ +static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *e, *prev = NULL; + struct rb_node *n; + +again: + spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + e = rb_entry(n, struct btrfs_free_space, offset_index); + if (!prev) + goto next; + if (e->bitmap || prev->bitmap) + goto next; + if (prev->offset + prev->bytes == e->offset) { + unlink_free_space(ctl, prev); + unlink_free_space(ctl, e); + prev->bytes += e->bytes; + kmem_cache_free(btrfs_free_space_cachep, e); + link_free_space(ctl, prev); + prev = NULL; + spin_unlock(&ctl->tree_lock); + goto again; + } +next: + prev = e; + } + spin_unlock(&ctl->tree_lock); +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) @@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } io_ctl_drop_pages(&io_ctl); + merge_space_tree(ctl); ret = 1; out: io_ctl_free(&io_ctl); @@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - goto out; + btrfs_wait_ordered_range(inode, 0, (u64)-1); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ceb7b9c..f6ab6f5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { static int btrfs_setsize(struct inode *inode, loff_t newsize); static int btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, compress_type, compressed_pages); - if (ret) { + if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); return ret; + } else if (ret == -ENOSPC) { + return 1; } + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(root, inode)) metadata = 2; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); - if (ret) - return ret; - if (!(rw & REQ_WRITE)) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); + if (ret) + return ret; + if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); @@ -1815,25 +1818,24 @@ out: * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { + struct inode *inode = ordered_extent->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans = NULL; - struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; int compress_type = 0; int ret; bool nolock; - ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1); - if (!ret) - return 0; - BUG_ON(!ordered_extent); /* Logic error */ - nolock = btrfs_is_free_space_inode(root, inode); + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { + ret = -EIO; + goto out; + } + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); @@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset, ordered_extent->len); } - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); + if (ret < 0) { btrfs_abort_transaction(trans, root, ret); - goto out; + goto out_unlock; } add_pending_csums(trans, inode, ordered_extent->file_offset, @@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, root, ret); - goto out; + goto out_unlock; } } ret = 0; +out_unlock: + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); out: if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); @@ -1919,26 +1923,57 @@ out: btrfs_end_transaction(trans, root); } + if (ret) + clear_extent_uptodate(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, NULL, GFP_NOFS); + + /* + * This needs to be dont to make sure anybody waiting knows we are done + * upating everything for this ordered extent. + */ + btrfs_remove_ordered_extent(inode, ordered_extent); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - return 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); - goto out; + return ret; +} + +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); } static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct btrfs_workers *workers; + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - return btrfs_finish_ordered_io(page->mapping->host, start, end); + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) + return 0; + + ordered_extent->work.func = finish_ordered_fn; + ordered_extent->work.flags = 0; + + if (btrfs_is_free_space_inode(root, inode)) + workers = &root->fs_info->endio_freespace_worker; + else + workers = &root->fs_info->endio_write_workers; + btrfs_queue_worker(workers, &ordered_extent->work); + + return 0; } /* @@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; int ret; - if (!list_empty(&root->orphan_list) || + if (atomic_read(&root->orphan_inodes) || root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) return; spin_lock(&root->orphan_lock); - if (!list_empty(&root->orphan_list)) { + if (atomic_read(&root->orphan_inodes)) { spin_unlock(&root->orphan_lock); return; } @@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) block_rsv = NULL; } - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { #if 0 /* * For proper ENOSPC handling, we should do orphan @@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; #endif insert = 1; + atomic_dec(&root->orphan_inodes); } - if (!BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 1; + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) reserve = 1; - } spin_unlock(&root->orphan_lock); /* grab metadata reservation from transaction handle */ @@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); btrfs_abort_transaction(trans, root, ret); return ret; } @@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) int ret = 0; spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - list_del_init(&BTRFS_I(inode)->i_orphan); + if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) delete_item = 1; - } - if (BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 0; + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) release_rsv = 1; - } spin_unlock(&root->orphan_lock); if (trans && delete_item) { @@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ } - if (release_rsv) + if (release_rsv) { btrfs_orphan_release_metadata(inode); + atomic_dec(&root->orphan_inodes); + } return 0; } @@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } + printk(KERN_ERR "auto deleting %Lu\n", + found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ @@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); + set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { @@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode) inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); @@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_sequence(leaf, item, inode->i_version); btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); @@ -2752,6 +2790,8 @@ err: goto out; btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(inode); + inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; btrfs_update_inode(trans, root, dir); out: @@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, dir); if (ret) @@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) * any new writes get down to disk quickly. */ if (newsize == 0) - BTRFS_I(inode)->ordered_data_close = 1; + set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags); /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); @@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid) { setattr_copy(inode, attr); + inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) @@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode) btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { - BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)); goto no_delete; } @@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); - BTRFS_I(inode)->dummy_inode = 1; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; inode->i_op = &btrfs_dir_ro_inode_operations; @@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; bool nolock = false; - if (BTRFS_I(inode)->dummy_inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) @@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode) struct btrfs_trans_handle *trans; int ret; - if (BTRFS_I(inode)->dummy_inode) + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; trans = btrfs_join_transaction(root); @@ -4431,46 +4475,18 @@ int btrfs_dirty_inode(struct inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ -int btrfs_update_time(struct file *file) +static int btrfs_update_time(struct inode *inode, struct timespec *now, + int flags) { - struct inode *inode = file->f_path.dentry->d_inode; - struct timespec now; - int ret; - enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; - - /* First try to exhaust all avenues to not sync */ - if (IS_NOCMTIME(inode)) - return 0; - - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) - sync_it = S_MTIME; - - if (!timespec_equal(&inode->i_ctime, &now)) - sync_it |= S_CTIME; - - if (IS_I_VERSION(inode)) - sync_it |= S_VERSION; - - if (!sync_it) - return 0; - - /* Finally allowed to write? Takes lock. */ - if (mnt_want_write_file(file)) - return 0; - - /* Only change inode inside the lock region */ - if (sync_it & S_VERSION) + if (flags & S_VERSION) inode_inc_iversion(inode); - if (sync_it & S_CTIME) - inode->i_ctime = now; - if (sync_it & S_MTIME) - inode->i_mtime = now; - ret = btrfs_dirty_inode(inode); - if (!ret) - mark_inode_dirty_sync(inode); - mnt_drop_write(file->f_path.mnt); - return ret; + if (flags & S_CTIME) + inode->i_ctime = *now; + if (flags & S_MTIME) + inode->i_mtime = *now; + if (flags & S_ATIME) + inode->i_atime = *now; + return btrfs_dirty_inode(inode); } /* @@ -4730,6 +4746,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); + inode_inc_iversion(parent_inode); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, parent_inode); if (ret) @@ -4937,6 +4954,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_inc_nlink(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -5903,9 +5921,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; - struct extent_state *cached_state = NULL; u64 ordered_offset = dip->logical_offset; u64 ordered_bytes = dip->bytes; int ret; @@ -5915,73 +5931,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes); + ordered_bytes, !err); if (!ret) goto out_test; - BUG_ON(!ordered); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - err = -ENOMEM; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret) - err = btrfs_update_inode_fallback(trans, root, inode); - goto out; - } - - lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, 0, - &cached_state); - - if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - ret = btrfs_mark_extent_written(trans, inode, - ordered->file_offset, - ordered->file_offset + - ordered->len); - if (ret) { - err = ret; - goto out_unlock; - } - } else { - ret = insert_reserved_file_extent(trans, inode, - ordered->file_offset, - ordered->start, - ordered->disk_len, - ordered->len, - ordered->len, - 0, 0, 0, - BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered->file_offset, ordered->len); - if (ret) { - err = ret; - WARN_ON(1); - goto out_unlock; - } - } - - add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode_fallback(trans, root, inode); - ret = 0; -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, - &cached_state, GFP_NOFS); -out: - btrfs_delalloc_release_metadata(inode, ordered->len); - btrfs_end_transaction(trans, root); - ordered_offset = ordered->file_offset + ordered->len; - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - + ordered->work.func = finish_ordered_fn; + ordered->work.flags = 0; + btrfs_queue_worker(&root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -5990,12 +5947,12 @@ out_test: if (ordered_offset < dip->logical_offset + dip->bytes) { ordered_bytes = dip->logical_offset + dip->bytes - ordered_offset; + ordered = NULL; goto again; } out_done: bio->bi_private = dip->private; - kfree(dip->csums); kfree(dip); /* If we had an error make sure to clear the uptodate flag */ @@ -6063,9 +6020,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int ret; bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto err; + + if (!write) { + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + } if (skip_sum) goto map; @@ -6485,13 +6445,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) static void btrfs_invalidatepage(struct page *page, unsigned long offset) { + struct inode *inode = page->mapping->host; struct extent_io_tree *tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - /* * we have the page locked, so new writeback can't start, * and the dirty bit won't be cleared while we are here. @@ -6501,13 +6461,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ wait_on_page_writeback(page); - tree = &BTRFS_I(page->mapping->host)->io_tree; + tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(page->mapping->host, + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* @@ -6522,9 +6482,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) * whoever cleared the private bit is responsible * for the finish_ordered_io */ - if (TestClearPagePrivate2(page)) { - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + if (TestClearPagePrivate2(page) && + btrfs_dec_test_ordered_pending(inode, &ordered, page_start, + PAGE_CACHE_SIZE, 1)) { + btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); cached_state = NULL; @@ -6576,7 +6537,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (!ret) { - ret = btrfs_update_time(vma->vm_file); + ret = file_update_time(vma->vm_file); reserved = 1; } if (ret) { @@ -6771,7 +6732,8 @@ static int btrfs_truncate(struct inode *inode) * using truncate to replace the contents of the file will * end up with a zero length file after a crash. */ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) btrfs_add_ordered_operation(trans, root, inode); while (1) { @@ -6894,7 +6856,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->root = NULL; ei->space_info = NULL; ei->generation = 0; - ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; @@ -6909,11 +6870,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->outstanding_extents = 0; ei->reserved_extents = 0; - ei->ordered_data_close = 0; - ei->orphan_meta_reserved = 0; - ei->dummy_inode = 0; - ei->in_defrag = 0; - ei->delalloc_meta_reserved = 0; + ei->runtime_flags = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; @@ -6927,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->log_mutex); mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); - INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); @@ -6972,13 +6928,12 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags)) { printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", (unsigned long long)btrfs_ino(inode)); - list_del_init(&BTRFS_I(inode)->i_orphan); + atomic_dec(&root->orphan_inodes); } - spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -7193,6 +7148,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) btrfs_add_ordered_operation(trans, root, old_inode); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; @@ -7219,6 +7177,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_inode) { + inode_inc_iversion(new_inode); new_inode->i_ctime = CURRENT_TIME; if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { @@ -7490,6 +7449,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && @@ -7647,6 +7607,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -7657,6 +7618,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, @@ -7670,6 +7632,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .update_time = btrfs_update_time, }; const struct dentry_operations btrfs_dentry_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 14f8e1f..24b776c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } btrfs_update_iflags(inode); + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); @@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root, return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0, 0); + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->bytes_used = dev->bytes_used; di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) + if (dev->name) { strncpy(di_args->path, dev->name, sizeof(di_args->path)); - else + di_args->path[sizeof(di_args->path) - 1] = 0; + } else { di_args->path[0] = '\0'; + } out: if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) @@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* @@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } - user_dest = (struct btrfs_ioctl_space_info *) + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) @@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, + void __user *arg, int reset_after_read) +{ + struct btrfs_ioctl_get_dev_stats *sa; + int ret; + + if (reset_after_read && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_get_dev_stats(root, sa, reset_after_read); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, } } -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_balance(struct file *file, void __user *arg) { + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; @@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -3291,6 +3322,7 @@ out_bargs: out: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + mnt_drop_write(file->f_path.mnt); return ret; } @@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(root, NULL); + return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: @@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); case BTRFS_IOC_BALANCE_V2: - return btrfs_ioctl_balance(root, argp); + return btrfs_ioctl_balance(file, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(root, arg); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root, argp); + case BTRFS_IOC_GET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp, 0); + case BTRFS_IOC_GET_AND_RESET_DEV_STATS: + return btrfs_ioctl_get_dev_stats(root, argp, 1); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 086e6bd..497c530 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args { __u64 inodes; }; +enum btrfs_dev_stat_values { + /* disk I/O failure stats */ + BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ + BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not + * been written */ + + BTRFS_DEV_STAT_VALUES_MAX +}; + +struct btrfs_ioctl_get_dev_stats { + __u64 devid; /* in */ + __u64 nr_items; /* in/out */ + + /* out values: */ + __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; + + __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ + struct btrfs_ioctl_get_dev_stats) +#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_get_dev_stats) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bbf6d0d..9e138cd 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->len = len; entry->disk_len = disk_len; entry->bytes_left = len; - entry->inode = inode; + entry->inode = igrab(inode); entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, trace_btrfs_ordered_extent_add(inode, entry); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) ordered_data_tree_panic(inode, -EEXIST, file_offset); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_inode_tree *tree; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); } /* @@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode, */ int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size) + u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; int ret; + unsigned long flags; u64 dec_end; u64 dec_start; u64 to_dec; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); if (!node) { ret = 1; @@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, (unsigned long long)to_dec); } entry->bytes_left -= to_dec; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -332,7 +336,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -347,15 +351,21 @@ out: */ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size) + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; int ret; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); + if (cached && *cached) { + entry = *cached; + goto have_entry; + } + node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); +have_entry: if (!offset_in_entry(entry, file_offset)) { ret = 1; goto out; @@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, (unsigned long long)io_size); } entry->bytes_left -= io_size; + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); else @@ -383,7 +397,7 @@ out: *cached = entry; atomic_inc(&entry->refs); } - spin_unlock(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; } @@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + if (entry->inode) + btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); @@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree lock - * while you call this function. + * and waiters are woken up. */ -static void __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_unlock_irq(&tree->lock); spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&BTRFS_I(inode)->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * remove an ordered extent from the tree. No references are dropped - * but any waiters are woken. - */ -void btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - __btrfs_remove_ordered_extent(inode, entry); - spin_unlock(&tree->lock); wake_up(&entry->wait); } @@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } -again: + /* start IO across the range first to instantiate any delalloc * extents */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - /* The compression code will leave pages locked but return from - * writepage without setting the page writeback. Starting again - * with WB_SYNC_ALL will end up waiting for the IO to actually start. - */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); + filemap_write_and_wait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; @@ -657,11 +651,6 @@ again: break; end--; } - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_DELALLOC, 0, NULL)) { - schedule_timeout(1); - goto again; - } } /* @@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (entry) atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) { node = tree_search(tree, file_offset + len); @@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, out: if (entry) atomic_inc(&entry->refs); - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); atomic_inc(&entry->refs); out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); return entry; } @@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 disk_i_size; u64 new_i_size; u64 i_size_test; @@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -798,14 +786,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } /* - * we can't update the disk_isize if there are delalloc bytes - * between disk_i_size and this ordered extent - */ - if (test_range_bit(io_tree, disk_i_size, offset - 1, - EXTENT_DELALLOC, 0, NULL)) { - goto out; - } - /* * walk backward from this ordered extent to disk_i_size. * if we find an ordered extent then we can't update disk i_size * yet @@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, } node = prev; } - while (node) { + for (; node; node = rb_prev(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + /* We treat this entry as if it doesnt exist */ + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; if (test->file_offset + test->len <= disk_i_size) break; if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; - node = rb_prev(node); } new_i_size = min_t(u64, offset, i_size); @@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else node = rb_first(&tree->tree); } - i_size_test = 0; - if (node) { - /* - * do we have an area where IO might have finished - * between our ordered extent and the next one. - */ + + /* + * We are looking for an area between our current extent and the next + * ordered extent to update the i_size to. There are 3 cases here + * + * 1) We don't actually have anything and we can update to i_size. + * 2) We have stuff but they already did their i_size update so again we + * can just update to i_size. + * 3) We have an outstanding ordered extent so the most we can update + * our disk_i_size to is the start of the next offset. + */ + i_size_test = i_size; + for (; node; node = rb_next(node)) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > offset) + + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) + continue; + if (test->file_offset > offset) { i_size_test = test->file_offset; - } else { - i_size_test = i_size; + break; + } } /* * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents. As long as there - * are no delalloc bytes in this area, it is safe to update - * disk_i_size to the end of the region. + * extent where there are no ordered extents, we can safely set + * disk_i_size to this. */ - if (i_size_test > offset && - !test_range_bit(io_tree, offset, i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { + if (i_size_test > offset) new_i_size = min_t(u64, i_size_test, i_size); - } BTRFS_I(inode)->disk_i_size = new_i_size; ret = 0; out: /* - * we need to remove the ordered extent with the tree lock held - * so that other people calling this function don't find our fully - * processed ordered entry and skip updating the i_size + * We need to do this because we can't remove ordered extents until + * after the i_disk_size has been updated and then the inode has been + * updated to reflect the change, so we need to tell anybody who finds + * this ordered extent that we've already done all the real work, we + * just haven't completed all the other work. */ if (ordered) - __btrfs_remove_ordered_extent(inode, ordered); - spin_unlock(&tree->lock); - if (ordered) - wake_up(&ordered->wait); + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); + spin_unlock_irq(&tree->lock); return ret; } @@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, if (!ordered) return 1; - spin_lock(&tree->lock); + spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; @@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, } } out: - spin_unlock(&tree->lock); + spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); return ret; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c355ad4..e03c560 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -74,6 +74,12 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ +#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ + +#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent + * has done its due diligence in updating + * the isize. */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -113,6 +119,8 @@ struct btrfs_ordered_extent { /* a per root list of all the pending ordered extents */ struct list_head root_extent_list; + + struct btrfs_work work; }; @@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); + u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size); + u64 *file_offset, u64 io_size, + int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f38e452..5e23684 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + case BTRFS_DEV_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; }; } } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ac5d010..48a4882 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; struct btrfs_fs_info *fs_info; + int old_ioprio; rmw = container_of(work, struct reada_machine_work, work); fs_info = rmw->fs_info; kfree(rmw); + old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), + task_nice_ioprio(current)); + set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); + set_task_ioprio(current, old_ioprio); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2f3d6f9..a38cfa4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -50,7 +50,7 @@ struct scrub_dev; struct scrub_page { struct scrub_block *sblock; struct page *page; - struct block_device *bdev; + struct btrfs_device *dev; u64 flags; /* extent flags */ u64 generation; u64 logical; @@ -86,6 +86,7 @@ struct scrub_block { unsigned int header_error:1; unsigned int checksum_error:1; unsigned int no_io_error_seen:1; + unsigned int generation_error:1; /* also sets header_error */ }; }; @@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); @@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sdev->stat.read_errors++; sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); goto out; } @@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sdev->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("i/o error", sblock_to_check); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sdev->stat_lock); sdev->stat.csum_errors++; spin_unlock(&sdev->stat_lock); if (__ratelimit(&_rs)) scrub_print_warning("checksum error", sblock_to_check); + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); } else if (sblock_bad->header_error) { spin_lock(&sdev->stat_lock); sdev->stat.verify_errors++; @@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (__ratelimit(&_rs)) scrub_print_warning("checksum/header error", sblock_to_check); + if (sblock_bad->generation_error) + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); + else + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); } if (sdev->readonly) @@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev, page = sblock->pagev + page_index; page->logical = logical; page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, bdev is NULL */ - page->bdev = bbio->stripes[mirror_index].dev->bdev; + /* for missing devices, dev->bdev is NULL */ + page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; page->page = alloc_page(GFP_NOFS); if (!page->page) { @@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_page *page = sblock->pagev + page_num; DECLARE_COMPLETION_ONSTACK(complete); - if (page->bdev == NULL) { + if (page->dev->bdev == NULL) { page->io_error = 1; sblock->no_io_error_seen = 0; continue; @@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_bdev = page->bdev; + bio->bi_bdev = page->dev->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; @@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, h = (struct btrfs_header *)mapped_buffer; if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || - generation != le64_to_cpu(h->generation) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + BTRFS_UUID_SIZE)) { sblock->header_error = 1; + } else if (generation != le64_to_cpu(h->generation)) { + sblock->header_error = 1; + sblock->generation_error = 1; + } csum = h->csum; } else { if (!have_csum) @@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; - bio->bi_bdev = page_bad->bdev; + bio->bi_bdev = page_bad->dev->bdev; bio->bi_sector = page_bad->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; bio->bi_private = &complete; @@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, /* this will also unplug the queue */ wait_for_completion(&complete); + if (!bio_flagged(bio, BIO_UPTODATE)) { + btrfs_dev_stat_inc_and_print(page_bad->dev, + BTRFS_DEV_STAT_WRITE_ERRS); + bio_put(bio); + return -EIO; + } bio_put(bio); } @@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) u64 mapped_size; void *p; u32 crc = ~(u32)0; - int fail = 0; + int fail_gen = 0; + int fail_cor = 0; u64 len; int index; @@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock) memcpy(on_disk_csum, s->csum, sdev->csum_size); if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) - ++fail; + ++fail_cor; if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) - ++fail; + ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) - ++fail; + ++fail_cor; len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; @@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock) btrfs_csum_final(crc, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) - ++fail; + ++fail_cor; - if (fail) { + if (fail_cor + fail_gen) { /* * if we find an error in a super block, we just report it. * They will get written with the next transaction commit @@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock) spin_lock(&sdev->stat_lock); ++sdev->stat.super_errors; spin_unlock(&sdev->stat_lock); + if (fail_cor) + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS); + else + btrfs_dev_stat_inc_and_print(sdev->dev, + BTRFS_DEV_STAT_GENERATION_ERRS); } - return fail; + return fail_cor + fail_gen; } static void scrub_block_get(struct scrub_block *sblock) @@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, return -ENOMEM; } spage->sblock = sblock; - spage->bdev = sdev->dev->bdev; + spage->dev = sdev->dev; spage->flags = flags; spage->generation = gen; spage->logical = logical; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c5f8fca..96eb9fe 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) va_start(args, fmt); if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { - strncpy(lvl, fmt, 3); + memcpy(lvl, fmt, 3); + lvl[3] = '\0'; fmt += 3; type = logtypes[fmt[1] - '0']; } else @@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_thread_pool: intarg = 0; match_int(&args[0], &intarg); - if (intarg) { + if (intarg) info->thread_pool_size = intarg; - printk(KERN_INFO "btrfs: thread pool %d\n", - info->thread_pool_size); - } break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb, #ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - + sb->s_flags |= MS_I_VERSION; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk("btrfs: open_ctree failed\n"); @@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode) */ static char *setup_root_args(char *args) { - unsigned copied = 0; - unsigned len = strlen(args) + 2; - char *pos; - char *ret; + unsigned len = strlen(args) + 2 + 1; + char *src, *dst, *buf; /* - * We need the same args as before, but minus - * - * subvol=a - * - * and add - * - * subvolid=0 + * We need the same args as before, but with this substitution: + * s!subvol=[^,]+!subvolid=0! * - * which is a difference of 2 characters, so we allocate strlen(args) + - * 2 characters. + * Since the replacement string is up to 2 bytes longer than the + * original, allocate strlen(args) + 2 + 1 bytes. */ - ret = kzalloc(len * sizeof(char), GFP_NOFS); - if (!ret) - return NULL; - pos = strstr(args, "subvol="); + src = strstr(args, "subvol="); /* This shouldn't happen, but just in case.. */ - if (!pos) { - kfree(ret); + if (!src) + return NULL; + + buf = dst = kmalloc(len, GFP_NOFS); + if (!buf) return NULL; - } /* - * The subvol=<> arg is not at the front of the string, copy everybody - * up to that into ret. + * If the subvol= arg is not at the start of the string, + * copy whatever precedes it into buf. */ - if (pos != args) { - *pos = '\0'; - strcpy(ret, args); - copied += strlen(args); - pos++; + if (src != args) { + *src++ = '\0'; + strcpy(buf, args); + dst += strlen(args); } - strncpy(ret + copied, "subvolid=0", len - copied); - - /* Length of subvolid=0 */ - copied += 10; + strcpy(dst, "subvolid=0"); + dst += strlen("subvolid=0"); /* - * If there is no , after the subvol= option then we know there's no - * other options and we can just return. + * If there is a "," after the original subvol=... string, + * copy that suffix into our buffer. Otherwise, we're done. */ - pos = strchr(pos, ','); - if (!pos) - return ret; + src = strchr(src, ','); + if (src) + strcpy(dst, src); - /* Copy the rest of the arguments into our buffer */ - strncpy(ret + copied, pos, len - copied); - copied += strlen(pos); - - return ret; + return buf; } static struct dentry *mount_subvol(const char *subvol_name, int flags, @@ -1118,6 +1101,40 @@ error_fs_info: return ERR_PTR(error); } +static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) +{ + spin_lock_irq(&workers->lock); + workers->max_workers = new_limit; + spin_unlock_irq(&workers->lock); +} + +static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, + int new_pool_size, int old_pool_size) +{ + if (new_pool_size == old_pool_size) + return; + + fs_info->thread_pool_size = new_pool_size; + + printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", + old_pool_size, new_pool_size); + + btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); + btrfs_set_max_workers(&fs_info->workers, new_pool_size); + btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); + btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_resize_thread_pool(fs_info, + fs_info->thread_pool_size, old_thread_pool_size); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; @@ -1180,7 +1200,8 @@ restore: fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; fs_info->alloc_start = old_alloc_start; - fs_info->thread_pool_size = old_thread_pool_size; + btrfs_resize_thread_pool(fs_info, + old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3642225..1791c6e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "locking.h" #include "tree-log.h" #include "inode-map.h" +#include "volumes.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root) static noinline int join_transaction(struct btrfs_root *root, int nofail) { struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *fs_info = root->fs_info; - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - spin_unlock(&root->fs_info->trans_lock); + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&fs_info->trans_lock); return -EROFS; } - if (root->fs_info->trans_no_join) { + if (fs_info->trans_no_join) { if (!nofail) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return -EBUSY; } } - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; if (cur_trans) { if (cur_trans->aborted) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return 0; } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; - spin_lock(&root->fs_info->trans_lock); - if (root->fs_info->running_transaction) { + spin_lock(&fs_info->trans_lock); + if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the trans_no_join checks above */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; goto loop; } @@ -121,20 +123,38 @@ loop: cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; cur_trans->delayed_refs.seq = 1; + + /* + * although the tree mod log is per file system and not per transaction, + * the log must never go across transaction boundaries. + */ + smp_mb(); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " + "creating a fresh transaction\n"); + WARN_ON(1); + } + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { + printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " + "creating a fresh transaction\n"); + WARN_ON(1); + } + atomic_set(&fs_info->tree_mod_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping); - root->fs_info->generation++; - cur_trans->transid = root->fs_info->generation; - root->fs_info->running_transaction = cur_trans; + fs_info->btree_inode->i_mapping); + fs_info->generation++; + cur_trans->transid = fs_info->generation; + fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); return 0; } @@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_run_dev_stats(trans, root->fs_info); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index eb1ae90..2017d0f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen); + if (ret) + return ret; level = btrfs_header_level(eb); @@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); @@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen); + if (ret) { + free_extent_buffer(next); + return ret; + } WARN_ON(*level <= 0); if (path->nodes[*level-1]) @@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_release_path(path); } btrfs_release_path(path); + if (ret > 0) + ret = 0; return ret; } @@ -3028,21 +3040,6 @@ out: return ret; } -static int inode_in_log(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); - if (BTRFS_I(inode)->logged_trans == trans->transid && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; -} - - /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - if (inode_in_log(trans, inode)) { + if (btrfs_inode_in_log(inode, trans->transid)) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 12f5147..ab942f4 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -23,9 +23,9 @@ * * ulist = ulist_alloc(); * ulist_add(ulist, root); - * elem = NULL; + * ULIST_ITER_INIT(&uiter); * - * while ((elem = ulist_next(ulist, elem)) { + * while ((elem = ulist_next(ulist, &uiter)) { * for (all child nodes n in elem) * ulist_add(ulist, n); * do something useful with the node; @@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit); * * The allocated ulist will be returned in an initialized state. */ -struct ulist *ulist_alloc(unsigned long gfp_mask) +struct ulist *ulist_alloc(gfp_t gfp_mask) { struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); @@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free); * unaltered. */ int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) + gfp_t gfp_mask) +{ + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); +} + +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long *old_aux, gfp_t gfp_mask) { int i; for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) + if (ulist->nodes[i].val == val) { + if (old_aux) + *old_aux = ulist->nodes[i].aux; return 0; + } } if (ulist->nnodes >= ulist->nodes_alloced) { @@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add); /** * ulist_next - iterate ulist * @ulist: ulist to iterate - * @prev: previously returned element or %NULL to start iteration + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) * * Note: locking must be provided by the caller. In case of rwlocks only read * locking is needed * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the + * This function is used to iterate an ulist. + * It returns the next element from the ulist or %NULL when the * end is reached. No guarantee is made with respect to the order in which * the elements are returned. They might neither be returned in order of * addition nor in ascending order. * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) +struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { - int next; - if (ulist->nnodes == 0) return NULL; - - if (!prev) - return &ulist->nodes[0]; - - next = (prev - ulist->nodes) + 1; - if (next < 0 || next >= ulist->nnodes) + if (uiter->i < 0 || uiter->i >= ulist->nnodes) return NULL; - return &ulist->nodes[next]; + return &ulist->nodes[uiter->i++]; } EXPORT_SYMBOL(ulist_next); diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 2e25dec..21bdc8e 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -24,6 +24,10 @@ */ #define ULIST_SIZE 16 +struct ulist_iterator { + int i; +}; + /* * element of the list */ @@ -59,10 +63,15 @@ struct ulist { void ulist_init(struct ulist *ulist); void ulist_fini(struct ulist *ulist); void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); +struct ulist *ulist_alloc(gfp_t gfp_mask); void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); + gfp_t gfp_mask); +int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, + unsigned long *old_aux, gfp_t gfp_mask); +struct ulist_node *ulist_next(struct ulist *ulist, + struct ulist_iterator *uiter); + +#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1411b99..7782020 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> +#include <linux/ratelimit.h> #include <linux/kthread.h> #include <asm/div64.h> #include "compat.h" @@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->dev_stats_valid = 0; device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); @@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) int ret = 0; if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) - return -EINVAL; + return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); @@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } +static void *merge_stripe_index_into_bio_private(void *bi_private, + unsigned int stripe_index) +{ + /* + * with single, dup, RAID0, RAID1 and RAID10, stripe_index is + * at most 1. + * The alternative solution (instead of stealing bits from the + * pointer) would be to allocate an intermediate structure + * that contains the old private pointer plus the stripe_index. + */ + BUG_ON((((uintptr_t)bi_private) & 3) != 0); + BUG_ON(stripe_index > 3); + return (void *)(((uintptr_t)bi_private) | stripe_index); +} + +static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) +{ + return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); +} + +static unsigned int extract_stripe_index_from_bio_private(void *bi_private) +{ + return (unsigned int)((uintptr_t)bi_private) & 3; +} + static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); int is_orig_bio = 0; - if (err) + if (err) { atomic_inc(&bbio->error); + if (err == -EIO || err == -EREMOTEIO) { + unsigned int stripe_index = + extract_stripe_index_from_bio_private( + bio->bi_private); + struct btrfs_device *dev; + + BUG_ON(stripe_index >= bbio->num_stripes); + dev = bbio->stripes[stripe_index].dev; + if (bio->bi_rw & WRITE) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_WRITE_ERRS); + else + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_READ_ERRS); + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + btrfs_dev_stat_inc(dev, + BTRFS_DEV_STAT_FLUSH_ERRS); + btrfs_dev_stat_print_on_error(dev); + } + } if (bio == bbio->orig_bio) is_orig_bio = 1; @@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; } bio->bi_private = bbio; + bio->bi_private = merge_stripe_index_into_bio_private( + bio->bi_private, (unsigned int)dev_nr); bio->bi_end_io = btrfs_end_bio; bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; @@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root) return ret; } +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int ret; + u64 map_length = 0; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *device; + + BUG_ON(mirror_num == 0); + ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, + mirror_num); + if (ret) { + BUG_ON(bbio != NULL); + return NULL; + } + BUG_ON(mirror_num != bbio->mirror_num); + device = bbio->stripes[mirror_num - 1].dev; + kfree(bbio); + return device; +} + int btrfs_read_chunk_tree(struct btrfs_root *root) { struct btrfs_path *path; @@ -4583,3 +4656,230 @@ error: btrfs_free_path(path); return ret; } + +static void __btrfs_reset_dev_stats(struct btrfs_device *dev) +{ + int i; + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_reset(dev, i); +} + +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + int i; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + int item_size; + struct btrfs_dev_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + device->name, (unsigned long long)device->devid); + __btrfs_reset_dev_stats(device); + device->dev_stats_valid = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_reset(device, i); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static int update_dev_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_dev_stats_item *ptr; + int ret; + int i; + + key.objectid = 0; + key.type = BTRFS_DEV_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); + if (ret < 0) { + printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + ret, device->name); + goto out; + } + + if (ret == 0 && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (ret != 0) { + printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + ret = 1; + } + + if (ret == 1) { + /* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_set_dev_stats_value(eb, ptr, i, + btrfs_dev_stat_read(device, i)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. + */ +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->dev_stats_valid || !device->dev_stats_dirty) + continue; + + ret = update_dev_stat_item(trans, dev_root, device); + if (!ret) + device->dev_stats_dirty = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) +{ + btrfs_dev_stat_inc(dev, index); + btrfs_dev_stat_print_on_error(dev); +} + +void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +{ + if (!dev->dev_stats_valid) + return; + printk_ratelimited(KERN_ERR + "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + dev->name, + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, + BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, + BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) +{ + printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + dev->name, + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); +} + +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats, + int reset_after_read) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int i; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + printk(KERN_WARNING + "btrfs: get dev_stats failed, device not found\n"); + return -ENODEV; + } else if (!dev->dev_stats_valid) { + printk(KERN_WARNING + "btrfs: get dev_stats failed, not yet valid\n"); + return -ENODEV; + } else if (reset_after_read) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (stats->nr_items > i) + stats->values[i] = + btrfs_dev_stat_read_and_reset(dev, i); + else + btrfs_dev_stat_reset(dev, i); + } + } else { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + if (stats->nr_items > i) + stats->values[i] = btrfs_dev_stat_read(dev, i); + } + if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bb6b03f..3406a88 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -22,6 +22,7 @@ #include <linux/bio.h> #include <linux/sort.h> #include "async-thread.h" +#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -106,6 +107,11 @@ struct btrfs_device { struct completion flush_wait; int nobarriers; + /* disk I/O failure stats. For detailed description refer to + * enum btrfs_dev_stat_values in ioctl.h */ + int dev_stats_valid; + int dev_stats_dirty; /* counters need to be written to disk */ + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; struct btrfs_fs_devices { @@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num); +void btrfs_dev_stat_print_on_error(struct btrfs_device *device); +void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); +int btrfs_get_dev_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_dev_stats *stats, + int reset_after_read); +int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); + +static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, + int index) +{ + atomic_inc(dev->dev_stat_values + index); + dev->dev_stats_dirty = 1; +} + +static inline int btrfs_dev_stat_read(struct btrfs_device *dev, + int index) +{ + return atomic_read(dev->dev_stat_values + index); +} + +static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, + int index) +{ + int ret; + + ret = atomic_xchg(dev->dev_stat_values + index, 0); + dev->dev_stats_dirty = 1; + return ret; +} + +static inline void btrfs_dev_stat_set(struct btrfs_device *dev, + int index, unsigned long val) +{ + atomic_set(dev->dev_stat_values + index, val); + dev->dev_stats_dirty = 1; +} + +static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, + int index) +{ + btrfs_dev_stat_set(dev, index, 0); +} #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e7a5659..3f4e2d6 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (ret) goto out; + inode_inc_iversion(inode); inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); diff --git a/fs/buffer.c b/fs/buffer.c index ad5938c..838a9cf 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3152,7 +3152,7 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data) /* * Buffer-head allocation */ -static struct kmem_cache *bh_cachep; +static struct kmem_cache *bh_cachep __read_mostly; /* * Once the number of bh's in the machine exceeds this level, we start diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fbb2a64..8e1b60e 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -40,38 +40,49 @@ struct ceph_nfs_confh { u32 parent_name_hash; } __attribute__ ((packed)); -static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, - int connectable) +/* + * The presence of @parent_inode here tells us whether NFS wants a + * connectable file handle. However, we want to make a connectionable + * file handle unconditionally so that the MDS gets as much of a hint + * as possible. That means we only use @parent_dentry to indicate + * whether nfsd wants a connectable fh, and whether we should indicate + * failure from a too-small @max_len. + */ +static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) { int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent; - struct inode *inode = dentry->d_inode; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; + struct dentry *dentry = d_find_alias(inode); + struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; - if (*max_len >= connected_handle_length) { + /* if we found an alias, generate a connectable fh */ + if (*max_len >= connected_handle_length && dentry) { dout("encode_fh %p connectable\n", dentry); - cfh->ino = ceph_ino(dentry->d_inode); + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + cfh->ino = ceph_ino(inode); cfh->parent_ino = ceph_ino(parent->d_inode); cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, dentry); *max_len = connected_handle_length; type = 2; + spin_unlock(&dentry->d_lock); } else if (*max_len >= handle_length) { - if (connectable) { + if (parent_inode) { + /* nfsd wants connectable */ *max_len = connected_handle_length; type = 255; } else { dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); + fh->ino = ceph_ino(inode); *max_len = handle_length; type = 1; } @@ -79,7 +90,6 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, *max_len = handle_length; type = 255; } - spin_unlock(&dentry->d_lock); return type; } diff --git a/fs/compat.c b/fs/compat.c index 3adf3d4..6161255 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -871,12 +871,12 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, { int error; struct file *file; + int fput_needed; struct compat_readdir_callback buf; - error = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) - goto out; + return -EBADF; buf.result = 0; buf.dirent = dirent; @@ -885,8 +885,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, if (buf.result) error = buf.result; - fput(file); -out: + fput_light(file, fput_needed); return error; } @@ -953,16 +952,15 @@ asmlinkage long compat_sys_getdents(unsigned int fd, struct file * file; struct compat_linux_dirent __user * lastdirent; struct compat_getdents_callback buf; + int fput_needed; int error; - error = -EFAULT; if (!access_ok(VERIFY_WRITE, dirent, count)) - goto out; + return -EFAULT; - error = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) - goto out; + return -EBADF; buf.current_dir = dirent; buf.previous = NULL; @@ -979,8 +977,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, else error = count - buf.count; } - fput(file); -out: + fput_light(file, fput_needed); return error; } @@ -1041,16 +1038,15 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, struct file * file; struct linux_dirent64 __user * lastdirent; struct compat_getdents_callback64 buf; + int fput_needed; int error; - error = -EFAULT; if (!access_ok(VERIFY_WRITE, dirent, count)) - goto out; + return -EFAULT; - error = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) - goto out; + return -EBADF; buf.current_dir = dirent; buf.previous = NULL; @@ -1068,8 +1064,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, else error = count - buf.count; } - fput(file); -out: + fput_light(file, fput_needed); return error; } #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ diff --git a/fs/dcache.c b/fs/dcache.c index 4435d8b..85c9e2b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -683,8 +683,6 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question - * @want_discon: flag, used by d_splice_alias, to request - * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. @@ -693,10 +691,9 @@ EXPORT_SYMBOL(dget_parent); * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one unless @want_discon is set, - * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. + * any other hashed alias over that. */ -static struct dentry *__d_find_alias(struct inode *inode, int want_discon) +static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias, *discon_alias; @@ -708,7 +705,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else if (!want_discon) { + } else { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -739,7 +736,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!list_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode, 0); + de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; @@ -1650,9 +1647,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); - new = __d_find_alias(inode, 1); + new = __d_find_any_alias(inode); if (new) { - BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); d_move(new, dentry); @@ -2482,7 +2478,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode, 0); + alias = __d_find_alias(inode); if (alias) { actual = alias; write_seqlock(&rename_lock); @@ -2575,7 +2571,7 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2606,7 +2602,7 @@ static int prepend_path(const struct path *path, error = prepend(buffer, buflen, "/", 1); out: - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return error; global_root: diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ab35b11..a07441a 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -660,11 +660,10 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; - size_t lower_bufsiz = PATH_MAX; mm_segment_t old_fs; int rc; - lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); + lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!lower_buf) { rc = -ENOMEM; goto out; @@ -673,58 +672,29 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, - lower_bufsiz); + PATH_MAX); set_fs(old_fs); if (rc < 0) goto out; - lower_bufsiz = rc; rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, - lower_buf, lower_bufsiz); + lower_buf, rc); out: kfree(lower_buf); return rc; } -static int -ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *kbuf; - size_t kbufsiz, copied; + char *buf; + size_t len = PATH_MAX; int rc; - rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz); + rc = ecryptfs_readlink_lower(dentry, &buf, &len); if (rc) goto out; - copied = min_t(size_t, bufsiz, kbufsiz); - rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied; - kfree(kbuf); fsstack_copy_attr_atime(dentry->d_inode, ecryptfs_dentry_to_lower(dentry)->d_inode); -out: - return rc; -} - -static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *buf; - int len = PAGE_SIZE, rc; - mm_segment_t old_fs; - - /* Released in ecryptfs_put_link(); only release here on error */ - buf = kmalloc(len, GFP_KERNEL); - if (!buf) { - buf = ERR_PTR(-ENOMEM); - goto out; - } - old_fs = get_fs(); - set_fs(get_ds()); - rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); - set_fs(old_fs); - if (rc < 0) { - kfree(buf); - buf = ERR_PTR(rc); - } else - buf[rc] = '\0'; + buf[len] = '\0'; out: nd_set_link(nd, buf); return NULL; @@ -1153,7 +1123,7 @@ out: } const struct inode_operations ecryptfs_symlink_iops = { - .readlink = ecryptfs_readlink, + .readlink = generic_readlink, .follow_link = ecryptfs_follow_link, .put_link = ecryptfs_put_link, .permission = ecryptfs_permission, @@ -280,10 +280,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); - err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); - if (err) - goto err; - err = insert_vm_struct(mm, vma); if (err) goto err; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b05acb7..b0201ca 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -304,24 +304,23 @@ out: /** * export_encode_fh - default export_operations->encode_fh function - * @dentry: the dentry to encode + * @inode: the object to encode * @fh: where to store the file handle fragment * @max_len: maximum length to store there - * @connectable: whether to store parent information + * @parent: parent directory inode, if wanted * * This default encode_fh function assumes that the 32 inode number * is suitable for locating an inode, and that the generation number * can be used to check that it is still valid. It places them in the * filehandle fragment where export_decode_fh expects to find them. */ -static int export_encode_fh(struct dentry *dentry, struct fid *fid, - int *max_len, int connectable) +static int export_encode_fh(struct inode *inode, struct fid *fid, + int *max_len, struct inode *parent) { - struct inode * inode = dentry->d_inode; int len = *max_len; int type = FILEID_INO32_GEN; - if (connectable && (len < 4)) { + if (parent && (len < 4)) { *max_len = 4; return 255; } else if (len < 2) { @@ -332,14 +331,9 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid, len = 2; fid->i32.ino = inode->i_ino; fid->i32.gen = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; - - spin_lock(&dentry->d_lock); - parent = dentry->d_parent->d_inode; + if (parent) { fid->i32.parent_ino = parent->i_ino; fid->i32.parent_gen = parent->i_generation; - spin_unlock(&dentry->d_lock); len = 4; type = FILEID_INO32_GEN_PARENT; } @@ -352,11 +346,22 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, { const struct export_operations *nop = dentry->d_sb->s_export_op; int error; + struct dentry *p = NULL; + struct inode *inode = dentry->d_inode, *parent = NULL; + if (connectable && !S_ISDIR(inode->i_mode)) { + p = dget_parent(dentry); + /* + * note that while p might've ceased to be our parent already, + * it's still pinned by and still positive. + */ + parent = p->d_inode; + } if (nop->encode_fh) - error = nop->encode_fh(dentry, fid->raw, max_len, connectable); + error = nop->encode_fh(inode, fid->raw, max_len, parent); else - error = export_encode_fh(dentry, fid, max_len, connectable); + error = export_encode_fh(inode, fid, max_len, parent); + dput(p); return error; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9ed1bb1..c22f170 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,6 +2,8 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 + select CRYPTO + select CRYPTO_CRC32C help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c45c411..99b6324 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -168,12 +168,14 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); return; } memset(bh->b_data, 0, sb->s_blocksize); @@ -210,6 +212,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); } /* Return the number of free blocks in a block group. It is used when @@ -276,9 +281,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, } static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) { ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; @@ -325,6 +330,23 @@ err_out: block_group, bitmap_blk); return 0; } + +void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + if (buffer_verified(bh)) + return; + + ext4_lock_group(sb, block_group); + if (ext4_valid_block_bitmap(sb, desc, block_group, bh) && + ext4_block_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8)) + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); +} + /** * ext4_read_block_bitmap() * @sb: super block @@ -355,12 +377,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -379,7 +401,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* * submit the buffer_head for reading @@ -390,6 +412,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) get_bh(bh); submit_bh(READ, bh); return bh; +verify: + ext4_validate_block_bitmap(sb, desc, block_group, bh); + return bh; } /* Returns 0 on success, 1 on error */ @@ -412,7 +437,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ - ext4_valid_block_bitmap(sb, desc, block_group, bh); + ext4_validate_block_bitmap(sb, desc, block_group, bh); return 0; } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index fa3af81..b319721 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -29,3 +29,86 @@ unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) #endif /* EXT4FS_DEBUG */ +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} + +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + if (provided == calculated) + return 1; + + ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group); + return 0; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b867862..aa39e60 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -179,6 +179,18 @@ static int ext4_readdir(struct file *filp, continue; } + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_FILE(filp, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + set_buffer_verified(bh); + revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c21b1de5..cfc4e01 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,6 +29,7 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> +#include <crypto/hash.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -298,7 +299,9 @@ struct ext4_group_desc __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ @@ -308,9 +311,19 @@ struct ext4_group_desc __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ - __u32 bg_reserved2[3]; + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; }; +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + /* * Structure of a flex block group info */ @@ -650,7 +663,8 @@ struct ext4_inode { __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ @@ -666,7 +680,7 @@ struct ext4_inode { } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; - __le16 i_pad1; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ @@ -768,7 +782,7 @@ do { \ #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 +#define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) @@ -908,6 +922,9 @@ struct ext4_inode_info { */ tid_t i_sync_tid; tid_t i_datasync_tid; + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; }; /* @@ -1001,6 +1018,9 @@ extern void ext4_set_bits(void *bm, int cur, int len); #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + /* * Structure of the super block */ @@ -1087,7 +1107,7 @@ struct ext4_super_block { __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad; + __u8 s_checksum_type; /* metadata checksum algorithm used */ __le16 s_reserved_pad; __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ @@ -1113,7 +1133,8 @@ struct ext4_super_block { __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ - __le32 s_reserved[109]; /* Padding to the end of the block */ + __le32 s_reserved[108]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1176,6 +1197,7 @@ struct ext4_sb_info { struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; + struct super_block *s_sb; /* Journaling */ struct journal_s *s_journal; @@ -1266,6 +1288,12 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. */ atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1414,6 +1442,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. + */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 @@ -1461,7 +1495,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ - EXT4_FEATURE_RO_COMPAT_BIGALLOC) + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) /* * Default values for user and/or group using reserved blocks @@ -1527,6 +1562,18 @@ struct ext4_dir_entry_2 { }; /* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +/* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ @@ -1541,6 +1588,8 @@ struct ext4_dir_entry_2 { #define EXT4_FT_MAX 8 +#define EXT4_FT_DIR_CSUM 0xDE + /* * EXT4_DIR_PAD defines the directory entries boundaries * @@ -1609,6 +1658,25 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(sbi->s_chksum_driver)]; + } desc; + int err; + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + #ifdef __KERNEL__ /* hash info structure used by the directory hash */ @@ -1741,7 +1809,8 @@ struct mmp_struct { __le16 mmp_check_interval; __le16 mmp_pad1; - __le32 mmp_pad2[227]; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ @@ -1784,8 +1853,24 @@ struct mmpd_data { /* bitmap.c */ extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); /* balloc.c */ +extern void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, @@ -1864,7 +1949,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; -extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); @@ -1936,6 +2021,8 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); extern int ext4_ext_migrate(struct inode *); /* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, @@ -1950,6 +2037,10 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb, + struct ext4_super_block *es); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); @@ -2025,10 +2116,17 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM | + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); +} static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -2225,6 +2323,9 @@ static inline void ext4_unlock_group(struct super_block *sb, static inline void ext4_mark_super_dirty(struct super_block *sb) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + ext4_superblock_csum_set(sb, es); if (EXT4_SB(sb)->s_journal == NULL) sb->s_dirt =1; } @@ -2314,6 +2415,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); +extern int ext4_mmp_csum_verify(struct super_block *sb, + struct mmp_struct *mmp); /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 0f58b86..cb1b2c9 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -63,9 +63,22 @@ * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. + * For non-inode extent blocks, ext4_extent_tail + * follows the array. */ /* + * This is the extent tail on-disk structure. + * All other extent structures are 12 bytes long. It turns out that + * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which + * covers all valid ext4 block sizes. Therefore, this tail structure can be + * crammed into the end of the block without having to rebalance the tree. + */ +struct ext4_extent_tail { + __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ +}; + +/* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ @@ -101,6 +114,17 @@ struct ext4_extent_header { #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) +#define EXT4_EXTENT_TAIL_OFFSET(hdr) \ + (sizeof(struct ext4_extent_header) + \ + (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) + +static inline struct ext4_extent_tail * +find_ext4_extent_tail(struct ext4_extent_header *eh) +{ + return (struct ext4_extent_tail *)(((void *)eh) + + EXT4_EXTENT_TAIL_OFFSET(eh)); +} + /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index aca1790..90f7c2e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -138,16 +138,23 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, } int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb) + handle_t *handle, struct super_block *sb, + int now) { struct buffer_head *bh = EXT4_SB(sb)->s_sbh; int err = 0; if (ext4_handle_valid(handle)) { + ext4_superblock_csum_set(sb, + (struct ext4_super_block *)bh->b_data); err = jbd2_journal_dirty_metadata(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + } else if (now) { + ext4_superblock_csum_set(sb, + (struct ext4_super_block *)bh->b_data); + mark_buffer_dirty(bh); } else sb->s_dirt = 1; return err; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 83b20fc..f440e8f1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -213,7 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, struct buffer_head *bh); int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb); + handle_t *handle, struct super_block *sb, + int now); #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) @@ -225,8 +226,10 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) +#define ext4_handle_dirty_super_now(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1) #define ext4_handle_dirty_super(handle, sb) \ - __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index abcdeab..91341ec 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -52,6 +52,46 @@ #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +static __le32 ext4_extent_block_csum(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + EXT4_EXTENT_TAIL_OFFSET(eh)); + return cpu_to_le32(csum); +} + +static int ext4_extent_block_csum_verify(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + et = find_ext4_extent_tail(eh); + if (et->et_checksum != ext4_extent_block_csum(inode, eh)) + return 0; + return 1; +} + +static void ext4_extent_block_csum_set(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + et = find_ext4_extent_tail(eh); + et->et_checksum = ext4_extent_block_csum(inode, eh); +} + static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, @@ -117,6 +157,7 @@ static int __ext4_ext_dirty(const char *where, unsigned int line, { int err; if (path->p_bh) { + ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); @@ -391,6 +432,12 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + /* Verify checksum on non-root extent tree nodes */ + if (ext_depth(inode) != depth && + !ext4_extent_block_csum_verify(inode, eh)) { + error_msg = "extent tree corrupted"; + goto corrupted; + } return 0; corrupted: @@ -412,6 +459,26 @@ int ext4_ext_check_inode(struct inode *inode) return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); } +static int __ext4_ext_check_block(const char *function, unsigned int line, + struct inode *inode, + struct ext4_extent_header *eh, + int depth, + struct buffer_head *bh) +{ + int ret; + + if (buffer_verified(bh)) + return 0; + ret = ext4_ext_check(inode, eh, depth); + if (ret) + return ret; + set_buffer_verified(bh); + return ret; +} + +#define ext4_ext_check_block(inode, eh, depth, bh) \ + __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) + #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { @@ -536,7 +603,7 @@ ext4_ext_binsearch_idx(struct inode *inode, } path->p_idx = l - 1; - ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH @@ -668,8 +735,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { - int need_to_validate = 0; - ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -688,8 +753,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, put_bh(bh); goto err; } - /* validate the extent entries */ - need_to_validate = 1; } eh = ext_block_hdr(bh); ppos++; @@ -703,7 +766,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (need_to_validate && ext4_ext_check(inode, eh, i)) + if (ext4_ext_check_block(inode, eh, i, bh)) goto err; } @@ -914,6 +977,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -992,6 +1056,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1089,6 +1154,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1344,7 +1410,8 @@ got_index: return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_block(inode, eh, + path->p_depth - depth, bh)) { put_bh(bh); return -EIO; } @@ -1357,7 +1424,7 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) { put_bh(bh); return -EIO; } @@ -2644,8 +2711,8 @@ cont: err = -EIO; break; } - if (ext4_ext_check(inode, ext_block_hdr(bh), - depth - i - 1)) { + if (ext4_ext_check_block(inode, ext_block_hdr(bh), + depth - i - 1, bh)) { err = -EIO; break; } @@ -4722,8 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) /* Now release the pages */ if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(mapping, first_page_offset, - last_page_offset-1); + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); } /* finish any pending end_io work */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cb70f18..8c7642a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int unaligned_aio = 0; - int ret; + ssize_t ret; /* * If we have encountered a bitmap-format file, the size limit diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9f9acac..d48e8b1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,24 +70,27 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); return EXT4_INODES_PER_GROUP(sb); } @@ -128,12 +131,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return NULL; } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); @@ -141,6 +144,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; @@ -154,7 +158,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* * submit the buffer_head for reading @@ -171,6 +175,20 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } + +verify: + ext4_lock_group(sb, block_group); + if (!buffer_verified(bh) && + !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + put_bh(bh); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, bitmap_blk); + return NULL; + } + ext4_unlock_group(sb, block_group); + set_buffer_verified(bh); return bh; } @@ -276,7 +294,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_used_dirs_set(sb, gdp, count); percpu_counter_dec(&sbi->s_dirs_counter); } - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); @@ -488,10 +508,12 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - grp_free = ext4_free_inodes_count(sb, desc); - if (desc && grp_free && grp_free >= avefreei) { - *group = grp; - return 0; + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } } } @@ -709,7 +731,7 @@ repeat_in_this_group: got: /* We may have to initialize the block bitmap if it isn't already */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; @@ -731,8 +753,11 @@ got: gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, - gdp); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / + 8); + ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -751,7 +776,7 @@ got: goto fail; /* Update the relevant bg descriptor fields */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -772,7 +797,10 @@ got: ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); } + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); @@ -782,10 +810,12 @@ got: atomic_inc(&sbi->s_flex_groups[f].used_dirs); } } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - ext4_unlock_group(sb, group); + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, group, gdp); } + ext4_unlock_group(sb, group); BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); @@ -850,6 +880,19 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); @@ -1140,7 +1183,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07eaf56..02bc8cb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,73 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u16 csum_lo; + __u16 csum_hi = 0; + __u32 csum; + + csum_lo = raw->i_checksum_lo; + raw->i_checksum_lo = 0; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum_hi = raw->i_checksum_hi; + raw->i_checksum_hi = 0; + } + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, + EXT4_INODE_SIZE(inode->i_sb)); + + raw->i_checksum_lo = csum_lo; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = csum_hi; + + return csum; +} + +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 provided, calculated; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(raw->i_checksum_lo); + calculated = ext4_inode_csum(inode, raw, ei); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; + else + calculated &= 0xFFFF; + + return provided == calculated; +} + +static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 csum; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_inode_csum(inode, raw, ei); + raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum >> 16); +} + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -3517,8 +3584,7 @@ make_io: b = table; end = b + EXT4_SB(sb)->s_inode_readahead_blks; num = EXT4_INODES_PER_GROUP(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) @@ -3646,6 +3712,39 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + ret = -EIO; + goto bad_inode; + } + } else + ei->i_extra_isize = 0; + + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = raw_inode->i_generation; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + EXT4_ERROR_INODE(inode, "checksum invalid"); + ret = -EIO; + goto bad_inode; + } + inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); @@ -3725,12 +3824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - ret = -EIO; - goto bad_inode; - } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext4_inode) - @@ -3742,8 +3835,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_set_inode_state(inode, EXT4_STATE_XATTR); } - } else - ei->i_extra_isize = 0; + } EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); @@ -3942,7 +4034,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_super_now(handle, sb); } } raw_inode->i_generation = cpu_to_le32(inode->i_generation); @@ -3969,6 +4061,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } + ext4_inode_csum_set(inode, raw_inode, ei); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) @@ -4213,7 +4307,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * will return the blocks that include the delayed allocation * blocks for this file. */ - delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), + EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; return 0; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eee255..8ad112a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) handle_t *handle = NULL; int err, migrate = 0; struct ext4_iloc iloc; - unsigned int oldflags; + unsigned int oldflags, mask, i; unsigned int jflag; if (!inode_owner_or_capable(inode)) @@ -115,8 +115,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) goto flags_err; - flags = flags & EXT4_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } ei->i_flags = flags; ext4_set_inode_flags(inode); @@ -152,6 +158,13 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + err = mnt_want_write_file(filp); if (err) return err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 99ab428..1cd6994 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -788,7 +788,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int first_block; struct super_block *sb; struct buffer_head *bhs; - struct buffer_head **bh; + struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; @@ -2375,7 +2375,7 @@ static int ext4_groupinfo_create_slab(size_t size) return 0; } -int ext4_mb_init(struct super_block *sb, int needs_recovery) +int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; @@ -2517,6 +2517,9 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2564,8 +2567,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -2797,7 +2798,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); @@ -3071,13 +3074,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; - int len; - - if (pa && pa->pa_type == MB_INODE_PA) { - len = ac->ac_b_ex.fe_len; - pa->pa_free += len; - } + if (pa && pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; } /* @@ -4636,6 +4635,7 @@ do_more: */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { + ext4_mb_unload_buddy(&e4b); err = -ENOMEM; goto error_return; } @@ -4659,7 +4659,9 @@ do_more: ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); @@ -4803,7 +4805,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, mb_free_blocks(NULL, &e4b, bit, count); blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, EXT4_B2C(sbi, blocks_freed)); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index ed6548d..f99a131 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -6,12 +6,45 @@ #include "ext4.h" +/* Checksumming functions */ +static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct mmp_struct, mmp_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + + return cpu_to_le32(csum); +} + +int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); +} + +void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); +} + /* * Write the MMP block using WRITE_SYNC to try to get the block on-disk * faster. */ -static int write_mmp_block(struct buffer_head *bh) +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { + struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + + ext4_mmp_csum_set(sb, mmp); mark_buffer_dirty(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; @@ -59,7 +92,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || + !ext4_mmp_csum_verify(sb, mmp)) return -EINVAL; return 0; @@ -120,7 +154,7 @@ static int kmmpd(void *data) mmp->mmp_time = cpu_to_le64(get_seconds()); last_update_time = jiffies; - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); /* * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. @@ -200,7 +234,7 @@ static int kmmpd(void *data) mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); mmp->mmp_time = cpu_to_le64(get_seconds()); - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); failed: kfree(data); @@ -299,7 +333,7 @@ skip: seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); if (retval) goto failed; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e2a3f4b..5845cd9 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -145,6 +145,14 @@ struct dx_map_entry u16 size; }; +/* + * This goes at the end of each htree block. + */ +struct dx_tail { + u32 dt_reserved; + __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); static inline unsigned dx_get_hash(struct dx_entry *entry); @@ -180,6 +188,230 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +/* checksumming functions */ +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) +{ + memset(t, 0, sizeof(struct ext4_dir_entry_tail)); + t->det_rec_len = ext4_rec_len_to_disk( + sizeof(struct ext4_dir_entry_tail), blocksize); + t->det_reserved_ft = EXT4_FT_DIR_CSUM; +} + +/* Walk through a dirent block to find a checksum "dirent" at the tail */ +static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, + struct ext4_dir_entry *de) +{ + struct ext4_dir_entry_tail *t; + +#ifdef PARANOID + struct ext4_dir_entry *d, *top; + + d = de; + top = (struct ext4_dir_entry *)(((void *)de) + + (EXT4_BLOCK_SIZE(inode->i_sb) - + sizeof(struct ext4_dir_entry_tail))); + while (d < top && d->rec_len) + d = (struct ext4_dir_entry *)(((void *)d) + + le16_to_cpu(d->rec_len)); + + if (d != top) + return NULL; + + t = (struct ext4_dir_entry_tail *)d; +#else + t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb)); +#endif + + if (t->det_reserved_zero1 || + le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + t->det_reserved_zero2 || + t->det_reserved_ft != EXT4_FT_DIR_CSUM) + return NULL; + + return t; +} + +static __le32 ext4_dirent_csum(struct inode *inode, + struct ext4_dir_entry *dirent, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + return cpu_to_le32(csum); +} + +int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + t = get_dirent_tail(inode, dirent); + if (!t) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); + return 0; + } + + if (t->det_checksum != ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent)) + return 0; + + return 1; +} + +static void ext4_dirent_csum_set(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + t = get_dirent_tail(inode, dirent); + if (!t) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); + return; + } + + t->det_checksum = ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent); +} + +static inline int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + struct ext4_dir_entry *dirent, + int *offset) +{ + struct ext4_dir_entry *dp; + struct dx_root_info *root; + int count_offset; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; + else if (le16_to_cpu(dirent->rec_len) == 12) { + dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); + if (le16_to_cpu(dp->rec_len) != + EXT4_BLOCK_SIZE(inode->i_sb) - 12) + return NULL; + root = (struct dx_root_info *)(((void *)dp + 12)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; + count_offset = 32; + } else + return NULL; + + if (offset) + *offset = count_offset; + return (struct dx_countlimit *)(((void *)dirent) + count_offset); +} + +static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, + int count_offset, int count, struct dx_tail *t) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum, old_csum; + int size; + + size = count_offset + (count * sizeof(struct dx_entry)); + old_csum = t->dt_checksum; + t->dt_checksum = 0; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); + t->dt_checksum = old_csum; + + return cpu_to_le32(csum); +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return 1; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum found. Run e2fsck -D."); + return 1; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, + count, t)) + return 0; + return 1; +} + +static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum. Run e2fsck -D."); + return; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); +} + +static inline int ext4_handle_dirty_dx_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + /* * p is at least 6 bytes before the end of page */ @@ -239,12 +471,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -390,6 +630,15 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } + if (!buffer_verified(bh) && + !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { + ext4_warning(dir->i_sb, "Root failed checksum"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + set_buffer_verified(bh); + entries = (struct dx_entry *) (((char *)&root->info) + root->info.info_length); @@ -450,6 +699,17 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; + + if (!buffer_verified(bh) && + !ext4_dx_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + ext4_warning(dir->i_sb, "Node failed checksum"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + set_buffer_verified(bh); + if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, "dx entry: limit != node limit"); @@ -549,6 +809,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 0, &err))) return err; /* Failure */ + + if (!buffer_verified(bh) && + !ext4_dx_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + ext4_warning(dir->i_sb, "Node failed checksum"); + return -EIO; + } + set_buffer_verified(bh); + p++; brelse(p->bh); p->bh = bh; @@ -577,6 +846,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) return err; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) + return -EIO; + set_buffer_verified(bh); + de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - @@ -936,6 +1210,15 @@ restart: brelse(bh); goto next; } + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + goto next; + } + set_buffer_verified(bh); i = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { @@ -987,6 +1270,16 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (!(bh = ext4_bread(NULL, dir, block, 0, err))) goto errout; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + *err = -EIO; + goto errout; + } + set_buffer_verified(bh); retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); @@ -1037,6 +1330,12 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EIO); } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", + dentry->d_name.len, + dentry->d_name.name); + return ERR_PTR(-EIO); + } inode = ext4_iget(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, @@ -1156,8 +1455,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; + struct ext4_dir_entry_tail *t; + int csum_size = 0; int err = 0, i; + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) { brelse(*bh); @@ -1204,10 +1509,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, blocksize); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de2, blocksize); + if (csum_size) { + t = EXT4_DIRENT_TAIL(data2, blocksize); + initialize_dirent_tail(t, blocksize); + + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1218,10 +1533,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, de = de2; } dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_handle_dirty_metadata(handle, dir, bh2); + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (err) goto journal_error; - err = ext4_handle_dirty_metadata(handle, dir, frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); @@ -1258,11 +1573,16 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, unsigned short reclen; int nlen, rlen, err; char *top; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; + top = bh->b_data + (blocksize - csum_size) - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -EIO; @@ -1295,11 +1615,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = de1; } de->file_type = EXT4_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); de->name_len = namelen; memcpy(de->name, name, namelen); /* @@ -1318,7 +1635,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, dir->i_version++; ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return 0; @@ -1339,6 +1656,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; + struct ext4_dir_entry_tail *t; char *data1, *top; unsigned len; int retval; @@ -1346,6 +1664,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); @@ -1366,7 +1689,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, brelse(bh); return -EIO; } - len = ((char *) root) + blocksize - (char *) de; + len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block, &retval); @@ -1382,8 +1705,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, top = data1 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), @@ -1408,8 +1738,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->bh = bh; bh = bh2; - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); + ext4_handle_dirty_dx_node(handle, dir, frame->bh); + ext4_handle_dirty_dirent_node(handle, dir, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); if (!de) { @@ -1445,11 +1775,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; struct super_block *sb; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; @@ -1468,6 +1804,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, bh = ext4_bread(handle, dir, block, 0, &retval); if(!bh) return retval; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) + return -EIO; + set_buffer_verified(bh); retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) { brelse(bh); @@ -1484,7 +1825,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); if (retval == 0) @@ -1516,6 +1863,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) goto cleanup; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) + goto journal_error; + set_buffer_verified(bh); + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) @@ -1583,7 +1935,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, dir, bh2); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1609,7 +1961,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; @@ -1641,12 +1993,17 @@ static int ext4_delete_entry(handle_t *handle, { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; int i, err; + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + i = 0; pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { + while (i < bh->b_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { @@ -1667,7 +2024,7 @@ static int ext4_delete_entry(handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); if (unlikely(err)) { ext4_std_error(dir->i_sb, err); return err; @@ -1809,9 +2166,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; int err, retries = 0; + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -1852,16 +2215,24 @@ retry: ext4_set_de_type(dir->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), blocksize); de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); set_nlink(inode, 2); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) goto out_clear_inode; + set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -1911,6 +2282,14 @@ static int empty_dir(struct inode *inode) inode->i_ino); return 1; } + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(inode, "checksum error reading directory " + "lblock 0"); + return -EIO; + } + set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || @@ -1942,6 +2321,14 @@ static int empty_dir(struct inode *inode) offset += sb->s_blocksize; continue; } + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(inode, "checksum error " + "reading directory lblock 0"); + return -EIO; + } + set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { @@ -2010,7 +2397,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super_now(handle, sb); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -2083,7 +2470,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + err = ext4_handle_dirty_super_now(handle, inode->i_sb); } else { struct ext4_iloc iloc2; struct inode *i_prev = @@ -2442,6 +2829,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; + if (!buffer_verified(dir_bh) && + !ext4_dirent_csum_verify(old_inode, + (struct ext4_dir_entry *)dir_bh->b_data)) + goto end_rename; + set_buffer_verified(dir_bh); if (le32_to_cpu(PARENT_INO(dir_bh->b_data, old_dir->i_sb->s_blocksize)) != old_dir->i_ino) goto end_rename; @@ -2472,7 +2864,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); if (unlikely(retval)) { ext4_std_error(new_dir->i_sb, retval); goto end_rename; @@ -2526,7 +2918,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); + retval = ext4_handle_dirty_dirent_node(handle, old_inode, + dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 59fa0be..7ea6cbb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -161,6 +161,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) if (flex_gd == NULL) goto out3; + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + goto out2; flex_gd->count = flexbg_size; flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * @@ -796,7 +798,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super_now(handle, sb); if (err) ext4_std_error(sb, err); @@ -968,6 +970,8 @@ static void update_backups(struct super_block *sb, goto exit_err; } + ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); + while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { struct buffer_head *bh; @@ -1067,6 +1071,54 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, return err; } +static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) +{ + struct buffer_head *bh = sb_getblk(sb, block); + if (!bh) + return NULL; + + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bh_submit_read(bh) < 0) { + unlock_buffer(bh); + brelse(bh); + return NULL; + } + unlock_buffer(bh); + + return bh; +} + +static int ext4_set_bitmap_checksums(struct super_block *sb, + ext4_group_t group, + struct ext4_group_desc *gdp, + struct ext4_new_group_data *group_data) +{ + struct buffer_head *bh; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + bh = ext4_get_bitmap(sb, group_data->inode_bitmap); + if (!bh) + return -EIO; + ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + brelse(bh); + + bh = ext4_get_bitmap(sb, group_data->block_bitmap); + if (!bh) + return -EIO; + ext4_block_bitmap_csum_set(sb, group, gdp, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + brelse(bh); + + return 0; +} + /* * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg */ @@ -1093,18 +1145,24 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, */ gdb_bh = sbi->s_group_desc[gdb_num]; /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + gdb_off * EXT4_DESC_SIZE(sb)); memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + if (err) { + ext4_std_error(sb, err); + break; + } + ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, EXT4_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { @@ -1343,17 +1401,14 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, (1 + ext4_bg_num_gdb(sb, group + i) + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; group_data[i].free_blocks_count = blocks_per_group - overhead; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_group_desc_csum(sb)) flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } - if (last_group == n_group && - EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (last_group == n_group && ext4_has_group_desc_csum(sb)) /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 35b5954..eb7aa3e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -112,6 +112,48 @@ static struct file_system_type ext3_fs_type = { #define IS_EXT3_SB(sb) (0) #endif +static int ext4_verify_csum_type(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum_type == EXT4_CRC32C_CHKSUM; +} + +static __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct ext4_super_block, s_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, ~0, (char *)es, offset); + + return cpu_to_le32(csum); +} + +int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum == ext4_superblock_csum(sb, es); +} + +void ext4_superblock_csum_set(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + es->s_checksum = ext4_superblock_csum(sb, es); +} + void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -497,6 +539,7 @@ void __ext4_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -905,6 +948,8 @@ static void ext4_put_super(struct super_block *sb) unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); kfree(sbi); } @@ -1922,43 +1967,69 @@ failed: return 0; } -__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, - struct ext4_group_desc *gdp) +static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) { + int offset; __u16 crc = 0; + __le32 le_group = cpu_to_le32(block_group); - if (sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - int offset = offsetof(struct ext4_group_desc, bg_checksum); - __le32 le_group = cpu_to_le32(block_group); - - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); - crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); - crc = crc16(crc, (__u8 *)gdp, offset); - offset += sizeof(gdp->bg_checksum); /* skip checksum */ - /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) - crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + /* Use new metadata_csum algorithm */ + __u16 old_csum; + __u32 csum32; + + old_csum = gdp->bg_checksum; + gdp->bg_checksum = 0; + csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + sizeof(le_group)); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, + sbi->s_desc_size); + gdp->bg_checksum = old_csum; + + crc = csum32 & 0xFFFF; + goto out; } + /* old crc16 code */ + offset = offsetof(struct ext4_group_desc, bg_checksum); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + +out: return cpu_to_le16(crc); } -int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, +int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && - (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), + block_group, gdp))) return 0; return 1; } +void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (!ext4_has_group_desc_csum(sb)) + return; + gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); +} + /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_group_t *first_not_zeroed) @@ -2013,7 +2084,7 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } ext4_lock_group(sb, i); - if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, @@ -2417,6 +2488,23 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, return count; } +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -2447,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2461,6 +2550,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(trigger_fs_error), NULL, }; @@ -2957,6 +3047,44 @@ static void ext4_destroy_lazyinit_thread(void) kthread_stop(ext4_lazyinit_task); } +static int set_journal_csum_feature_set(struct super_block *sb) +{ + int ret = 1; + int compat, incompat; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + /* journal checksum v2 */ + compat = 0; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + } else { + /* journal checksum v1 */ + compat = JBD2_FEATURE_COMPAT_CHECKSUM; + incompat = 0; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + incompat); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + incompat); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V2); + } + + return ret; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -2993,6 +3121,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto out_free_orig; } sb->s_fs_info = sbi; + sbi->s_sb = sb; sbi->s_mount_opt = 0; sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); @@ -3032,13 +3161,54 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + /* Warn if metadata_csum and gdt_csum are both set. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); + + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + silent = 1; + goto cantfind_ext4; + } + + /* Load the checksum driver */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; + } + } + + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + silent = 1; + goto cantfind_ext4; + } + + /* Precompute checksum seed for all metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); + /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); @@ -3200,7 +3370,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "Can't read superblock on 2nd try"); goto failed_mount; } - es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, @@ -3392,6 +3562,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); + ret = -ENOMEM; goto failed_mount; } @@ -3449,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); + ret = err; goto failed_mount3; } @@ -3506,26 +3678,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto no_journal; } - if (ext4_blocks_count(es) > 0xffffffffULL && + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto failed_mount_wq; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - jbd2_journal_clear_features(sbi->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto failed_mount_wq; } /* We have now updated the journal if required, so we can @@ -3606,7 +3769,8 @@ no_journal: goto failed_mount4; } - ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { @@ -3641,7 +3805,7 @@ no_journal: } ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); @@ -3724,6 +3888,8 @@ failed_mount2: brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); @@ -3847,7 +4013,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { @@ -4039,6 +4205,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); + ext4_superblock_csum_set(sb, es); mark_buffer_dirty(sbh); if (sync) { error = sync_dirty_buffer(sbh); @@ -4333,7 +4500,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); - if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e88748e..e56c9ed 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -122,6 +122,58 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +static __le32 ext4_xattr_block_csum(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum, old; + + old = hdr->h_checksum; + hdr->h_checksum = 0; + if (le32_to_cpu(hdr->h_refcount) != 1) { + block_nr = cpu_to_le64(block_nr); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, + sizeof(block_nr)); + } else + csum = ei->i_csum_seed; + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, + EXT4_BLOCK_SIZE(inode->i_sb)); + hdr->h_checksum = old; + return cpu_to_le32(csum); +} + +static int ext4_xattr_block_csum_verify(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) + return 0; + return 1; +} + +static void ext4_xattr_block_csum_set(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); +} + +static inline int ext4_handle_dirty_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { @@ -156,12 +208,22 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) } static inline int -ext4_xattr_check_block(struct buffer_head *bh) +ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) { + int error; + + if (buffer_verified(bh)) + return 0; + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; - return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + return -EIO; + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + if (!error) + set_buffer_verified(bh); + return error; } static inline int @@ -224,7 +286,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); @@ -369,7 +431,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; @@ -492,7 +554,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_release(ce); unlock_buffer(bh); - error = ext4_handle_dirty_metadata(handle, inode, bh); + error = ext4_handle_dirty_xattr_block(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); @@ -662,7 +724,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(bs->bh)) { + if (ext4_xattr_check_block(inode, bs->bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; @@ -725,9 +787,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error == -EIO) goto bad_block; if (!error) - error = ext4_handle_dirty_metadata(handle, - inode, - bs->bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -796,9 +858,9 @@ inserted: ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, - new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -855,8 +917,8 @@ getblk_failed: set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, new_bh); if (error) goto cleanup; } @@ -1193,7 +1255,7 @@ retry: error = -EIO; if (!bh) goto cleanup; - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 25b7387..91f31ca7 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -27,7 +27,9 @@ struct ext4_xattr_header { __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ + __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ + /* id = inum if refcount=1, blknum otherwise */ + __u32 h_reserved[3]; /* zero right now */ }; struct ext4_xattr_ibody_header { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c2973ea..a3d81eb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -735,10 +735,9 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb, } static int -fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) +fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent) { int len = *lenp; - struct inode *inode = de->d_inode; u32 ipos_h, ipos_m, ipos_l; if (len < 5) { @@ -754,9 +753,9 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) fh[1] = inode->i_generation; fh[2] = ipos_h; fh[3] = ipos_m | MSDOS_I(inode)->i_logstart; - spin_lock(&de->d_lock); - fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart; - spin_unlock(&de->d_lock); + fh[4] = ipos_l; + if (parent) + fh[4] |= MSDOS_I(parent)->i_logstart; return 3; } @@ -442,28 +442,24 @@ static int check_fcntl_cmd(unsigned cmd) SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; + int fput_needed; long err = -EBADF; - filp = fget_raw(fd); + filp = fget_raw_light(fd, &fput_needed); if (!filp) goto out; if (unlikely(filp->f_mode & FMODE_PATH)) { - if (!check_fcntl_cmd(cmd)) { - fput(filp); - goto out; - } + if (!check_fcntl_cmd(cmd)) + goto out1; } err = security_file_fcntl(filp, cmd, arg); - if (err) { - fput(filp); - return err; - } + if (!err) + err = do_fcntl(fd, cmd, arg, filp); - err = do_fcntl(fd, cmd, arg, filp); - - fput(filp); +out1: + fput_light(filp, fput_needed); out: return err; } @@ -473,26 +469,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file * filp; - long err; + long err = -EBADF; + int fput_needed; - err = -EBADF; - filp = fget_raw(fd); + filp = fget_raw_light(fd, &fput_needed); if (!filp) goto out; if (unlikely(filp->f_mode & FMODE_PATH)) { - if (!check_fcntl_cmd(cmd)) { - fput(filp); - goto out; - } + if (!check_fcntl_cmd(cmd)) + goto out1; } err = security_file_fcntl(filp, cmd, arg); - if (err) { - fput(filp); - return err; - } - err = -EBADF; + if (err) + goto out1; switch (cmd) { case F_GETLK64: @@ -507,7 +498,8 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, err = do_fcntl(fd, cmd, arg, filp); break; } - fput(filp); +out1: + fput_light(filp, fput_needed); out: return err; } diff --git a/fs/file_table.c b/fs/file_table.c index 70f2a0f..a305d9e 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -34,7 +34,6 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -DECLARE_LGLOCK(files_lglock); DEFINE_LGLOCK(files_lglock); /* SLAB cache for file structures */ @@ -421,9 +420,9 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb) */ void file_sb_list_add(struct file *file, struct super_block *sb) { - lg_local_lock(files_lglock); + lg_local_lock(&files_lglock); __file_sb_list_add(file, sb); - lg_local_unlock(files_lglock); + lg_local_unlock(&files_lglock); } /** @@ -436,9 +435,9 @@ void file_sb_list_add(struct file *file, struct super_block *sb) void file_sb_list_del(struct file *file) { if (!list_empty(&file->f_u.fu_list)) { - lg_local_lock_cpu(files_lglock, file_list_cpu(file)); + lg_local_lock_cpu(&files_lglock, file_list_cpu(file)); list_del_init(&file->f_u.fu_list); - lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); + lg_local_unlock_cpu(&files_lglock, file_list_cpu(file)); } } @@ -485,7 +484,7 @@ void mark_files_ro(struct super_block *sb) struct file *f; retry: - lg_global_lock(files_lglock); + lg_global_lock(&files_lglock); do_file_list_for_each_entry(sb, f) { struct vfsmount *mnt; if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) @@ -502,12 +501,12 @@ retry: file_release_write(f); mnt = mntget(f->f_path.mnt); /* This can sleep, so we can't hold the spinlock. */ - lg_global_unlock(files_lglock); + lg_global_unlock(&files_lglock); mnt_drop_write(mnt); mntput(mnt); goto retry; } while_file_list_for_each_entry; - lg_global_unlock(files_lglock); + lg_global_unlock(&files_lglock); } void __init files_init(unsigned long mempages) @@ -525,6 +524,6 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); files_defer_init(); - lg_lock_init(files_lglock); + lg_lock_init(&files_lglock, "files_lglock"); percpu_counter_init(&nr_files, 0); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 504e61b..9562109 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -962,7 +962,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - file_update_time(file); + err = file_update_time(file); + if (err) + goto out; if (file->f_flags & O_DIRECT) { written = generic_file_direct_write(iocb, iov, &nr_segs, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 56f6dcf..42678a3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -627,12 +627,10 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, return ERR_PTR(err); } -static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, - int connectable) +static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { - struct inode *inode = dentry->d_inode; - bool encode_parent = connectable && !S_ISDIR(inode->i_mode); - int len = encode_parent ? 6 : 3; + int len = parent ? 6 : 3; u64 nodeid; u32 generation; @@ -648,14 +646,9 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, fh[1] = (u32)(nodeid & 0xffffffff); fh[2] = generation; - if (encode_parent) { - struct inode *parent; - - spin_lock(&dentry->d_lock); - parent = dentry->d_parent->d_inode; + if (parent) { nodeid = get_fuse_inode(parent)->nodeid; generation = parent->i_generation; - spin_unlock(&dentry->d_lock); fh[3] = (u32)(nodeid >> 32); fh[4] = (u32)(nodeid & 0xffffffff); @@ -663,7 +656,7 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, } *max_len = len; - return encode_parent ? 0x82 : 0x81; + return parent ? 0x82 : 0x81; } static struct dentry *fuse_fh_to_dentry(struct super_block *sb, diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 70ba891..e8ed6d4 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -28,15 +28,14 @@ #define GFS2_LARGE_FH_SIZE 8 #define GFS2_OLD_FH_SIZE 10 -static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, - int connectable) +static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, + struct inode *parent) { __be32 *fh = (__force __be32 *)p; - struct inode *inode = dentry->d_inode; struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + if (parent && (*len < GFS2_LARGE_FH_SIZE)) { *len = GFS2_LARGE_FH_SIZE; return 255; } else if (*len < GFS2_SMALL_FH_SIZE) { @@ -50,14 +49,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_SMALL_FH_SIZE; - if (!connectable || inode == sb->s_root->d_inode) + if (!parent || inode == sb->s_root->d_inode) return *len; - spin_lock(&dentry->d_lock); - inode = dentry->d_parent->d_inode; - ip = GFS2_I(inode); - igrab(inode); - spin_unlock(&dentry->d_lock); + ip = GFS2_I(parent); fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); @@ -65,8 +60,6 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_LARGE_FH_SIZE; - iput(inode); - return *len; } diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 7a5eb2c..cdb84a8 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -16,9 +16,9 @@ static int chk_if_allocated(struct super_block *s, secno sec, char *msg) { struct quad_buffer_head qbh; - u32 *bmp; + __le32 *bmp; if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; - if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { + if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); goto fail1; } @@ -62,7 +62,7 @@ int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg) static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward) { struct quad_buffer_head qbh; - unsigned *bmp; + __le32 *bmp; unsigned bs = near & ~0x3fff; unsigned nr = (near & 0x3fff) & ~(n - 1); /*unsigned mnr;*/ @@ -236,7 +236,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near) int hpfs_alloc_if_possible(struct super_block *s, secno sec) { struct quad_buffer_head qbh; - u32 *bmp; + __le32 *bmp; if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); @@ -254,7 +254,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec) void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) { struct quad_buffer_head qbh; - u32 *bmp; + __le32 *bmp; struct hpfs_sb_info *sbi = hpfs_sb(s); /*printk("2 - ");*/ if (!n) return; @@ -299,7 +299,7 @@ int hpfs_check_free_dnodes(struct super_block *s, int n) int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; int i, j; - u32 *bmp; + __le32 *bmp; struct quad_buffer_head qbh; if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { for (j = 0; j < 512; j++) { @@ -351,7 +351,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno) hpfs_free_sectors(s, dno, 4); } else { struct quad_buffer_head qbh; - u32 *bmp; + __le32 *bmp; unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { return; diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index 08b503e8..4bae4a4a 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -20,7 +20,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, int c1, c2 = 0; go_down: if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; - if (btree->internal) { + if (bp_internal(btree)) { for (i = 0; i < btree->n_used_nodes; i++) if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) { a = le32_to_cpu(btree->u.internal[i].down); @@ -82,7 +82,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi brelse(bh); return -1; } - if (btree->internal) { + if (bp_internal(btree)) { a = le32_to_cpu(btree->u.internal[n].down); btree->u.internal[n].file_secno = cpu_to_le32(-1); mark_buffer_dirty(bh); @@ -129,12 +129,12 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } if (a == node && fnod) { anode->up = cpu_to_le32(node); - anode->btree.fnode_parent = 1; + anode->btree.flags |= BP_fnode_parent; anode->btree.n_used_nodes = btree->n_used_nodes; anode->btree.first_free = btree->first_free; anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes; memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12); - btree->internal = 1; + btree->flags |= BP_internal; btree->n_free_nodes = 11; btree->n_used_nodes = 1; btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree); @@ -184,7 +184,10 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi hpfs_free_sectors(s, ra, 1); if ((anode = hpfs_map_anode(s, na, &bh))) { anode->up = cpu_to_le32(up); - anode->btree.fnode_parent = up == node && fnod; + if (up == node && fnod) + anode->btree.flags |= BP_fnode_parent; + else + anode->btree.flags &= ~BP_fnode_parent; mark_buffer_dirty(bh); brelse(bh); } @@ -198,7 +201,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) { anode = new_anode; /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/ - anode->btree.internal = 1; + anode->btree.flags |= BP_internal; anode->btree.n_used_nodes = 1; anode->btree.n_free_nodes = 59; anode->btree.first_free = cpu_to_le16(16); @@ -215,7 +218,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } if ((anode = hpfs_map_anode(s, na, &bh))) { anode->up = cpu_to_le32(node); - if (fnod) anode->btree.fnode_parent = 1; + if (fnod) + anode->btree.flags |= BP_fnode_parent; mark_buffer_dirty(bh); brelse(bh); } @@ -234,18 +238,19 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } ranode->up = cpu_to_le32(node); memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); - if (fnod) ranode->btree.fnode_parent = 1; - ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes; - if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) { + if (fnod) + ranode->btree.flags |= BP_fnode_parent; + ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes; + if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) { struct anode *unode; if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { unode->up = cpu_to_le32(ra); - unode->btree.fnode_parent = 0; + unode->btree.flags &= ~BP_fnode_parent; mark_buffer_dirty(bh1); brelse(bh1); } } - btree->internal = 1; + btree->flags |= BP_internal; btree->n_free_nodes = fnod ? 10 : 58; btree->n_used_nodes = 2; btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree); @@ -278,7 +283,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) int d1, d2; go_down: d2 = 0; - while (btree1->internal) { + while (bp_internal(btree1)) { ano = le32_to_cpu(btree1->u.internal[pos].down); if (level) brelse(bh); if (hpfs_sb(s)->sb_chk) @@ -412,13 +417,13 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) btree->n_free_nodes = 8; btree->n_used_nodes = 0; btree->first_free = cpu_to_le16(8); - btree->internal = 0; + btree->flags &= ~BP_internal; mark_buffer_dirty(bh); } else hpfs_free_sectors(s, f, 1); brelse(bh); return; } - while (btree->internal) { + while (bp_internal(btree)) { nodes = btree->n_used_nodes + btree->n_free_nodes; for (i = 0; i < btree->n_used_nodes; i++) if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f; @@ -479,13 +484,13 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno) struct extended_attribute *ea; struct extended_attribute *ea_end; if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; - if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree); + if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree); else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) - if (ea->indirect) - hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); - hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l)); + if (ea_indirect(ea)) + hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea)); + hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l)); brelse(bh); hpfs_free_sectors(s, fno, 1); } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2fa0089..b8472f8 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -87,7 +87,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) ret = -EIOERROR; goto out; } - if (!fno->dirflag) { + if (!fnode_is_dir(fno)) { e = 1; hpfs_error(inode->i_sb, "not a directory, fnode %08lx", (unsigned long)inode->i_ino); diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 1e0e2ac..3228c52 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -153,7 +153,7 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno } de->length = cpu_to_le16(36); de->down = 1; - *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr); + *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr); } } @@ -177,7 +177,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, memmove((char *)de + d_size, de, (char *)de_end - (char *)de); memset(de, 0, d_size); if (down_ptr) { - *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); + *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr); de->down = 1; } de->length = cpu_to_le16(d_size); @@ -656,7 +656,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) del->down = 0; d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4); } else if (down) - *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); + *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); } else goto endm; if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { printk("HPFS: out of memory for dtree balancing\n"); @@ -672,7 +672,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) de_prev->down = 1; dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4); } - *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); + *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4); @@ -1015,7 +1015,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, kfree(name2); return NULL; } - if (!upf->dirflag) { + if (!fnode_is_dir(upf)) { brelse(bh); hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up)); kfree(name2); diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index d8b84d1..bcaafcd 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -23,15 +23,15 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) return; } if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; - if (ea->indirect) { + if (ea_indirect(ea)) { if (ea_valuelen(ea) != 8) { - hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x", + hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x", ano ? "anode" : "sectors", a, pos); return; } if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4)) return; - hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea)); + hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea)); } pos += ea->namelen + ea_valuelen(ea) + 5; } @@ -81,7 +81,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, struct extended_attribute *ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) if (!strcmp(ea->name, key)) { - if (ea->indirect) + if (ea_indirect(ea)) goto indirect; if (ea_valuelen(ea) >= size) return -EINVAL; @@ -91,7 +91,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, } a = le32_to_cpu(fnode->ea_secno); len = le32_to_cpu(fnode->ea_size_l); - ano = fnode->ea_anode; + ano = fnode_in_anode(fnode); pos = 0; while (pos < len) { ea = (struct extended_attribute *)ex; @@ -101,10 +101,10 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, return -EIO; } if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO; - if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4)) return -EIO; if (!strcmp(ea->name, key)) { - if (ea->indirect) + if (ea_indirect(ea)) goto indirect; if (ea_valuelen(ea) >= size) return -EINVAL; @@ -119,7 +119,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key, indirect: if (ea_len(ea) >= size) return -EINVAL; - if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf)) + if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf)) return -EIO; buf[ea_len(ea)] = 0; return 0; @@ -136,8 +136,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si struct extended_attribute *ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) if (!strcmp(ea->name, key)) { - if (ea->indirect) - return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); + if (ea_indirect(ea)) + return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { printk("HPFS: out of memory for EA\n"); return NULL; @@ -148,7 +148,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si } a = le32_to_cpu(fnode->ea_secno); len = le32_to_cpu(fnode->ea_size_l); - ano = fnode->ea_anode; + ano = fnode_in_anode(fnode); pos = 0; while (pos < len) { char ex[4 + 255 + 1 + 8]; @@ -159,11 +159,11 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si return NULL; } if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL; - if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4)) return NULL; if (!strcmp(ea->name, key)) { - if (ea->indirect) - return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea)); + if (ea_indirect(ea)) + return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { printk("HPFS: out of memory for EA\n"); return NULL; @@ -199,9 +199,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, struct extended_attribute *ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) if (!strcmp(ea->name, key)) { - if (ea->indirect) { + if (ea_indirect(ea)) { if (ea_len(ea) == size) - set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); + set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size); } else if (ea_valuelen(ea) == size) { memcpy(ea_data(ea), data, size); } @@ -209,7 +209,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, } a = le32_to_cpu(fnode->ea_secno); len = le32_to_cpu(fnode->ea_size_l); - ano = fnode->ea_anode; + ano = fnode_in_anode(fnode); pos = 0; while (pos < len) { char ex[4 + 255 + 1 + 8]; @@ -220,12 +220,12 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, return; } if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return; - if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4)) + if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4)) return; if (!strcmp(ea->name, key)) { - if (ea->indirect) { + if (ea_indirect(ea)) { if (ea_len(ea) == size) - set_indirect_ea(s, ea->anode, ea_sec(ea), data, size); + set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size); } else { if (ea_valuelen(ea) == size) @@ -246,7 +246,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", (unsigned long)inode->i_ino, - le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); + le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); return; } if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) && @@ -276,7 +276,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s)); fnode->ea_size_s = cpu_to_le16(0); fnode->ea_secno = cpu_to_le32(n); - fnode->ea_anode = cpu_to_le32(0); + fnode->flags &= ~FNODE_anode; mark_buffer_dirty(bh); brelse(bh); } @@ -288,9 +288,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, secno q = hpfs_alloc_sector(s, fno, 1, 0); if (!q) goto bail; fnode->ea_secno = cpu_to_le32(q); - fnode->ea_anode = 0; + fnode->flags &= ~FNODE_anode; len++; - } else if (!fnode->ea_anode) { + } else if (!fnode_in_anode(fnode)) { if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) { len++; } else { @@ -310,7 +310,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, anode->u.external[0].length = cpu_to_le32(len); mark_buffer_dirty(bh); brelse(bh); - fnode->ea_anode = 1; + fnode->flags |= FNODE_anode; fnode->ea_secno = cpu_to_le32(a_s);*/ secno new_sec; int i; @@ -338,7 +338,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, len = (pos + 511) >> 9; } } - if (fnode->ea_anode) { + if (fnode_in_anode(fnode)) { if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno), 0, len) != -1) { len++; @@ -351,16 +351,16 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, h[1] = strlen(key); h[2] = size & 0xff; h[3] = size >> 8; - if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; - if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; - if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail; + if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail; fnode->ea_size_l = cpu_to_le32(pos); ret: hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size; return; bail: if (le32_to_cpu(fnode->ea_secno)) - if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); + if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9); else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9)); else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0); } diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 8b0650a..cce025a 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -51,11 +51,11 @@ struct hpfs_boot_block u8 n_rootdir_entries[2]; u8 n_sectors_s[2]; u8 media_byte; - u16 sectors_per_fat; - u16 sectors_per_track; - u16 heads_per_cyl; - u32 n_hidden_sectors; - u32 n_sectors_l; /* size of partition */ + __le16 sectors_per_fat; + __le16 sectors_per_track; + __le16 heads_per_cyl; + __le32 n_hidden_sectors; + __le32 n_sectors_l; /* size of partition */ u8 drive_number; u8 mbz; u8 sig_28h; /* 28h */ @@ -63,7 +63,7 @@ struct hpfs_boot_block u8 vol_label[11]; u8 sig_hpfs[8]; /* "HPFS " */ u8 pad[448]; - u16 magic; /* aa55 */ + __le16 magic; /* aa55 */ }; @@ -75,28 +75,28 @@ struct hpfs_boot_block struct hpfs_super_block { - u32 magic; /* f995 e849 */ - u32 magic1; /* fa53 e9c5, more magic? */ + __le32 magic; /* f995 e849 */ + __le32 magic1; /* fa53 e9c5, more magic? */ u8 version; /* version of a filesystem usually 2 */ u8 funcversion; /* functional version - oldest version of filesystem that can understand this disk */ - u16 zero; /* 0 */ - fnode_secno root; /* fnode of root directory */ - secno n_sectors; /* size of filesystem */ - u32 n_badblocks; /* number of bad blocks */ - secno bitmaps; /* pointers to free space bit maps */ - u32 zero1; /* 0 */ - secno badblocks; /* bad block list */ - u32 zero3; /* 0 */ - time32_t last_chkdsk; /* date last checked, 0 if never */ - time32_t last_optimize; /* date last optimized, 0 if never */ - secno n_dir_band; /* number of sectors in dir band */ - secno dir_band_start; /* first sector in dir band */ - secno dir_band_end; /* last sector in dir band */ - secno dir_band_bitmap; /* free space map, 1 dnode per bit */ + __le16 zero; /* 0 */ + __le32 root; /* fnode of root directory */ + __le32 n_sectors; /* size of filesystem */ + __le32 n_badblocks; /* number of bad blocks */ + __le32 bitmaps; /* pointers to free space bit maps */ + __le32 zero1; /* 0 */ + __le32 badblocks; /* bad block list */ + __le32 zero3; /* 0 */ + __le32 last_chkdsk; /* date last checked, 0 if never */ + __le32 last_optimize; /* date last optimized, 0 if never */ + __le32 n_dir_band; /* number of sectors in dir band */ + __le32 dir_band_start; /* first sector in dir band */ + __le32 dir_band_end; /* last sector in dir band */ + __le32 dir_band_bitmap; /* free space map, 1 dnode per bit */ u8 volume_name[32]; /* not used */ - secno user_id_table; /* 8 preallocated sectors - user id */ + __le32 user_id_table; /* 8 preallocated sectors - user id */ u32 zero6[103]; /* 0 */ }; @@ -109,8 +109,8 @@ struct hpfs_super_block struct hpfs_spare_block { - u32 magic; /* f991 1849 */ - u32 magic1; /* fa52 29c5, more magic? */ + __le32 magic; /* f991 1849 */ + __le32 magic1; /* fa52 29c5, more magic? */ #ifdef __LITTLE_ENDIAN u8 dirty: 1; /* 0 clean, 1 "improperly stopped" */ @@ -153,21 +153,21 @@ struct hpfs_spare_block u8 mm_contlgulty; u8 unused; - secno hotfix_map; /* info about remapped bad sectors */ - u32 n_spares_used; /* number of hotfixes */ - u32 n_spares; /* number of spares in hotfix map */ - u32 n_dnode_spares_free; /* spare dnodes unused */ - u32 n_dnode_spares; /* length of spare_dnodes[] list, + __le32 hotfix_map; /* info about remapped bad sectors */ + __le32 n_spares_used; /* number of hotfixes */ + __le32 n_spares; /* number of spares in hotfix map */ + __le32 n_dnode_spares_free; /* spare dnodes unused */ + __le32 n_dnode_spares; /* length of spare_dnodes[] list, follows in this block*/ - secno code_page_dir; /* code page directory block */ - u32 n_code_pages; /* number of code pages */ - u32 super_crc; /* on HPFS386 and LAN Server this is + __le32 code_page_dir; /* code page directory block */ + __le32 n_code_pages; /* number of code pages */ + __le32 super_crc; /* on HPFS386 and LAN Server this is checksum of superblock, on normal OS/2 unused */ - u32 spare_crc; /* on HPFS386 checksum of spareblock */ - u32 zero1[15]; /* unused */ - dnode_secno spare_dnodes[100]; /* emergency free dnode list */ - u32 zero2[1]; /* room for more? */ + __le32 spare_crc; /* on HPFS386 checksum of spareblock */ + __le32 zero1[15]; /* unused */ + __le32 spare_dnodes[100]; /* emergency free dnode list */ + __le32 zero2[1]; /* room for more? */ }; /* The bad block list is 4 sectors long. The first word must be zero, @@ -202,18 +202,18 @@ struct hpfs_spare_block struct code_page_directory { - u32 magic; /* 4945 21f7 */ - u32 n_code_pages; /* number of pointers following */ - u32 zero1[2]; + __le32 magic; /* 4945 21f7 */ + __le32 n_code_pages; /* number of pointers following */ + __le32 zero1[2]; struct { - u16 ix; /* index */ - u16 code_page_number; /* code page number */ - u32 bounds; /* matches corresponding word + __le16 ix; /* index */ + __le16 code_page_number; /* code page number */ + __le32 bounds; /* matches corresponding word in data block */ - secno code_page_data; /* sector number of a code_page_data + __le32 code_page_data; /* sector number of a code_page_data containing c.p. array */ - u16 index; /* index in c.p. array in that sector*/ - u16 unknown; /* some unknown value; usually 0; + __le16 index; /* index in c.p. array in that sector*/ + __le16 unknown; /* some unknown value; usually 0; 2 in Japanese version */ } array[31]; /* unknown length */ }; @@ -224,19 +224,19 @@ struct code_page_directory struct code_page_data { - u32 magic; /* 8945 21f7 */ - u32 n_used; /* # elements used in c_p_data[] */ - u32 bounds[3]; /* looks a bit like + __le32 magic; /* 8945 21f7 */ + __le32 n_used; /* # elements used in c_p_data[] */ + __le32 bounds[3]; /* looks a bit like (beg1,end1), (beg2,end2) one byte each */ - u16 offs[3]; /* offsets from start of sector + __le16 offs[3]; /* offsets from start of sector to start of c_p_data[ix] */ struct { - u16 ix; /* index */ - u16 code_page_number; /* code page number */ - u16 unknown; /* the same as in cp directory */ + __le16 ix; /* index */ + __le16 code_page_number; /* code page number */ + __le16 unknown; /* the same as in cp directory */ u8 map[128]; /* upcase table for chars 80..ff */ - u16 zero2; + __le16 zero2; } code_page[3]; u8 incognita[78]; }; @@ -278,8 +278,8 @@ struct code_page_data #define DNODE_MAGIC 0x77e40aae struct dnode { - u32 magic; /* 77e4 0aae */ - u32 first_free; /* offset from start of dnode to + __le32 magic; /* 77e4 0aae */ + __le32 first_free; /* offset from start of dnode to first free dir entry */ #ifdef __LITTLE_ENDIAN u8 root_dnode: 1; /* Is it root dnode? */ @@ -293,14 +293,14 @@ struct dnode { u8 root_dnode: 1; /* Is it root dnode? */ #endif u8 increment_me2[3]; - secno up; /* (root dnode) directory's fnode + __le32 up; /* (root dnode) directory's fnode (nonroot) parent dnode */ - dnode_secno self; /* pointer to this dnode */ + __le32 self; /* pointer to this dnode */ u8 dirent[2028]; /* one or more dirents */ }; struct hpfs_dirent { - u16 length; /* offset to next dirent */ + __le16 length; /* offset to next dirent */ #ifdef __LITTLE_ENDIAN u8 first: 1; /* set on phony ^A^A (".") entry */ @@ -346,12 +346,12 @@ struct hpfs_dirent { u8 read_only: 1; /* dos attrib */ #endif - fnode_secno fnode; /* fnode giving allocation info */ - time32_t write_date; /* mtime */ - u32 file_size; /* file length, bytes */ - time32_t read_date; /* atime */ - time32_t creation_date; /* ctime */ - u32 ea_size; /* total EA length, bytes */ + __le32 fnode; /* fnode giving allocation info */ + __le32 write_date; /* mtime */ + __le32 file_size; /* file length, bytes */ + __le32 read_date; /* atime */ + __le32 creation_date; /* ctime */ + __le32 ea_size; /* total EA length, bytes */ u8 no_of_acls; /* number of ACL's (low 3 bits) */ u8 ix; /* code page index (of filename), see struct code_page_data */ @@ -375,50 +375,36 @@ struct hpfs_dirent { struct bplus_leaf_node { - u32 file_secno; /* first file sector in extent */ - u32 length; /* length, sectors */ - secno disk_secno; /* first corresponding disk sector */ + __le32 file_secno; /* first file sector in extent */ + __le32 length; /* length, sectors */ + __le32 disk_secno; /* first corresponding disk sector */ }; struct bplus_internal_node { - u32 file_secno; /* subtree maps sectors < this */ - anode_secno down; /* pointer to subtree */ + __le32 file_secno; /* subtree maps sectors < this */ + __le32 down; /* pointer to subtree */ }; +enum { + BP_hbff = 1, + BP_fnode_parent = 0x20, + BP_binary_search = 0x40, + BP_internal = 0x80 +}; struct bplus_header { -#ifdef __LITTLE_ENDIAN - u8 hbff: 1; /* high bit of first free entry offset */ - u8 flag1234: 4; - u8 fnode_parent: 1; /* ? we're pointed to by an fnode, - the data btree or some ea or the - main ea bootage pointer ea_secno */ - /* also can get set in fnodes, which - may be a chkdsk glitch or may mean - this bit is irrelevant in fnodes, - or this interpretation is all wet */ - u8 binary_search: 1; /* suggest binary search (unused) */ - u8 internal: 1; /* 1 -> (internal) tree of anodes - 0 -> (leaf) list of extents */ -#else - u8 internal: 1; /* 1 -> (internal) tree of anodes - 0 -> (leaf) list of extents */ - u8 binary_search: 1; /* suggest binary search (unused) */ - u8 fnode_parent: 1; /* ? we're pointed to by an fnode, + u8 flags; /* bit 0 - high bit of first free entry offset + bit 5 - we're pointed to by an fnode, the data btree or some ea or the - main ea bootage pointer ea_secno */ - /* also can get set in fnodes, which - may be a chkdsk glitch or may mean - this bit is irrelevant in fnodes, - or this interpretation is all wet */ - u8 flag1234: 4; - u8 hbff: 1; /* high bit of first free entry offset */ -#endif + main ea bootage pointer ea_secno + bit 6 - suggest binary search (unused) + bit 7 - 1 -> (internal) tree of anodes + 0 -> (leaf) list of extents */ u8 fill[3]; u8 n_free_nodes; /* free nodes in following array */ u8 n_used_nodes; /* used nodes in following array */ - u16 first_free; /* offset from start of header to + __le16 first_free; /* offset from start of header to first free node in array */ union { struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving @@ -428,6 +414,16 @@ struct bplus_header } u; }; +static inline bool bp_internal(struct bplus_header *bp) +{ + return bp->flags & BP_internal; +} + +static inline bool bp_fnode_parent(struct bplus_header *bp) +{ + return bp->flags & BP_fnode_parent; +} + /* fnode: root of allocation b+ tree, and EA's */ /* Every file and every directory has one fnode, pointed to by the directory @@ -436,62 +432,56 @@ struct bplus_header #define FNODE_MAGIC 0xf7e40aae +enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)}; struct fnode { - u32 magic; /* f7e4 0aae */ - u32 zero1[2]; /* read history */ + __le32 magic; /* f7e4 0aae */ + __le32 zero1[2]; /* read history */ u8 len, name[15]; /* true length, truncated name */ - fnode_secno up; /* pointer to file's directory fnode */ - secno acl_size_l; - secno acl_secno; - u16 acl_size_s; + __le32 up; /* pointer to file's directory fnode */ + __le32 acl_size_l; + __le32 acl_secno; + __le16 acl_size_s; u8 acl_anode; u8 zero2; /* history bit count */ - u32 ea_size_l; /* length of disk-resident ea's */ - secno ea_secno; /* first sector of disk-resident ea's*/ - u16 ea_size_s; /* length of fnode-resident ea's */ - -#ifdef __LITTLE_ENDIAN - u8 flag0: 1; - u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ - u8 flag234567: 6; -#else - u8 flag234567: 6; - u8 ea_anode: 1; /* 1 -> ea_secno is an anode */ - u8 flag0: 1; -#endif + __le32 ea_size_l; /* length of disk-resident ea's */ + __le32 ea_secno; /* first sector of disk-resident ea's*/ + __le16 ea_size_s; /* length of fnode-resident ea's */ -#ifdef __LITTLE_ENDIAN - u8 dirflag: 1; /* 1 -> directory. first & only extent - points to dnode. */ - u8 flag9012345: 7; -#else - u8 flag9012345: 7; - u8 dirflag: 1; /* 1 -> directory. first & only extent + __le16 flags; /* bit 1 set -> ea_secno is an anode */ + /* bit 8 set -> directory. first & only extent points to dnode. */ -#endif - struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ union { struct bplus_leaf_node external[8]; struct bplus_internal_node internal[12]; } u; - u32 file_size; /* file length, bytes */ - u32 n_needea; /* number of EA's with NEEDEA set */ + __le32 file_size; /* file length, bytes */ + __le32 n_needea; /* number of EA's with NEEDEA set */ u8 user_id[16]; /* unused */ - u16 ea_offs; /* offset from start of fnode + __le16 ea_offs; /* offset from start of fnode to first fnode-resident ea */ u8 dasd_limit_treshhold; u8 dasd_limit_delta; - u32 dasd_limit; - u32 dasd_usage; + __le32 dasd_limit; + __le32 dasd_usage; u8 ea[316]; /* zero or more EA's, packed together with no alignment padding. (Do not use this name, get here via fnode + ea_offs. I think.) */ }; +static inline bool fnode_in_anode(struct fnode *p) +{ + return (p->flags & FNODE_anode) != 0; +} + +static inline bool fnode_is_dir(struct fnode *p) +{ + return (p->flags & FNODE_dir) != 0; +} + /* anode: 99.44% pure allocation tree */ @@ -499,9 +489,9 @@ struct fnode struct anode { - u32 magic; /* 37e4 0aae */ - anode_secno self; /* pointer to this anode */ - secno up; /* parent anode or fnode */ + __le32 magic; /* 37e4 0aae */ + __le32 self; /* pointer to this anode */ + __le32 up; /* parent anode or fnode */ struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */ union { @@ -509,7 +499,7 @@ struct anode struct bplus_internal_node internal[60]; } u; - u32 fill[3]; /* unused */ + __le32 fill[3]; /* unused */ }; @@ -528,32 +518,23 @@ struct anode run, or in multiple runs. Flags in the fnode tell whether the EA list is immediate, in a single run, or in multiple runs. */ +enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 }; struct extended_attribute { -#ifdef __LITTLE_ENDIAN - u8 indirect: 1; /* 1 -> value gives sector number + u8 flags; /* bit 0 set -> value gives sector number where real value starts */ - u8 anode: 1; /* 1 -> sector is an anode + /* bit 1 set -> sector is an anode that points to fragmented value */ - u8 flag23456: 5; - u8 needea: 1; /* required ea */ -#else - u8 needea: 1; /* required ea */ - u8 flag23456: 5; - u8 anode: 1; /* 1 -> sector is an anode - that points to fragmented value */ - u8 indirect: 1; /* 1 -> value gives sector number - where real value starts */ -#endif + /* bit 7 set -> required ea */ u8 namelen; /* length of name, bytes */ u8 valuelen_lo; /* length of value, bytes */ u8 valuelen_hi; /* length of value, bytes */ - u8 name[0]; + u8 name[]; /* u8 name[namelen]; ascii attrib name u8 nul; terminating '\0', not counted u8 value[valuelen]; value, arbitrary - if this.indirect, valuelen is 8 and the value is + if this.flags & 1, valuelen is 8 and the value is u32 length; real length of value, bytes secno secno; sector address where it starts if this.anode, the above sector number is the root of an anode tree @@ -561,6 +542,16 @@ struct extended_attribute */ }; +static inline bool ea_indirect(struct extended_attribute *ea) +{ + return ea->flags & EA_indirect; +} + +static inline bool ea_in_anode(struct extended_attribute *ea) +{ + return ea->flags & EA_anode; +} + /* Local Variables: comment-column: 40 diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 6d2d500..c07ef1f 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -75,7 +75,7 @@ struct hpfs_sb_info { unsigned char *sb_cp_table; /* code page tables: */ /* 128 bytes uppercasing table & */ /* 128 bytes lowercasing table */ - unsigned *sb_bmp_dir; /* main bitmap directory */ + __le32 *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ int sb_timeshift; @@ -93,7 +93,7 @@ struct quad_buffer_head { static inline dnode_secno de_down_pointer (struct hpfs_dirent *de) { CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n")); - return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4)); + return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4)); } /* The first dir entry in a dnode */ @@ -141,12 +141,12 @@ static inline struct extended_attribute *next_ea(struct extended_attribute *ea) static inline secno ea_sec(struct extended_attribute *ea) { - return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen))); + return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen))); } static inline secno ea_len(struct extended_attribute *ea) { - return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen))); + return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen))); } static inline char *ea_data(struct extended_attribute *ea) @@ -171,7 +171,7 @@ static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src) dst->not_8x3 = n; } -static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n) +static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n) { int i; if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n; @@ -268,10 +268,10 @@ void hpfs_evict_inode(struct inode *); /* map.c */ -unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); -unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); +__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); +__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); unsigned char *hpfs_load_code_page(struct super_block *, secno); -secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp); +__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **); struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index b43066c..ed671e0 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -110,7 +110,7 @@ void hpfs_read_inode(struct inode *i) } } } - if (fnode->dirflag) { + if (fnode_is_dir(fnode)) { int n_dnodes, n_subdirs; i->i_mode |= S_IFDIR; i->i_op = &hpfs_dir_iops; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index a790821..4acb19d 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -8,12 +8,12 @@ #include "hpfs_fn.h" -unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) +__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) { return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); } -unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, +__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, struct quad_buffer_head *qbh, char *id) { secno sec; @@ -89,18 +89,18 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) return cp_table; } -secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) +__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) { struct buffer_head *bh; int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; int i; - secno *b; + __le32 *b; if (!(b = kmalloc(n * 512, GFP_KERNEL))) { printk("HPFS: can't allocate memory for bitmap directory\n"); return NULL; } for (i=0;i<n;i++) { - secno *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1); + __le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1); if (!d) { kfree(b); return NULL; @@ -130,16 +130,16 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea (unsigned long)ino); goto bail; } - if (!fnode->dirflag) { + if (!fnode_is_dir(fnode)) { if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != - (fnode->btree.internal ? 12 : 8)) { + (bp_internal(&fnode->btree) ? 12 : 8)) { hpfs_error(s, "bad number of nodes in fnode %08lx", (unsigned long)ino); goto bail; } if (le16_to_cpu(fnode->btree.first_free) != - 8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) { + 8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in fnode %08lx", (unsigned long)ino); @@ -187,12 +187,12 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff goto bail; } if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != - (anode->btree.internal ? 60 : 40)) { + (bp_internal(&anode->btree) ? 60 : 40)) { hpfs_error(s, "bad number of nodes in anode %08x", ano); goto bail; } if (le16_to_cpu(anode->btree.first_free) != - 8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) { + 8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in anode %08x", ano); goto bail; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 30dd7b1..9083ef8 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -70,7 +70,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fnode->len = len; memcpy(fnode->name, name, len > 15 ? 15 : len); fnode->up = cpu_to_le32(dir->i_ino); - fnode->dirflag = 1; + fnode->flags |= FNODE_dir; fnode->btree.n_free_nodes = 7; fnode->btree.n_used_nodes = 1; fnode->btree.first_free = cpu_to_le16(0x14); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 54f6ecc..706a12c 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -572,7 +572,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mark_buffer_dirty(bh2); } - if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) { + if (spareblock->hotfixes_used || spareblock->n_spares_used) { if (errs >= 2) { printk("HPFS: Hotfixes not supported here, try chkdsk\n"); mark_dirty(s, 0); @@ -645,7 +645,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) root->i_mtime.tv_nsec = 0; root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); root->i_ctime.tv_nsec = 0; - hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size); + hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) root->i_size = 2048; @@ -1487,10 +1487,30 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + if (inode->i_op->update_time) + return inode->i_op->update_time(inode, time, flags); + + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_VERSION) + inode_inc_iversion(inode); + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; + mark_inode_dirty_sync(inode); + return 0; +} + /** * touch_atime - update the access time - * @mnt: mount the inode is accessed on - * @dentry: dentry accessed + * @path: the &struct path to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, @@ -1525,12 +1545,83 @@ void touch_atime(struct path *path) if (mnt_want_write(mnt)) return; - inode->i_atime = now; - mark_inode_dirty_sync(inode); + /* + * File systems can error out when updating inodes if they need to + * allocate new space to modify an inode (such is the case for + * Btrfs), but since we touch atime while walking down the path we + * really don't care if we failed to update the atime of the file, + * so just ignore the return value. + */ + update_time(inode, &now, S_ATIME); mnt_drop_write(mnt); } EXPORT_SYMBOL(touch_atime); +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int should_remove_suid(struct dentry *dentry) +{ + umode_t mode = dentry->d_inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +static int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + return notify_change(dentry, &newattrs); +} + +int file_remove_suid(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + int killsuid; + int killpriv; + int error = 0; + + /* Fast path for nothing security related */ + if (IS_NOSEC(inode)) + return 0; + + killsuid = should_remove_suid(dentry); + killpriv = security_inode_need_killpriv(dentry); + + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); + if (!error && (inode->i_sb->s_flags & MS_NOSEC)) + inode->i_flags |= S_NOSEC; + + return error; +} +EXPORT_SYMBOL(file_remove_suid); + /** * file_update_time - update mtime and ctime time * @file: file accessed @@ -1540,18 +1631,20 @@ EXPORT_SYMBOL(touch_atime); * usage in the file write path of filesystems, and filesystems may * choose to explicitly ignore update via this function with the * S_NOCMTIME inode flag, e.g. for network filesystem where these - * timestamps are handled by the server. + * timestamps are handled by the server. This can return an error for + * file systems who need to allocate space in order to update an inode. */ -void file_update_time(struct file *file) +int file_update_time(struct file *file) { struct inode *inode = file->f_path.dentry->d_inode; struct timespec now; - enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + int sync_it = 0; + int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) - return; + return 0; now = current_fs_time(inode->i_sb); if (!timespec_equal(&inode->i_mtime, &now)) @@ -1564,21 +1657,16 @@ void file_update_time(struct file *file) sync_it |= S_VERSION; if (!sync_it) - return; + return 0; /* Finally allowed to write? Takes lock. */ if (mnt_want_write_file(file)) - return; + return 0; - /* Only change inode inside the lock region */ - if (sync_it & S_VERSION) - inode_inc_iversion(inode); - if (sync_it & S_CTIME) - inode->i_ctime = now; - if (sync_it & S_MTIME) - inode->i_mtime = now; - mark_inode_dirty_sync(inode); + ret = update_time(inode, &now, sync_it); mnt_drop_write_file(file); + + return ret; } EXPORT_SYMBOL(file_update_time); diff --git a/fs/internal.h b/fs/internal.h index 9962c59..18bc216 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -DECLARE_BRLOCK(vfsmount_lock); +extern struct lglock vfsmount_lock; /* @@ -100,6 +100,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); +extern int open_check_o_direct(struct file *f); /* * inode.c diff --git a/fs/isofs/export.c b/fs/isofs/export.c index dd4687f..aa4356d 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -107,12 +107,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) } static int -isofs_export_encode_fh(struct dentry *dentry, +isofs_export_encode_fh(struct inode *inode, __u32 *fh32, int *max_len, - int connectable) + struct inode *parent) { - struct inode * inode = dentry->d_inode; struct iso_inode_info * ei = ISOFS_I(inode); int len = *max_len; int type = 1; @@ -124,7 +123,7 @@ isofs_export_encode_fh(struct dentry *dentry, * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ - if (connectable && (len < 5)) { + if (parent && (len < 5)) { *max_len = 5; return 255; } else if (len < 3) { @@ -136,16 +135,12 @@ isofs_export_encode_fh(struct dentry *dentry, fh32[0] = ei->i_iget5_block; fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ fh32[2] = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; + if (parent) { struct iso_inode_info *eparent; - spin_lock(&dentry->d_lock); - parent = dentry->d_parent->d_inode; eparent = ISOFS_I(parent); fh32[3] = eparent->i_iget5_block; fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ fh32[4] = parent->i_generation; - spin_unlock(&dentry->d_lock); len = 5; type = 2; } diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index f32f346..69a48c2 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -1,6 +1,8 @@ config JBD2 tristate select CRC32 + select CRYPTO + select CRYPTO_CRC32C help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 840f70f..216f429 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -85,6 +85,24 @@ nope: __brelse(bh); } +static void jbd2_commit_block_csum_set(journal_t *j, + struct journal_head *descriptor) +{ + struct commit_header *h; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + h = (struct commit_header *)(jh2bh(descriptor)->b_data); + h->h_chksum_type = 0; + h->h_chksum_size = 0; + h->h_chksum[0] = 0; + csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, + j->j_blocksize); + h->h_chksum[0] = cpu_to_be32(csum); +} + /* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort @@ -128,6 +146,7 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } + jbd2_commit_block_csum_set(journal, descriptor); JBUFFER_TRACE(descriptor, "submit commit block"); lock_buffer(bh); @@ -301,6 +320,44 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } +static void jbd2_descr_block_csum_set(journal_t *j, + struct journal_head *descriptor) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + tail = (struct jbd2_journal_block_tail *) + (jh2bh(descriptor)->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, + j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + +static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, + struct buffer_head *bh, __u32 sequence) +{ + struct page *page = bh->b_page; + __u8 *addr; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + sequence = cpu_to_be32(sequence); + addr = kmap_atomic(page, KM_USER0); + csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), + bh->b_size); + kunmap_atomic(addr, KM_USER0); + + tag->t_checksum = cpu_to_be32(csum); +} /* * jbd2_journal_commit_transaction * @@ -334,6 +391,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) unsigned long first_block; tid_t first_tid; int update_tail; + int csum_size = 0; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + csum_size = sizeof(struct jbd2_journal_block_tail); /* * First job: lock down the current transaction and wait for @@ -627,7 +688,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag = (journal_block_tag_t *) tagp; write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); - tag->t_flags = cpu_to_be32(tag_flag); + tag->t_flags = cpu_to_be16(tag_flag); + jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), + commit_transaction->t_tid); tagp += tag_bytes; space_left -= tag_bytes; @@ -643,7 +706,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || - space_left < tag_bytes + 16) { + space_left < tag_bytes + 16 + csum_size) { jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); @@ -651,8 +714,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) submitting the IOs. "tag" still points to the last tag we set up. */ - tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); + tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); + jbd2_descr_block_csum_set(journal, descriptor); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1afb701..e9a3c4c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -97,6 +97,43 @@ EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); +/* Checksumming functions */ +int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +{ + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; +} + +static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +{ + __u32 csum, old_csum; + + old_csum = sb->s_checksum; + sb->s_checksum = 0; + csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + sb->s_checksum = old_csum; + + return cpu_to_be32(csum); +} + +int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +{ + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + return sb->s_checksum == jbd2_superblock_csum(j, sb); +} + +void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +{ + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + sb->s_checksum = jbd2_superblock_csum(j, sb); +} + /* * Helper function used to manage commit timeouts */ @@ -1348,6 +1385,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal) jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", journal->j_errno); sb->s_errno = cpu_to_be32(journal->j_errno); + jbd2_superblock_csum_set(journal, sb); read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, WRITE_SYNC); @@ -1376,6 +1414,9 @@ static int journal_get_superblock(journal_t *journal) } } + if (buffer_verified(bh)) + return 0; + sb = journal->j_superblock; err = -EINVAL; @@ -1413,6 +1454,43 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " + "at the same time!\n"); + goto out; + } + + if (!jbd2_verify_csum_type(journal, sb)) { + printk(KERN_ERR "JBD: Unknown checksum type\n"); + goto out; + } + + /* Load the checksum driver */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + goto out; + } + } + + /* Check superblock checksum */ + if (!jbd2_superblock_csum_verify(journal, sb)) { + printk(KERN_ERR "JBD: journal checksum error\n"); + goto out; + } + + /* Precompute checksum seed for all metadata */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + + set_buffer_verified(bh); + return 0; out: @@ -1564,6 +1642,8 @@ int jbd2_journal_destroy(journal_t *journal) iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); kfree(journal); @@ -1653,6 +1733,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com int jbd2_journal_set_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { +#define INCOMPAT_FEATURE_ON(f) \ + ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f))) +#define COMPAT_FEATURE_ON(f) \ + ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f))) journal_superblock_t *sb; if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) @@ -1661,16 +1745,54 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) return 0; + /* Asking for checksumming v2 and v1? Only give them v2. */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && + compat & JBD2_FEATURE_COMPAT_CHECKSUM) + compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; + /* If enabling v2 checksums, update superblock */ + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + sb->s_checksum_type = JBD2_CRC32C_CHKSUM; + sb->s_feature_compat &= + ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + + /* Load the checksum driver */ + if (journal->j_chksum_driver == NULL) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", + 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD: Cannot load crc32c " + "driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + } + + /* Precompute checksum seed for all metadata */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_CSUM_V2)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, + sb->s_uuid, + sizeof(sb->s_uuid)); + } + + /* If enabling v1 checksums, downgrade superblock */ + if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) + sb->s_feature_incompat &= + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); + sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); return 1; +#undef COMPAT_FEATURE_ON +#undef INCOMPAT_FEATURE_ON } /* @@ -1975,10 +2097,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode) */ size_t journal_tag_bytes(journal_t *journal) { + journal_block_tag_t tag; + size_t x = 0; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + x += sizeof(tag.t_checksum); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) - return JBD2_TAG_SIZE64; + return x + JBD2_TAG_SIZE64; else - return JBD2_TAG_SIZE32; + return x + JBD2_TAG_SIZE32; } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index c1a0335..0131e43 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -174,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return 0; } +static int jbd2_descr_block_csum_verify(journal_t *j, + void *buf) +{ + struct jbd2_journal_block_tail *tail; + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + provided = tail->t_checksum; + tail->t_checksum = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + tail->t_checksum = provided; + + provided = be32_to_cpu(provided); + return provided == calculated; +} /* * Count the number of in-use tags in a journal descriptor block. @@ -186,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + size -= sizeof(struct jbd2_journal_block_tail); + tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) <= size) { @@ -193,10 +215,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) nr++; tagp += tag_bytes; - if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID))) + if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) tagp += 16; - if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG)) + if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) break; } @@ -353,6 +375,41 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, return 0; } +static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) +{ + struct commit_header *h; + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + h = buf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + h->h_chksum[0] = provided; + + provided = be32_to_cpu(provided); + return provided == calculated; +} + +static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, + void *buf, __u32 sequence) +{ + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + sequence = cpu_to_be32(sequence); + calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); + provided = be32_to_cpu(tag->t_checksum); + + return provided == cpu_to_be32(calculated); +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -366,6 +423,7 @@ static int do_one_pass(journal_t *journal, int blocktype; int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ + int descr_csum_size = 0; /* * First thing is to establish what we expect to find in the log @@ -451,6 +509,18 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: + /* Verify checksum first */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_CSUM_V2)) + descr_csum_size = + sizeof(struct jbd2_journal_block_tail); + if (descr_csum_size > 0 && + !jbd2_descr_block_csum_verify(journal, + bh->b_data)) { + err = -EIO; + goto failed; + } + /* If it is a valid descriptor block, replay it * in pass REPLAY; if journal_checksums enabled, then * calculate checksums in PASS_SCAN, otherwise, @@ -481,11 +551,11 @@ static int do_one_pass(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize) { + <= journal->j_blocksize - descr_csum_size) { unsigned long io_block; tag = (journal_block_tag_t *) tagp; - flags = be32_to_cpu(tag->t_flags); + flags = be16_to_cpu(tag->t_flags); io_block = next_log_block++; wrap(journal, next_log_block); @@ -516,6 +586,19 @@ static int do_one_pass(journal_t *journal, goto skip_write; } + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify( + journal, tag, obh->b_data, + be32_to_cpu(tmp->h_sequence))) { + brelse(obh); + success = -EIO; + printk(KERN_ERR "JBD: Invalid " + "checksum recovering " + "block %llu in log\n", + blocknr); + continue; + } + /* Find a buffer for the new * data being restored */ nbh = __getblk(journal->j_fs_dev, @@ -650,6 +733,19 @@ static int do_one_pass(journal_t *journal, } crc32_sum = ~0; } + if (pass == PASS_SCAN && + !jbd2_commit_block_csum_verify(journal, + bh->b_data)) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + journal->j_failed_commit = + next_commit_ID; + brelse(bh); + break; + } + } brelse(bh); next_commit_ID++; continue; @@ -706,6 +802,25 @@ static int do_one_pass(journal_t *journal, return err; } +static int jbd2_revoke_block_csum_verify(journal_t *j, + void *buf) +{ + struct jbd2_journal_revoke_tail *tail; + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - + sizeof(struct jbd2_journal_revoke_tail)); + provided = tail->r_checksum; + tail->r_checksum = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + tail->r_checksum = provided; + + provided = be32_to_cpu(provided); + return provided == calculated; +} /* Scan a revoke record, marking all blocks mentioned as revoked. */ @@ -720,6 +835,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, offset = sizeof(jbd2_journal_revoke_header_t); max = be32_to_cpu(header->r_count); + if (!jbd2_revoke_block_csum_verify(journal, header)) + return -EINVAL; + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) record_len = 8; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 6973705..f30b80b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -578,6 +578,7 @@ static void write_one_revoke_record(journal_t *journal, struct jbd2_revoke_record_s *record, int write_op) { + int csum_size = 0; struct journal_head *descriptor; int offset; journal_header_t *header; @@ -592,9 +593,13 @@ static void write_one_revoke_record(journal_t *journal, descriptor = *descriptorp; offset = *offsetp; + /* Do we need to leave space at the end for a checksum? */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + /* Make sure we have a descriptor with space left for the record */ if (descriptor) { - if (offset == journal->j_blocksize) { + if (offset >= journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } @@ -631,6 +636,24 @@ static void write_one_revoke_record(journal_t *journal, *offsetp = offset; } +static void jbd2_revoke_csum_set(journal_t *j, + struct journal_head *descriptor) +{ + struct jbd2_journal_revoke_tail *tail; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + tail = (struct jbd2_journal_revoke_tail *) + (jh2bh(descriptor)->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_revoke_tail)); + tail->r_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, + j->j_blocksize); + tail->r_checksum = cpu_to_be32(csum); +} + /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to @@ -652,6 +675,8 @@ static void flush_descriptor(journal_t *journal, header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; header->r_count = cpu_to_be32(offset); + jbd2_revoke_csum_set(journal, descriptor); + set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ddcd354..fb1ab953 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -162,8 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kmem_cache_alloc(transaction_cache, - gfp_mask | __GFP_ZERO); + new_transaction = kmem_cache_zalloc(transaction_cache, + gfp_mask); if (!new_transaction) { /* * If __GFP_FS is not present, then we may be diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 55a0c1d..44dca1f 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -126,6 +126,10 @@ struct jffs2_sb_info { struct jffs2_inodirty *wbuf_inodes; struct rw_semaphore wbuf_sem; /* Protects the write buffer */ + struct delayed_work wbuf_dwork; /* write-buffer write-out work */ + int wbuf_queued; /* non-zero delayed work is queued */ + spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */ + unsigned char *oobbuf; int oobavail; /* How many bytes are available for JFFS2 in OOB */ #endif diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 1cd3aec..bcd983d 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -95,6 +95,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) #define jffs2_ubivol(c) (0) #define jffs2_ubivol_setup(c) (0) #define jffs2_ubivol_cleanup(c) do {} while (0) +#define jffs2_dirty_trigger(c) do {} while (0) #else /* NAND and/or ECC'd NOR support present */ @@ -135,14 +136,10 @@ void jffs2_ubivol_cleanup(struct jffs2_sb_info *c); #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE)) int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c); void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c); +void jffs2_dirty_trigger(struct jffs2_sb_info *c); #endif /* WRITEBUFFER */ -static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c) -{ - OFNI_BS_2SFFJ(c)->s_dirt = 1; -} - /* background.c */ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c); void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index f9916f3..bc586f2 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -63,21 +63,6 @@ static void jffs2_i_init_once(void *foo) inode_init_once(&f->vfs_inode); } -static void jffs2_write_super(struct super_block *sb) -{ - struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - - lock_super(sb); - sb->s_dirt = 0; - - if (!(sb->s_flags & MS_RDONLY)) { - jffs2_dbg(1, "%s()\n", __func__); - jffs2_flush_wbuf_gc(c, 0); - } - - unlock_super(sb); -} - static const char *jffs2_compr_name(unsigned int compr) { switch (compr) { @@ -113,8 +98,6 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - jffs2_write_super(sb); - mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); mutex_unlock(&c->alloc_sem); @@ -251,7 +234,6 @@ static const struct super_operations jffs2_super_operations = .alloc_inode = jffs2_alloc_inode, .destroy_inode =jffs2_destroy_inode, .put_super = jffs2_put_super, - .write_super = jffs2_write_super, .statfs = jffs2_statfs, .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, @@ -319,9 +301,6 @@ static void jffs2_put_super (struct super_block *sb) jffs2_dbg(2, "%s()\n", __func__); - if (sb->s_dirt) - jffs2_write_super(sb); - mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); mutex_unlock(&c->alloc_sem); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 74d9be1..6f4529d 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -20,6 +20,7 @@ #include <linux/mtd/nand.h> #include <linux/jiffies.h> #include <linux/sched.h> +#include <linux/writeback.h> #include "nodelist.h" @@ -85,7 +86,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) { struct jffs2_inodirty *new; - /* Mark the superblock dirty so that kupdated will flush... */ + /* Schedule delayed write-buffer write-out */ jffs2_dirty_trigger(c); if (jffs2_wbuf_pending_for_ino(c, ino)) @@ -1148,6 +1149,47 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock * return 1; } +static struct jffs2_sb_info *work_to_sb(struct work_struct *work) +{ + struct delayed_work *dwork; + + dwork = container_of(work, struct delayed_work, work); + return container_of(dwork, struct jffs2_sb_info, wbuf_dwork); +} + +static void delayed_wbuf_sync(struct work_struct *work) +{ + struct jffs2_sb_info *c = work_to_sb(work); + struct super_block *sb = OFNI_BS_2SFFJ(c); + + spin_lock(&c->wbuf_dwork_lock); + c->wbuf_queued = 0; + spin_unlock(&c->wbuf_dwork_lock); + + if (!(sb->s_flags & MS_RDONLY)) { + jffs2_dbg(1, "%s()\n", __func__); + jffs2_flush_wbuf_gc(c, 0); + } +} + +void jffs2_dirty_trigger(struct jffs2_sb_info *c) +{ + struct super_block *sb = OFNI_BS_2SFFJ(c); + unsigned long delay; + + if (sb->s_flags & MS_RDONLY) + return; + + spin_lock(&c->wbuf_dwork_lock); + if (!c->wbuf_queued) { + jffs2_dbg(1, "%s()\n", __func__); + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay); + c->wbuf_queued = 1; + } + spin_unlock(&c->wbuf_dwork_lock); +} + int jffs2_nand_flash_setup(struct jffs2_sb_info *c) { struct nand_ecclayout *oinfo = c->mtd->ecclayout; @@ -1169,6 +1211,8 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c) /* Initialise write buffer */ init_rwsem(&c->wbuf_sem); + spin_lock_init(&c->wbuf_dwork_lock); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->writesize; c->wbuf_ofs = 0xFFFFFFFF; @@ -1207,8 +1251,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { /* Initialize write buffer */ init_rwsem(&c->wbuf_sem); - - + spin_lock_init(&c->wbuf_dwork_lock); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->erasesize; /* Find a suitable c->sector_size @@ -1267,6 +1311,9 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { /* Initialize write buffer */ init_rwsem(&c->wbuf_sem); + spin_lock_init(&c->wbuf_dwork_lock); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); + c->wbuf_pagesize = c->mtd->writesize; c->wbuf_ofs = 0xFFFFFFFF; @@ -1299,6 +1346,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) { return 0; init_rwsem(&c->wbuf_sem); + spin_lock_init(&c->wbuf_dwork_lock); + INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->writesize; c->wbuf_ofs = 0xFFFFFFFF; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1ead075..80938fd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -251,39 +251,40 @@ out_err: return err; } -static int lockd_up_net(struct net *net) +static int lockd_up_net(struct svc_serv *serv, struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); - struct svc_serv *serv = nlmsvc_rqst->rq_server; int error; - if (ln->nlmsvc_users) + if (ln->nlmsvc_users++) return 0; - error = svc_rpcb_setup(serv, net); + error = svc_bind(serv, net); if (error) - goto err_rpcb; + goto err_bind; error = make_socks(serv, net); if (error < 0) goto err_socks; + dprintk("lockd_up_net: per-net data created; net=%p\n", net); return 0; err_socks: svc_rpcb_cleanup(serv, net); -err_rpcb: +err_bind: + ln->nlmsvc_users--; return error; } -static void lockd_down_net(struct net *net) +static void lockd_down_net(struct svc_serv *serv, struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); - struct svc_serv *serv = nlmsvc_rqst->rq_server; if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); svc_shutdown_net(serv, net); + dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); } } else { printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", @@ -292,21 +293,60 @@ static void lockd_down_net(struct net *net) } } -/* - * Bring up the lockd process if it's not already up. - */ -int lockd_up(struct net *net) +static int lockd_start_svc(struct svc_serv *serv) +{ + int error; + + if (nlmsvc_rqst) + return 0; + + /* + * Create the kernel thread and wait for it to start. + */ + nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); + if (IS_ERR(nlmsvc_rqst)) { + error = PTR_ERR(nlmsvc_rqst); + printk(KERN_WARNING + "lockd_up: svc_rqst allocation failed, error=%d\n", + error); + goto out_rqst; + } + + svc_sock_update_bufs(serv); + serv->sv_maxconn = nlm_max_connections; + + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + if (IS_ERR(nlmsvc_task)) { + error = PTR_ERR(nlmsvc_task); + printk(KERN_WARNING + "lockd_up: kthread_run failed, error=%d\n", error); + goto out_task; + } + dprintk("lockd_up: service started\n"); + return 0; + +out_task: + svc_exit_thread(nlmsvc_rqst); + nlmsvc_task = NULL; +out_rqst: + nlmsvc_rqst = NULL; + return error; +} + +static struct svc_serv *lockd_create_svc(void) { struct svc_serv *serv; - int error = 0; - mutex_lock(&nlmsvc_mutex); /* * Check whether we're already up and running. */ if (nlmsvc_rqst) { - error = lockd_up_net(net); - goto out; + /* + * Note: increase service usage, because later in case of error + * svc_destroy() will be called. + */ + svc_get(nlmsvc_rqst->rq_server); + return nlmsvc_rqst->rq_server; } /* @@ -317,59 +357,53 @@ int lockd_up(struct net *net) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); - error = -ENOMEM; serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); - goto out; + return ERR_PTR(-ENOMEM); } + dprintk("lockd_up: service created\n"); + return serv; +} - error = make_socks(serv, net); - if (error < 0) - goto destroy_and_out; +/* + * Bring up the lockd process if it's not already up. + */ +int lockd_up(struct net *net) +{ + struct svc_serv *serv; + int error; - /* - * Create the kernel thread and wait for it to start. - */ - nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - if (IS_ERR(nlmsvc_rqst)) { - error = PTR_ERR(nlmsvc_rqst); - nlmsvc_rqst = NULL; - printk(KERN_WARNING - "lockd_up: svc_rqst allocation failed, error=%d\n", - error); - goto destroy_and_out; + mutex_lock(&nlmsvc_mutex); + + serv = lockd_create_svc(); + if (IS_ERR(serv)) { + error = PTR_ERR(serv); + goto err_create; } - svc_sock_update_bufs(serv); - serv->sv_maxconn = nlm_max_connections; + error = lockd_up_net(serv, net); + if (error < 0) + goto err_net; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); - if (IS_ERR(nlmsvc_task)) { - error = PTR_ERR(nlmsvc_task); - svc_exit_thread(nlmsvc_rqst); - nlmsvc_task = NULL; - nlmsvc_rqst = NULL; - printk(KERN_WARNING - "lockd_up: kthread_run failed, error=%d\n", error); - goto destroy_and_out; - } + error = lockd_start_svc(serv); + if (error < 0) + goto err_start; + nlmsvc_users++; /* * Note: svc_serv structures have an initial use count of 1, * so we exit through here on both success and failure. */ -destroy_and_out: +err_net: svc_destroy(serv); -out: - if (!error) { - struct lockd_net *ln = net_generic(net, lockd_net_id); - - ln->nlmsvc_users++; - nlmsvc_users++; - } +err_create: mutex_unlock(&nlmsvc_mutex); return error; + +err_start: + lockd_down_net(serv, net); + goto err_net; } EXPORT_SYMBOL_GPL(lockd_up); @@ -380,11 +414,10 @@ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); + lockd_down_net(nlmsvc_rqst->rq_server, net); if (nlmsvc_users) { - if (--nlmsvc_users) { - lockd_down_net(net); + if (--nlmsvc_users) goto out; - } } else { printk(KERN_ERR "lockd_down: no users! task=%p\n", nlmsvc_task); @@ -396,7 +429,9 @@ lockd_down(struct net *net) BUG(); } kthread_stop(nlmsvc_task); + dprintk("lockd_down: service stopped\n"); svc_exit_thread(nlmsvc_rqst); + dprintk("lockd_down: service destroyed\n"); nlmsvc_task = NULL; nlmsvc_rqst = NULL; out: @@ -1636,12 +1636,13 @@ EXPORT_SYMBOL(flock_lock_file_wait); SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { struct file *filp; + int fput_needed; struct file_lock *lock; int can_sleep, unlock; int error; error = -EBADF; - filp = fget(fd); + filp = fget_light(fd, &fput_needed); if (!filp) goto out; @@ -1674,7 +1675,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) locks_free_lock(lock); out_putf: - fput(filp); + fput_light(filp, fput_needed); out: return error; } @@ -449,7 +449,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) mntget(nd->path.mnt); rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); nd->flags &= ~LOOKUP_RCU; return 0; @@ -507,14 +507,14 @@ static int complete_walk(struct nameidata *nd) if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { spin_unlock(&dentry->d_lock); rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return -ECHILD; } BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); mntget(nd->path.mnt); rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -681,15 +681,15 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); parent = mnt->mnt_parent; if (&parent->mnt == path->mnt) { - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); @@ -947,7 +947,7 @@ failed: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return -ECHILD; } @@ -1125,8 +1125,8 @@ static struct dentry *__lookup_hash(struct qstr *name, * small and for now I'd prefer to have fast path as straight as possible. * It _is_ time-critical. */ -static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path, struct inode **inode) +static int lookup_fast(struct nameidata *nd, struct qstr *name, + struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; @@ -1208,7 +1208,7 @@ unlazy: goto need_lookup; } } -done: + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1222,6 +1222,17 @@ done: return 0; need_lookup: + return 1; +} + +/* Fast lookup failed, do it the slow way */ +static int lookup_slow(struct nameidata *nd, struct qstr *name, + struct path *path) +{ + struct dentry *dentry, *parent; + int err; + + parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); mutex_lock(&parent->d_inode->i_mutex); @@ -1229,7 +1240,16 @@ need_lookup: mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); - goto done; + path->mnt = nd->path.mnt; + path->dentry = dentry; + err = follow_managed(path, nd->flags); + if (unlikely(err < 0)) { + path_put_conditional(path, nd); + return err; + } + if (err) + nd->flags |= LOOKUP_JUMPED; + return 0; } static inline int may_lookup(struct nameidata *nd) @@ -1265,7 +1285,7 @@ static void terminate_walk(struct nameidata *nd) if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); } } @@ -1301,21 +1321,26 @@ static inline int walk_component(struct nameidata *nd, struct path *path, */ if (unlikely(type != LAST_NORM)) return handle_dots(nd, type); - err = do_lookup(nd, name, path, &inode); + err = lookup_fast(nd, name, path, &inode); if (unlikely(err)) { - terminate_walk(nd); - return err; - } - if (!inode) { - path_to_nameidata(path, nd); - terminate_walk(nd); - return -ENOENT; + if (err < 0) + goto out_err; + + err = lookup_slow(nd, name, path); + if (err < 0) + goto out_err; + + inode = path->dentry->d_inode; } + err = -ENOENT; + if (!inode) + goto out_path_put; + if (should_follow_link(inode, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { - terminate_walk(nd); - return -ECHILD; + err = -ECHILD; + goto out_err; } } BUG_ON(inode != path->dentry->d_inode); @@ -1324,6 +1349,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = inode; return 0; + +out_path_put: + path_to_nameidata(path, nd); +out_err: + terminate_walk(nd); + return err; } /* @@ -1620,7 +1651,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } else { @@ -1633,7 +1664,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (*name=='/') { if (flags & LOOKUP_RCU) { - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); set_root_rcu(nd); } else { @@ -1646,7 +1677,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); do { @@ -1682,7 +1713,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (fput_needed) *fp = file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); rcu_read_lock(); } else { path_get(&file->f_path); @@ -2169,6 +2200,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, int want_write = 0; int acc_mode = op->acc_mode; struct file *filp; + struct inode *inode; + int symlink_ok = 0; + struct path save_parent = { .dentry = NULL, .mnt = NULL }; + bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; @@ -2200,30 +2235,23 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } if (!(open_flag & O_CREAT)) { - int symlink_ok = 0; if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) symlink_ok = 1; /* we _can_ be in RCU mode here */ - error = walk_component(nd, path, &nd->last, LAST_NORM, - !symlink_ok); - if (error < 0) - return ERR_PTR(error); - if (error) /* symlink */ - return NULL; - /* sayonara */ - error = complete_walk(nd); - if (error) - return ERR_PTR(error); + error = lookup_fast(nd, &nd->last, path, &inode); + if (unlikely(error)) { + if (error < 0) + goto exit; - error = -ENOTDIR; - if (nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) + error = lookup_slow(nd, &nd->last, path); + if (error < 0) goto exit; + + inode = path->dentry->d_inode; } - audit_inode(pathname, nd->path.dentry); - goto ok; + goto finish_lookup; } /* create side of things */ @@ -2241,6 +2269,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (nd->last.name[nd->last.len]) goto exit; +retry_lookup: mutex_lock(&dir->d_inode->i_mutex); dentry = lookup_hash(nd); @@ -2302,22 +2331,49 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error) nd->flags |= LOOKUP_JUMPED; + BUG_ON(nd->flags & LOOKUP_RCU); + inode = path->dentry->d_inode; +finish_lookup: + /* we _can_ be in RCU mode here */ error = -ENOENT; - if (!path->dentry->d_inode) - goto exit_dput; + if (!inode) { + path_to_nameidata(path, nd); + goto exit; + } - if (path->dentry->d_inode->i_op->follow_link) + if (should_follow_link(inode, !symlink_ok)) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, path->dentry))) { + error = -ECHILD; + goto exit; + } + } + BUG_ON(inode != path->dentry->d_inode); return NULL; + } - path_to_nameidata(path, nd); - nd->inode = path->dentry->d_inode; + if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { + path_to_nameidata(path, nd); + } else { + save_parent.dentry = nd->path.dentry; + save_parent.mnt = mntget(path->mnt); + nd->path.dentry = path->dentry; + + } + nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); - if (error) + if (error) { + path_put(&save_parent); return ERR_PTR(error); + } error = -EISDIR; - if (S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) + goto exit; + error = -ENOTDIR; + if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) goto exit; + audit_inode(pathname, nd->path.dentry); ok: if (!S_ISREG(nd->inode->i_mode)) will_truncate = 0; @@ -2333,6 +2389,20 @@ common: if (error) goto exit; filp = nameidata_to_filp(nd); + if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) { + BUG_ON(save_parent.dentry != dir); + path_put(&nd->path); + nd->path = save_parent; + nd->inode = dir->d_inode; + save_parent.mnt = NULL; + save_parent.dentry = NULL; + if (want_write) { + mnt_drop_write(nd->path.mnt); + want_write = 0; + } + retried = true; + goto retry_lookup; + } if (!IS_ERR(filp)) { error = ima_file_check(filp, op->acc_mode); if (error) { @@ -2352,7 +2422,8 @@ common: out: if (want_write) mnt_drop_write(nd->path.mnt); - path_put(&nd->path); + path_put(&save_parent); + terminate_walk(nd); return filp; exit_mutex_unlock: @@ -2415,6 +2486,12 @@ out: if (base) fput(base); release_open_intent(nd); + if (filp == ERR_PTR(-EOPENSTALE)) { + if (flags & LOOKUP_RCU) + filp = ERR_PTR(-ECHILD); + else + filp = ERR_PTR(-ESTALE); + } return filp; out_filp: diff --git a/fs/namespace.c b/fs/namespace.c index e608199..1e4a5fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -397,7 +397,7 @@ static int mnt_make_readonly(struct mount *mnt) { int ret = 0; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -431,15 +431,15 @@ static int mnt_make_readonly(struct mount *mnt) */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); return ret; } static void __mnt_unmake_readonly(struct mount *mnt) { - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); mnt->mnt.mnt_flags &= ~MNT_READONLY; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } int sb_prepare_remount_readonly(struct super_block *sb) @@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; @@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); return err; } @@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path *path) { struct mount *child_mnt; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry, 1); if (child_mnt) { mnt_add_count(child_mnt, 1); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return &child_mnt->mnt; } else { - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return NULL; } } @@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void mnt->mnt.mnt_sb = root->d_sb; mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); return &mnt->mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); @@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); if (flag & CL_SLAVE) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); @@ -803,35 +803,36 @@ static void mntput_no_expire(struct mount *mnt) { put_again: #ifdef CONFIG_SMP - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); if (likely(atomic_read(&mnt->mnt_longterm))) { mnt_add_count(mnt, -1); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return; } - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); mnt_add_count(mnt, -1); if (mnt_get_count(mnt)) { - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); return; } #else mnt_add_count(mnt, -1); if (likely(mnt_get_count(mnt))) return; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); #endif if (unlikely(mnt->mnt_pinned)) { mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); acct_auto_close_mnt(&mnt->mnt); goto put_again; } + list_del(&mnt->mnt_instance); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); mntfree(mnt); } @@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); real_mount(mnt)->mnt_pinned++; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } EXPORT_SYMBOL(mnt_pin); void mnt_unpin(struct vfsmount *m) { struct mount *mnt = real_mount(m); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); if (mnt->mnt_pinned) { mnt_add_count(mnt, 1); mnt->mnt_pinned--; } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } EXPORT_SYMBOL(mnt_unpin); @@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m) BUG_ON(!m); /* write lock needed for mnt_get_count */ - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); for (p = mnt; p; p = next_mnt(p, mnt)) { actual_refs += mnt_get_count(p); minimum_refs += 2; } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); if (actual_refs > minimum_refs) return 0; @@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_read(&namespace_sem); return ret; } @@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *head) struct dentry *dentry; struct mount *m; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); dentry = mnt->mnt_mountpoint; m = mnt->mnt_parent; mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; m->mnt_ghosts--; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); dput(dentry); mntput(&m->mnt); } @@ -1073,8 +1074,9 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill) list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); + if (p->mnt_ns) + __mnt_make_shortterm(p); p->mnt_ns = NULL; - __mnt_make_shortterm(p); list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { p->mnt_parent->mnt_ghosts++; @@ -1112,12 +1114,12 @@ static int do_umount(struct mount *mnt, int flags) * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); if (mnt_get_count(mnt) != 2) { - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); return -EBUSY; } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; @@ -1159,7 +1161,7 @@ static int do_umount(struct mount *mnt, int flags) } down_write(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); event++; if (!(flags & MNT_DETACH)) @@ -1171,7 +1173,7 @@ static int do_umount(struct mount *mnt, int flags) umount_tree(mnt, 1, &umount_list); retval = 0; } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); return retval; @@ -1286,19 +1288,19 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, q = clone_mnt(p, p->mnt.mnt_root, flag); if (!q) goto Enomem; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &path); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } } return res; Enomem: if (res) { LIST_HEAD(umount_list); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); umount_tree(res, 0, &umount_list); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); release_mounts(&umount_list); } return NULL; @@ -1318,9 +1320,9 @@ void drop_collected_mounts(struct vfsmount *mnt) { LIST_HEAD(umount_list); down_write(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); umount_tree(real_mount(mnt), 0, &umount_list); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); } @@ -1448,7 +1450,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out_cleanup_ids; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) @@ -1467,7 +1469,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, list_del_init(&child->mnt_hash); commit_tree(child); } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); return 0; @@ -1565,10 +1567,10 @@ static int do_change_type(struct path *path, int flag) goto out_unlock; } - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); out_unlock: up_write(&namespace_sem); @@ -1617,9 +1619,9 @@ static int do_loopback(struct path *path, char *old_name, err = graft_tree(mnt, path); if (err) { - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); umount_tree(mnt, 0, &umount_list); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } out2: unlock_mount(path); @@ -1677,16 +1679,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags, else err = do_remount_sb(sb, flags, data, 0); if (!err) { - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; mnt->mnt.mnt_flags = mnt_flags; - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } up_write(&sb->s_umount); if (!err) { - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); touch_mnt_namespace(mnt->mnt_ns); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); } return err; } @@ -1893,9 +1895,9 @@ fail: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { down_write(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_del_init(&mnt->mnt_expire); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); } mntput(m); @@ -1911,11 +1913,11 @@ fail: void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { down_write(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); } EXPORT_SYMBOL(mnt_set_expiry); @@ -1935,7 +1937,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) return; down_write(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -1954,7 +1956,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, 1, &umounts); } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umounts); @@ -2218,9 +2220,9 @@ void mnt_make_shortterm(struct vfsmount *m) struct mount *mnt = real_mount(m); if (atomic_add_unless(&mnt->mnt_longterm, -1, 1)) return; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); atomic_dec(&mnt->mnt_longterm); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); #endif } @@ -2250,9 +2252,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, return ERR_PTR(-ENOMEM); } new_ns->root = new; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new->mnt_list); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -2416,9 +2418,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, int path_is_under(struct path *path1, struct path *path2) { int res; - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return res; } EXPORT_SYMBOL(path_is_under); @@ -2505,7 +2507,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* make sure we can reach put_old from new_root */ if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) goto out4; - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); detach_mnt(new_mnt, &parent_path); detach_mnt(root_mnt, &root_parent); /* mount old root on put_old */ @@ -2513,7 +2515,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount new_root on / */ attach_mnt(new_mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); chroot_fs_refs(&root, &new); error = 0; out4: @@ -2576,7 +2578,7 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mount_hashtable[u]); - br_lock_init(vfsmount_lock); + br_lock_init(&vfsmount_lock); err = sysfs_init(); if (err) @@ -2596,9 +2598,9 @@ void put_mnt_ns(struct mnt_namespace *ns) if (!atomic_dec_and_test(&ns->count)) return; down_write(&namespace_sem); - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); umount_tree(ns->root, 0, &umount_list); - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); kfree(ns); diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 3ff5fcc..122e260 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -221,6 +221,10 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * already_written = 0; + errno = file_update_time(file); + if (errno) + goto outrel; + bouncebuffer = vmalloc(bufsize); if (!bouncebuffer) { errno = -EIO; /* -ENOMEM */ @@ -252,8 +256,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * } vfree(bouncebuffer); - file_update_time(file); - *ppos = pos; if (pos > i_size_read(inode)) { diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 4af803f..54cc0cd 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -23,17 +23,17 @@ struct ncp_mount_data_kernel { unsigned long flags; /* NCP_MOUNT_* flags */ unsigned int int_flags; /* internal flags */ #define NCP_IMOUNT_LOGGEDIN_POSSIBLE 0x0001 - __kernel_uid32_t mounted_uid; /* Who may umount() this filesystem? */ + uid_t mounted_uid; /* Who may umount() this filesystem? */ struct pid *wdog_pid; /* Who cares for our watchdog packets? */ unsigned int ncp_fd; /* The socket to the ncp port */ unsigned int time_out; /* How long should I wait after sending a NCP request? */ unsigned int retry_count; /* And how often should I retry? */ unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_mode_t file_mode; - __kernel_mode_t dir_mode; + uid_t uid; + gid_t gid; + umode_t file_mode; + umode_t dir_mode; int info_fd; }; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index eb95f50..970659d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/nsproxy.h> #include <net/inet_sock.h> @@ -253,6 +254,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) char svc_name[12]; int ret = 0; int minorversion_setup; + struct net *net = current->nsproxy->net_ns; mutex_lock(&nfs_callback_mutex); if (cb_info->users++ || cb_info->task != NULL) { @@ -265,6 +267,12 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) goto out_err; } + ret = svc_bind(serv, net); + if (ret < 0) { + printk(KERN_WARNING "NFS: bind callback service failed\n"); + goto out_err; + } + minorversion_setup = nfs_minorversion_callback_svc_setup(minorversion, serv, xprt, &rqstp, &callback_svc); if (!minorversion_setup) { @@ -306,6 +314,8 @@ out_err: dprintk("NFS: Couldn't create callback socket or server thread; " "err = %d\n", ret); cb_info->users--; + if (serv) + svc_shutdown_net(serv, net); goto out; } @@ -320,6 +330,7 @@ void nfs_callback_down(int minorversion) cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); + svc_shutdown_net(cb_info->serv, current->nsproxy->net_ns); svc_exit_thread(cb_info->rqst); cb_info->serv = NULL; cb_info->rqst = NULL; @@ -332,7 +343,7 @@ void nfs_callback_down(int minorversion) int check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) { - char *p = svc_gss_principal(rqstp); + char *p = rqstp->rq_cred.cr_principal; if (rqstp->rq_authop->flavour != RPC_AUTH_GSS) return 1; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0989a20..f430057 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1354,10 +1354,10 @@ out: } #ifdef CONFIG_NFS_V4 -static int nfs_open_revalidate(struct dentry *, struct nameidata *); +static int nfs4_lookup_revalidate(struct dentry *, struct nameidata *); const struct dentry_operations nfs4_dentry_operations = { - .d_revalidate = nfs_open_revalidate, + .d_revalidate = nfs4_lookup_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, @@ -1519,13 +1519,11 @@ no_open: return nfs_lookup(dir, dentry, nd); } -static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) +static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) { struct dentry *parent = NULL; struct inode *inode; struct inode *dir; - struct nfs_open_context *ctx; - struct iattr attr; int openflags, ret = 0; if (nd->flags & LOOKUP_RCU) @@ -1554,57 +1552,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) /* We cannot do exclusive creation on a positive dentry */ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open_dput; - /* We can't create new files here */ - openflags &= ~(O_CREAT|O_EXCL); - - ctx = create_nfs_open_context(dentry, openflags); - ret = PTR_ERR(ctx); - if (IS_ERR(ctx)) - goto out; - attr.ia_valid = ATTR_OPEN; - if (openflags & O_TRUNC) { - attr.ia_valid |= ATTR_SIZE; - attr.ia_size = 0; - nfs_wb_all(inode); - } - - /* - * Note: we're not holding inode->i_mutex and so may be racing with - * operations that change the directory. We therefore save the - * change attribute *before* we do the RPC call. - */ - inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - switch (ret) { - case -EPERM: - case -EACCES: - case -EDQUOT: - case -ENOSPC: - case -EROFS: - goto out_put_ctx; - default: - goto out_drop; - } - } - iput(inode); - if (inode != dentry->d_inode) - goto out_drop; + /* Let f_op->open() actually open (and revalidate) the file */ + ret = 1; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - ret = nfs_intent_set_file(nd, ctx); - if (ret >= 0) - ret = 1; out: dput(parent); return ret; -out_drop: - d_drop(dentry); - ret = 0; -out_put_ctx: - put_nfs_open_context(ctx); - goto out; no_open_dput: dput(parent); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 56311ca..a6708e6b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -879,12 +879,81 @@ const struct file_operations nfs_file_operations = { static int nfs4_file_open(struct inode *inode, struct file *filp) { + struct nfs_open_context *ctx; + struct dentry *dentry = filp->f_path.dentry; + struct dentry *parent = NULL; + struct inode *dir; + unsigned openflags = filp->f_flags; + struct iattr attr; + int err; + + BUG_ON(inode != dentry->d_inode); /* - * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to - * this point, then something is very wrong + * If no cached dentry exists or if it's negative, NFSv4 handled the + * opens in ->lookup() or ->create(). + * + * We only get this far for a cached positive dentry. We skipped + * revalidation, so handle it here by dropping the dentry and returning + * -EOPENSTALE. The VFS will retry the lookup/create/open. */ - dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); - return -ENOTDIR; + + dprintk("NFS: open file(%s/%s)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + + if ((openflags & O_ACCMODE) == 3) + openflags--; + + /* We can't create new files here */ + openflags &= ~(O_CREAT|O_EXCL); + + parent = dget_parent(dentry); + dir = parent->d_inode; + + ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + err = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out; + + attr.ia_valid = ATTR_OPEN; + if (openflags & O_TRUNC) { + attr.ia_valid |= ATTR_SIZE; + attr.ia_size = 0; + nfs_wb_all(inode); + } + + inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + switch (err) { + case -EPERM: + case -EACCES: + case -EDQUOT: + case -ENOSPC: + case -EROFS: + goto out_put_ctx; + default: + goto out_drop; + } + } + iput(inode); + if (inode != dentry->d_inode) + goto out_drop; + + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_file_set_open_context(filp, ctx); + err = 0; + +out_put_ctx: + put_nfs_open_context(ctx); +out: + dput(parent); + return err; + +out_drop: + d_drop(dentry); + err = -EOPENSTALE; + goto out_put_ctx; } const struct file_operations nfs4_file_operations = { diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 204438c..34a10d7 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -11,7 +11,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; for (f = exp->ex_flavors; f < end; f++) { - if (f->pseudoflavor == rqstp->rq_flavor) + if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) return f->flags; } return exp->ex_flags; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index dcb52b8..ba23349 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -706,7 +706,7 @@ static struct cache_head *svc_export_alloc(void) return NULL; } -struct cache_detail svc_export_cache_template = { +static struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", @@ -904,13 +904,13 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) return 0; /* ip-address based client; check sec= export option: */ for (f = exp->ex_flavors; f < end; f++) { - if (f->pseudoflavor == rqstp->rq_flavor) + if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) return 0; } /* defaults in absence of sec= options: */ if (exp->ex_nflavors == 0) { - if (rqstp->rq_flavor == RPC_AUTH_NULL || - rqstp->rq_flavor == RPC_AUTH_UNIX) + if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || + rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return 0; } return nfserr_wrongsec; diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 9559ce4..e6c3815 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -58,6 +58,7 @@ static int nfsd_inject_set(void *op_ptr, u64 val) static int nfsd_inject_get(void *data, u64 *val) { + *val = 0; return 0; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c8e9f63..a5fd6b98 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -650,9 +650,10 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c struct rpc_clnt *client; if (clp->cl_minorversion == 0) { - if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + if (!clp->cl_cred.cr_principal && + (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) return -EINVAL; - args.client_name = clp->cl_principal; + args.client_name = clp->cl_cred.cr_principal; args.prognumber = conn->cb_prog, args.protocol = XPRT_TRANSPORT_TCP; args.authflavor = clp->cl_flavor; diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 286a7f8..dae36f1 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -605,7 +605,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel static __be32 do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id) { - if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS) + if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) if (numeric_name_to_id(rqstp, type, name, namelen, id)) return 0; /* @@ -618,7 +618,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u static int do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) { - if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS) + if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) return sprintf(name, "%u", id); return idmap_id_to_name(rqstp, type, id, name); } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ed3f920..5ff0b7b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -570,7 +570,7 @@ static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; - struct cld_msg *cmsg = (struct cld_msg *)src; + struct cld_msg __user *cmsg = (struct cld_msg __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, nfsd_net_id); @@ -1029,7 +1029,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) return ret; } -struct notifier_block nfsd4_cld_block = { +static struct notifier_block nfsd4_cld_block = { .notifier_call = rpc_pipefs_event, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 03f82c0..8fdc9ec 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -42,6 +42,7 @@ #include <linux/sunrpc/clnt.h> #include "xdr4.h" #include "vfs.h" +#include "current_stateid.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -447,37 +448,69 @@ static struct list_head close_lru; * * which we should reject. */ -static void -set_access(unsigned int *access, unsigned long bmap) { +static unsigned int +bmap_to_share_mode(unsigned long bmap) { int i; + unsigned int access = 0; - *access = 0; for (i = 1; i < 4; i++) { if (test_bit(i, &bmap)) - *access |= i; - } -} - -static void -set_deny(unsigned int *deny, unsigned long bmap) { - int i; - - *deny = 0; - for (i = 0; i < 4; i++) { - if (test_bit(i, &bmap)) - *deny |= i ; + access |= i; } + return access; } -static int +static bool test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { unsigned int access, deny; - set_access(&access, stp->st_access_bmap); - set_deny(&deny, stp->st_deny_bmap); + access = bmap_to_share_mode(stp->st_access_bmap); + deny = bmap_to_share_mode(stp->st_deny_bmap); if ((access & open->op_share_deny) || (deny & open->op_share_access)) - return 0; - return 1; + return false; + return true; +} + +/* set share access for a given stateid */ +static inline void +set_access(u32 access, struct nfs4_ol_stateid *stp) +{ + __set_bit(access, &stp->st_access_bmap); +} + +/* clear share access for a given stateid */ +static inline void +clear_access(u32 access, struct nfs4_ol_stateid *stp) +{ + __clear_bit(access, &stp->st_access_bmap); +} + +/* test whether a given stateid has access */ +static inline bool +test_access(u32 access, struct nfs4_ol_stateid *stp) +{ + return test_bit(access, &stp->st_access_bmap); +} + +/* set share deny for a given stateid */ +static inline void +set_deny(u32 access, struct nfs4_ol_stateid *stp) +{ + __set_bit(access, &stp->st_deny_bmap); +} + +/* clear share deny for a given stateid */ +static inline void +clear_deny(u32 access, struct nfs4_ol_stateid *stp) +{ + __clear_bit(access, &stp->st_deny_bmap); +} + +/* test whether a given stateid is denying specific access */ +static inline bool +test_deny(u32 access, struct nfs4_ol_stateid *stp) +{ + return test_bit(access, &stp->st_deny_bmap); } static int nfs4_access_to_omode(u32 access) @@ -493,6 +526,20 @@ static int nfs4_access_to_omode(u32 access) BUG(); } +/* release all access and file references for a given stateid */ +static void +release_all_access(struct nfs4_ol_stateid *stp) +{ + int i; + + for (i = 1; i < 4; i++) { + if (test_access(i, stp)) + nfs4_file_put_access(stp->st_file, + nfs4_access_to_omode(i)); + clear_access(i, stp); + } +} + static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) { list_del(&stp->st_perfile); @@ -501,16 +548,7 @@ static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) static void close_generic_stateid(struct nfs4_ol_stateid *stp) { - int i; - - if (stp->st_access_bmap) { - for (i = 1; i < 4; i++) { - if (test_bit(i, &stp->st_access_bmap)) - nfs4_file_put_access(stp->st_file, - nfs4_access_to_omode(i)); - __clear_bit(i, &stp->st_access_bmap); - } - } + release_all_access(stp); put_nfs4_file(stp->st_file); stp->st_file = NULL; } @@ -885,7 +923,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n struct nfsd4_session *new; struct nfsd4_channel_attrs *fchan = &cses->fore_channel; int numslots, slotsize; - int status; + __be32 status; int idx; /* @@ -984,7 +1022,8 @@ static inline void renew_client_locked(struct nfs4_client *clp) { if (is_client_expired(clp)) { - dprintk("%s: client (clientid %08x/%08x) already expired\n", + WARN_ON(1); + printk("%s: client (clientid %08x/%08x) already expired\n", __func__, clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); @@ -1049,9 +1088,7 @@ free_client(struct nfs4_client *clp) list_del(&ses->se_perclnt); nfsd4_put_session_locked(ses); } - if (clp->cl_cred.cr_group_info) - put_group_info(clp->cl_cred.cr_group_info); - kfree(clp->cl_principal); + free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); kfree(clp); } @@ -1132,12 +1169,21 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -static void copy_cred(struct svc_cred *target, struct svc_cred *source) +static int copy_cred(struct svc_cred *target, struct svc_cred *source) { + if (source->cr_principal) { + target->cr_principal = + kstrdup(source->cr_principal, GFP_KERNEL); + if (target->cr_principal == NULL) + return -ENOMEM; + } else + target->cr_principal = NULL; + target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; target->cr_group_info = source->cr_group_info; get_group_info(target->cr_group_info); + return 0; } static int same_name(const char *n1, const char *n2) @@ -1157,11 +1203,31 @@ same_clid(clientid_t *cl1, clientid_t *cl2) return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id); } -/* XXX what about NGROUP */ +static bool groups_equal(struct group_info *g1, struct group_info *g2) +{ + int i; + + if (g1->ngroups != g2->ngroups) + return false; + for (i=0; i<g1->ngroups; i++) + if (GROUP_AT(g1, i) != GROUP_AT(g2, i)) + return false; + return true; +} + static int same_creds(struct svc_cred *cr1, struct svc_cred *cr2) { - return cr1->cr_uid == cr2->cr_uid; + if ((cr1->cr_flavor != cr2->cr_flavor) + || (cr1->cr_uid != cr2->cr_uid) + || (cr1->cr_gid != cr2->cr_gid) + || !groups_equal(cr1->cr_group_info, cr2->cr_group_info)) + return false; + if (cr1->cr_principal == cr2->cr_principal) + return true; + if (!cr1->cr_principal || !cr2->cr_principal) + return false; + return 0 == strcmp(cr1->cr_principal, cr1->cr_principal); } static void gen_clid(struct nfs4_client *clp) @@ -1204,25 +1270,20 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, { struct nfs4_client *clp; struct sockaddr *sa = svc_addr(rqstp); - char *princ; + int ret; clp = alloc_client(name); if (clp == NULL) return NULL; INIT_LIST_HEAD(&clp->cl_sessions); - - princ = svc_gss_principal(rqstp); - if (princ) { - clp->cl_principal = kstrdup(princ, GFP_KERNEL); - if (clp->cl_principal == NULL) { - spin_lock(&client_lock); - free_client(clp); - spin_unlock(&client_lock); - return NULL; - } + ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); + if (ret) { + spin_lock(&client_lock); + free_client(clp); + spin_unlock(&client_lock); + return NULL; } - idr_init(&clp->cl_stateids); memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_refcount, 0); @@ -1240,8 +1301,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); - clp->cl_flavor = rqstp->rq_flavor; - copy_cred(&clp->cl_cred, &rqstp->rq_cred); gen_confirm(clp); clp->cl_cb_session = NULL; return clp; @@ -1470,18 +1529,32 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) clid->flags = new->cl_exchange_flags; } +static bool client_has_state(struct nfs4_client *clp) +{ + /* + * Note clp->cl_openowners check isn't quite right: there's no + * need to count owners without stateid's. + * + * Also note we should probably be using this in 4.0 case too. + */ + return !list_empty(&clp->cl_openowners) + || !list_empty(&clp->cl_delegations) + || !list_empty(&clp->cl_sessions); +} + __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_exchange_id *exid) { struct nfs4_client *unconf, *conf, *new; - int status; + __be32 status; unsigned int strhashval; char dname[HEXDIR_LEN]; char addr_str[INET6_ADDRSTRLEN]; nfs4_verifier verf = exid->verifier; struct sockaddr *sa = svc_addr(rqstp); + bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " @@ -1507,71 +1580,63 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, status = nfs4_make_rec_clidname(dname, &exid->clname); if (status) - goto error; + return status; strhashval = clientstr_hashval(dname); + /* Cases below refer to rfc 5661 section 18.35.4: */ nfs4_lock_state(); - status = nfs_ok; - conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { - if (!clp_used_exchangeid(conf)) { - status = nfserr_clid_inuse; /* XXX: ? */ - goto out; - } - if (!same_verf(&verf, &conf->cl_verifier)) { - /* 18.35.4 case 8 */ - if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); + bool verfs_match = same_verf(&verf, &conf->cl_verifier); + + if (update) { + if (!clp_used_exchangeid(conf)) { /* buggy client */ + status = nfserr_inval; + goto out; + } + if (!creds_match) { /* case 9 */ + status = nfserr_perm; + goto out; + } + if (!verfs_match) { /* case 8 */ status = nfserr_not_same; goto out; } - /* Client reboot: destroy old state */ - expire_client(conf); - goto out_new; + /* case 6 */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + new = conf; + goto out_copy; } - if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - /* 18.35.4 case 9 */ - if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { - status = nfserr_perm; + if (!creds_match) { /* case 3 */ + if (client_has_state(conf)) { + status = nfserr_clid_inuse; goto out; } expire_client(conf); goto out_new; } - /* - * Set bit when the owner id and verifier map to an already - * confirmed client id (18.35.3). - */ - exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; - - /* - * Falling into 18.35.4 case 2, possible router replay. - * Leave confirmed record intact and return same result. - */ - copy_verf(conf, &verf); - new = conf; - goto out_copy; + if (verfs_match) { /* case 2 */ + conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + new = conf; + goto out_copy; + } + /* case 5, client reboot */ + goto out_new; } - /* 18.35.4 case 7 */ - if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + if (update) { /* case 7 */ status = nfserr_noent; goto out; } unconf = find_unconfirmed_client_by_str(dname, strhashval); - if (unconf) { - /* - * Possible retry or client restart. Per 18.35.4 case 4, - * a new unconfirmed record should be generated regardless - * of whether any properties have changed. - */ + if (unconf) /* case 4, possible retry or client restart */ expire_client(unconf); - } + /* case 1 (normal case) */ out_new: - /* Normal case */ new = create_client(exid->clname, dname, rqstp, &verf); if (new == NULL) { status = nfserr_jukebox; @@ -1584,7 +1649,7 @@ out_copy: exid->clientid.cl_boot = new->cl_clientid.cl_boot; exid->clientid.cl_id = new->cl_clientid.cl_id; - exid->seqid = 1; + exid->seqid = new->cl_cs_slot.sl_seqid + 1; nfsd4_set_ex_flags(new, exid); dprintk("nfsd4_exchange_id seqid %d flags %x\n", @@ -1593,12 +1658,10 @@ out_copy: out: nfs4_unlock_state(); -error: - dprintk("nfsd4_exchange_id returns %d\n", ntohl(status)); return status; } -static int +static __be32 check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) { dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid, @@ -1626,7 +1689,7 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse) */ static void nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses, - struct nfsd4_clid_slot *slot, int nfserr) + struct nfsd4_clid_slot *slot, __be32 nfserr) { slot->sl_status = nfserr; memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses)); @@ -1657,7 +1720,7 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses, /* seqid, slotID, slotID, slotID, status */ \ 5 ) * sizeof(__be32)) -static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) +static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel) { return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ; @@ -1673,7 +1736,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_session *new; struct nfsd4_clid_slot *cs_slot = NULL; bool confirm_me = false; - int status = 0; + __be32 status = 0; if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; @@ -1686,16 +1749,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status == nfserr_replay_cache) { - dprintk("Got a create_session replay! seqid= %d\n", - cs_slot->sl_seqid); - /* Return the cached reply status */ status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out; } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) { status = nfserr_seq_misordered; - dprintk("Sequence misordered!\n"); - dprintk("Expected seqid= %d but got seqid= %d\n", - cs_slot->sl_seqid, cr_ses->seqid); goto out; } } else if (unconf) { @@ -1704,7 +1761,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfserr_clid_inuse; goto out; } - cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { @@ -1712,7 +1768,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, status = nfserr_seq_misordered; goto out; } - confirm_me = true; conf = unconf; } else { @@ -1749,8 +1804,14 @@ nfsd4_create_session(struct svc_rqst *rqstp, /* cache solo and embedded create sessions under the state lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); - if (confirm_me) + if (confirm_me) { + unsigned int hash = clientstr_hashval(unconf->cl_recdir); + struct nfs4_client *old = + find_confirmed_client_by_str(conf->cl_recdir, hash); + if (old) + expire_client(old); move_to_confirmed(conf); + } out: nfs4_unlock_state(); dprintk("%s returns %d\n", __func__, ntohl(status)); @@ -1818,7 +1879,7 @@ nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_destroy_session *sessionid) { struct nfsd4_session *ses; - u32 status = nfserr_badsession; + __be32 status = nfserr_badsession; /* Notes: * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid @@ -1914,7 +1975,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_session *session; struct nfsd4_slot *slot; struct nfsd4_conn *conn; - int status; + __be32 status; if (resp->opcnt != 1) return nfserr_sequence_pos; @@ -2008,18 +2069,11 @@ out: return status; } -static inline bool has_resources(struct nfs4_client *clp) -{ - return !list_empty(&clp->cl_openowners) - || !list_empty(&clp->cl_delegations) - || !list_empty(&clp->cl_sessions); -} - __be32 nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) { struct nfs4_client *conf, *unconf, *clp; - int status = 0; + __be32 status = 0; nfs4_lock_state(); unconf = find_unconfirmed_client(&dc->clientid); @@ -2028,7 +2082,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta if (conf) { clp = conf; - if (!is_client_expired(conf) && has_resources(conf)) { + if (!is_client_expired(conf) && client_has_state(conf)) { status = nfserr_clientid_busy; goto out; } @@ -2055,7 +2109,7 @@ out: __be32 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) { - int status = 0; + __be32 status = 0; if (rc->rca_one_fs) { if (!cstate->current_fh.fh_dentry) @@ -2106,17 +2160,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - /* - * XXX The Duplicate Request Cache (DRC) has been checked (??) - * We get here on a DRC miss. - */ - strhashval = clientstr_hashval(dname); + /* Cases below refer to rfc 3530 section 14.2.33: */ nfs4_lock_state(); conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { - /* RFC 3530 14.2.33 CASE 0: */ + /* case 0: */ status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) goto out; @@ -2129,63 +2179,18 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } } - /* - * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION") - * has a description of SETCLIENTID request processing consisting - * of 5 bullet points, labeled as CASE0 - CASE4 below. - */ unconf = find_unconfirmed_client_by_str(dname, strhashval); + if (unconf) + expire_client(unconf); status = nfserr_jukebox; - if (!conf) { - /* - * RFC 3530 14.2.33 CASE 4: - * placed first, because it is the normal case - */ - if (unconf) - expire_client(unconf); - new = create_client(clname, dname, rqstp, &clverifier); - if (new == NULL) - goto out; - gen_clid(new); - } else if (same_verf(&conf->cl_verifier, &clverifier)) { - /* - * RFC 3530 14.2.33 CASE 1: - * probable callback update - */ - if (unconf) { - /* Note this is removing unconfirmed {*x***}, - * which is stronger than RFC recommended {vxc**}. - * This has the advantage that there is at most - * one {*x***} in either list at any time. - */ - expire_client(unconf); - } - new = create_client(clname, dname, rqstp, &clverifier); - if (new == NULL) - goto out; + new = create_client(clname, dname, rqstp, &clverifier); + if (new == NULL) + goto out; + if (conf && same_verf(&conf->cl_verifier, &clverifier)) + /* case 1: probable callback update */ copy_clid(new, conf); - } else if (!unconf) { - /* - * RFC 3530 14.2.33 CASE 2: - * probable client reboot; state will be removed if - * confirmed. - */ - new = create_client(clname, dname, rqstp, &clverifier); - if (new == NULL) - goto out; - gen_clid(new); - } else { - /* - * RFC 3530 14.2.33 CASE 3: - * probable client reboot; state will be removed if - * confirmed. - */ - expire_client(unconf); - new = create_client(clname, dname, rqstp, &clverifier); - if (new == NULL) - goto out; + else /* case 4 (new client) or cases 2, 3 (client reboot): */ gen_clid(new); - } /* * XXX: we should probably set this at creation time, and check * for consistent minorversion use throughout: @@ -2203,17 +2208,11 @@ out: } -/* - * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has - * a description of SETCLIENTID_CONFIRM request processing consisting of 4 - * bullets, labeled as CASE1 - CASE4 below. - */ __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid_confirm *setclientid_confirm) { - struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; @@ -2221,84 +2220,44 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (STALE_CLIENTID(clid)) return nfserr_stale_clientid; - /* - * XXX The Duplicate Request Cache (DRC) has been checked (??) - * We get here on a DRC miss. - */ - nfs4_lock_state(); conf = find_confirmed_client(clid); unconf = find_unconfirmed_client(clid); - - status = nfserr_clid_inuse; - if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa)) - goto out; - if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa)) - goto out; - /* - * section 14.2.34 of RFC 3530 has a description of - * SETCLIENTID_CONFIRM request processing consisting - * of 4 bullet points, labeled as CASE1 - CASE4 below. + * We try hard to give out unique clientid's, so if we get an + * attempt to confirm the same clientid with a different cred, + * there's a bug somewhere. Let's charitably assume it's our + * bug. */ - if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { - /* - * RFC 3530 14.2.34 CASE 1: - * callback update - */ - if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) - status = nfserr_clid_inuse; - else { - nfsd4_change_callback(conf, &unconf->cl_cb_conn); - nfsd4_probe_callback(conf); - expire_client(unconf); + status = nfserr_serverfault; + if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) + goto out; + if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) + goto out; + /* cases below refer to rfc 3530 section 14.2.34: */ + if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { + if (conf && !unconf) /* case 2: probable retransmit */ status = nfs_ok; + else /* case 4: client hasn't noticed we rebooted yet? */ + status = nfserr_stale_clientid; + goto out; + } + status = nfs_ok; + if (conf) { /* case 1: callback update */ + nfsd4_change_callback(conf, &unconf->cl_cb_conn); + nfsd4_probe_callback(conf); + expire_client(unconf); + } else { /* case 3: normal case; new or rebooted client */ + unsigned int hash = clientstr_hashval(unconf->cl_recdir); + conf = find_confirmed_client_by_str(unconf->cl_recdir, hash); + if (conf) { + nfsd4_client_record_remove(conf); + expire_client(conf); } - } else if (conf && !unconf) { - /* - * RFC 3530 14.2.34 CASE 2: - * probable retransmitted request; play it safe and - * do nothing. - */ - if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) - status = nfserr_clid_inuse; - else - status = nfs_ok; - } else if (!conf && unconf - && same_verf(&unconf->cl_confirm, &confirm)) { - /* - * RFC 3530 14.2.34 CASE 3: - * Normal case; new or rebooted client: - */ - if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { - status = nfserr_clid_inuse; - } else { - unsigned int hash = - clientstr_hashval(unconf->cl_recdir); - conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash); - if (conf) { - nfsd4_client_record_remove(conf); - expire_client(conf); - } - move_to_confirmed(unconf); - conf = unconf; - nfsd4_probe_callback(conf); - status = nfs_ok; - } - } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) - && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, - &confirm)))) { - /* - * RFC 3530 14.2.34 CASE 4: - * Client probably hasn't noticed that we rebooted yet. - */ - status = nfserr_stale_clientid; - } else { - /* check that we have hit one of the cases...*/ - status = nfserr_clid_inuse; + move_to_confirmed(unconf); + nfsd4_probe_callback(unconf); } out: nfs4_unlock_state(); @@ -2454,8 +2413,8 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, stp->st_file = fp; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access, &stp->st_access_bmap); - __set_bit(open->op_share_deny, &stp->st_deny_bmap); + set_access(open->op_share_access, stp); + set_deny(open->op_share_deny, stp); stp->st_openstp = NULL; } @@ -2534,8 +2493,8 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) ret = nfserr_locked; /* Search for conflicting share reservations */ list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { - if (test_bit(deny_type, &stp->st_deny_bmap) || - test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap)) + if (test_deny(deny_type, stp) || + test_deny(NFS4_SHARE_DENY_BOTH, stp)) goto out; } ret = nfs_ok; @@ -2791,7 +2750,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c bool new_access; __be32 status; - new_access = !test_bit(op_share_access, &stp->st_access_bmap); + new_access = !test_access(op_share_access, stp); if (new_access) { status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); if (status) @@ -2806,8 +2765,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c return status; } /* remember the open */ - __set_bit(op_share_access, &stp->st_access_bmap); - __set_bit(open->op_share_deny, &stp->st_deny_bmap); + set_access(op_share_access, stp); + set_deny(open->op_share_deny, stp); return nfs_ok; } @@ -3282,18 +3241,18 @@ STALE_STATEID(stateid_t *stateid) } static inline int -access_permit_read(unsigned long access_bmap) +access_permit_read(struct nfs4_ol_stateid *stp) { - return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) || - test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) || - test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap); + return test_access(NFS4_SHARE_ACCESS_READ, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp) || + test_access(NFS4_SHARE_ACCESS_WRITE, stp); } static inline int -access_permit_write(unsigned long access_bmap) +access_permit_write(struct nfs4_ol_stateid *stp) { - return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) || - test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap); + return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp); } static @@ -3304,9 +3263,9 @@ __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) /* For lock stateid's, we test the parent open, not the lock: */ if (stp->st_openstp) stp = stp->st_openstp; - if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) + if ((flags & WR_STATE) && !access_permit_write(stp)) goto out; - if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) + if ((flags & RD_STATE) && !access_permit_read(stp)) goto out; status = nfs_ok; out: @@ -3346,7 +3305,7 @@ static bool stateid_generation_after(stateid_t *a, stateid_t *b) return (s32)a->si_generation - (s32)b->si_generation > 0; } -static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) +static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* * When sessions are used the stateid generation number is ignored @@ -3655,10 +3614,10 @@ out: static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access) { - if (!test_bit(access, &stp->st_access_bmap)) + if (!test_access(access, stp)) return; nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); - __clear_bit(access, &stp->st_access_bmap); + clear_access(access, stp); } static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access) @@ -3680,12 +3639,12 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac } static void -reset_union_bmap_deny(unsigned long deny, unsigned long *bmap) +reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp) { int i; for (i = 0; i < 4; i++) { if ((i & deny) != i) - __clear_bit(i, bmap); + clear_deny(i, stp); } } @@ -3712,19 +3671,19 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, if (status) goto out; status = nfserr_inval; - if (!test_bit(od->od_share_access, &stp->st_access_bmap)) { - dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n", + if (!test_access(od->od_share_access, stp)) { + dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n", stp->st_access_bmap, od->od_share_access); goto out; } - if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) { + if (!test_deny(od->od_share_deny, stp)) { dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", stp->st_deny_bmap, od->od_share_deny); goto out; } nfs4_stateid_downgrade(stp, od->od_share_access); - reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); + reset_union_bmap_deny(od->od_share_deny, stp); update_stateid(&stp->st_stid.sc_stateid); memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -4014,13 +3973,13 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) struct nfs4_file *fp = lock_stp->st_file; int oflag = nfs4_access_to_omode(access); - if (test_bit(access, &lock_stp->st_access_bmap)) + if (test_access(access, lock_stp)) return; nfs4_file_get_access(fp, oflag); - __set_bit(access, &lock_stp->st_access_bmap); + set_access(access, lock_stp); } -__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) +static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) { struct nfs4_file *fi = ost->st_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 74c00bc..4949667 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1674,12 +1674,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) static void write32(__be32 **p, u32 n) { - *(*p)++ = n; + *(*p)++ = htonl(n); } static void write64(__be32 **p, u64 n) { - write32(p, (u32)(n >> 32)); + write32(p, (n >> 32)); write32(p, (u32)n); } @@ -1744,15 +1744,16 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _ } /* Encode as an array of strings the string given with components - * separated @sep. + * separated @sep, escaped with esc_enter and esc_exit. */ -static __be32 nfsd4_encode_components(char sep, char *components, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_components_esc(char sep, char *components, + __be32 **pp, int *buflen, + char esc_enter, char esc_exit) { __be32 *p = *pp; __be32 *countp = p; int strlen, count=0; - char *str, *end; + char *str, *end, *next; dprintk("nfsd4_encode_components(%s)\n", components); if ((*buflen -= 4) < 0) @@ -1760,8 +1761,23 @@ static __be32 nfsd4_encode_components(char sep, char *components, WRITE32(0); /* We will fill this in with @count later */ end = str = components; while (*end) { - for (; *end && (*end != sep); end++) - ; /* Point to end of component */ + bool found_esc = false; + + /* try to parse as esc_start, ..., esc_end, sep */ + if (*str == esc_enter) { + for (; *end && (*end != esc_exit); end++) + /* find esc_exit or end of string */; + next = end + 1; + if (*end && (!*next || *next == sep)) { + str++; + found_esc = true; + } + } + + if (!found_esc) + for (; *end && (*end != sep); end++) + /* find sep or end of string */; + strlen = end - str; if (strlen) { if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) @@ -1780,6 +1796,15 @@ static __be32 nfsd4_encode_components(char sep, char *components, return 0; } +/* Encode as an array of strings the string given with components + * separated @sep. + */ +static __be32 nfsd4_encode_components(char sep, char *components, + __be32 **pp, int *buflen) +{ + return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0); +} + /* * encode a location element of a fs_locations structure */ @@ -1789,7 +1814,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, __be32 status; __be32 *p = *pp; - status = nfsd4_encode_components(':', location->hosts, &p, buflen); + status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen, + '[', ']'); if (status) return status; status = nfsd4_encode_components('/', location->path, &p, buflen); @@ -3251,7 +3277,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w } static __be32 -nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) { __be32 *p; @@ -3306,7 +3332,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, } static __be32 -nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create_session *sess) { __be32 *p; @@ -3355,14 +3381,14 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, } static __be32 -nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_destroy_session *destroy_session) { return nfserr; } static __be32 -nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr, +nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_free_stateid *free_stateid) { __be32 *p; @@ -3371,13 +3397,13 @@ nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr, return nfserr; RESERVE_SPACE(4); - WRITE32(nfserr); + *p++ = nfserr; ADJUST_ARGS(); return nfserr; } static __be32 -nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { __be32 *p; @@ -3399,8 +3425,8 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, return 0; } -__be32 -nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, +static __be32 +nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid *test_stateid) { struct nfsd4_test_stateid_id *stateid, *next; @@ -3503,7 +3529,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so * will be at least a page and will therefore hold the xdr_buf head. */ -int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) { struct xdr_buf *xb = &resp->rqstp->rq_res; struct nfsd4_session *session = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7269988..c55298e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -661,6 +661,7 @@ static ssize_t __write_ports_addfd(char *buf) { char *mesg = buf; int fd, err; + struct net *net = &init_net; err = get_int(&mesg, &fd); if (err != 0 || fd < 0) @@ -672,6 +673,8 @@ static ssize_t __write_ports_addfd(char *buf) err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); if (err < 0) { + if (nfsd_serv->sv_nrthreads == 1) + svc_shutdown_net(nfsd_serv, net); svc_destroy(nfsd_serv); return err; } @@ -709,6 +712,7 @@ static ssize_t __write_ports_addxprt(char *buf) char transport[16]; struct svc_xprt *xprt; int port, err; + struct net *net = &init_net; if (sscanf(buf, "%15s %4u", transport, &port) != 2) return -EINVAL; @@ -720,12 +724,12 @@ static ssize_t __write_ports_addxprt(char *buf) if (err != 0) return err; - err = svc_create_xprt(nfsd_serv, transport, &init_net, + err = svc_create_xprt(nfsd_serv, transport, net, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err < 0) goto out_err; - err = svc_create_xprt(nfsd_serv, transport, &init_net, + err = svc_create_xprt(nfsd_serv, transport, net, PF_INET6, port, SVC_SOCK_ANONYMOUS); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; @@ -734,12 +738,14 @@ static ssize_t __write_ports_addxprt(char *buf) nfsd_serv->sv_nrthreads--; return 0; out_close: - xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port); + xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port); if (xprt != NULL) { svc_close_xprt(xprt); svc_xprt_put(xprt); } out_err: + if (nfsd_serv->sv_nrthreads == 1) + svc_shutdown_net(nfsd_serv, net); svc_destroy(nfsd_serv); return err; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cb4d51d..ee709fc 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> +#include <linux/nsproxy.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> @@ -330,6 +331,8 @@ static int nfsd_get_default_max_blksize(void) int nfsd_create_serv(void) { + int error; + WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nfsd_serv) { svc_get(nfsd_serv); @@ -343,6 +346,12 @@ int nfsd_create_serv(void) if (nfsd_serv == NULL) return -ENOMEM; + error = svc_bind(nfsd_serv, current->nsproxy->net_ns); + if (error < 0) { + svc_destroy(nfsd_serv); + return error; + } + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return 0; @@ -373,6 +382,7 @@ int nfsd_set_nrthreads(int n, int *nthreads) int i = 0; int tot = 0; int err = 0; + struct net *net = &init_net; WARN_ON(!mutex_is_locked(&nfsd_mutex)); @@ -417,6 +427,9 @@ int nfsd_set_nrthreads(int n, int *nthreads) if (err) break; } + + if (nfsd_serv->sv_nrthreads == 1) + svc_shutdown_net(nfsd_serv, net); svc_destroy(nfsd_serv); return err; @@ -432,6 +445,7 @@ nfsd_svc(unsigned short port, int nrservs) { int error; bool nfsd_up_before; + struct net *net = &init_net; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -464,6 +478,8 @@ out_shutdown: if (error < 0 && !nfsd_up_before) nfsd_shutdown(); out_destroy: + if (nfsd_serv->sv_nrthreads == 1) + svc_shutdown_net(nfsd_serv, net); svc_destroy(nfsd_serv); /* Release server */ out: mutex_unlock(&nfsd_mutex); @@ -547,6 +563,9 @@ nfsd(void *vrqstp) nfsdstats.th_cnt --; out: + if (rqstp->rq_server->sv_nrthreads == 1) + svc_shutdown_net(rqstp->rq_server, &init_net); + /* Release the thread */ svc_exit_thread(rqstp); @@ -659,8 +678,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { int ret = seq_release(inode, file); + struct net *net = &init_net; + mutex_lock(&nfsd_mutex); /* this function really, really should have been called svc_put() */ + if (nfsd_serv->sv_nrthreads == 1) + svc_shutdown_net(nfsd_serv, net); svc_destroy(nfsd_serv); mutex_unlock(&nfsd_mutex); return ret; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 89ab137..849091e 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -232,7 +232,6 @@ struct nfs4_client { time_t cl_time; /* time of last lease renewal */ struct sockaddr_storage cl_addr; /* client ipaddress */ u32 cl_flavor; /* setclientid pseudoflavor */ - char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 1b35015..acd127d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -60,7 +60,7 @@ struct nfsd4_compound_state { __be32 *datap; size_t iovlen; u32 minorversion; - u32 status; + __be32 status; stateid_t current_stateid; stateid_t save_stateid; /* to indicate current and saved state id presents */ @@ -364,7 +364,7 @@ struct nfsd4_test_stateid_id { }; struct nfsd4_test_stateid { - __be32 ts_num_ids; + u32 ts_num_ids; struct list_head ts_stateid_list; }; @@ -549,7 +549,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); -int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 0bb2c20..b728479 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -508,31 +508,29 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen); } -static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp, - int connectable) +static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - struct inode *inode = dentry->d_inode; struct nilfs_root *root = NILFS_I(inode)->i_root; int type; - if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE || - (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE)) + if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) { + *lenp = NILFS_FID_SIZE_CONNECTABLE; + return 255; + } + if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) { + *lenp = NILFS_FID_SIZE_NON_CONNECTABLE; return 255; + } fid->cno = root->cno; fid->ino = inode->i_ino; fid->gen = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; - - spin_lock(&dentry->d_lock); - parent = dentry->d_parent->d_inode; + if (parent) { fid->parent_ino = parent->i_ino; fid->parent_gen = parent->i_generation; - spin_unlock(&dentry->d_lock); - type = FILEID_NILFS_WITH_PARENT; *lenp = NILFS_FID_SIZE_CONNECTABLE; } else { diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ccb14d3..b39c5c1 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -123,7 +123,7 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) } EXPORT_SYMBOL_GPL(__fsnotify_parent); -static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, +static int send_to_group(struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, @@ -168,10 +168,10 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, vfsmount_test_mask &= ~inode_mark->ignored_mask; } - pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" + pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" " data=%p data_is=%d cookie=%d event=%p\n", - __func__, group, to_tell, mnt, mask, inode_mark, + __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, data_is, cookie, *event); @@ -258,16 +258,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, + ret = send_to_group(to_tell, inode_mark, NULL, mask, data, data_is, cookie, file_name, &event); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data, + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); inode_group = NULL; } else { - ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark, + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 8639169..7389d2d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2096,7 +2096,9 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, err = file_remove_suid(file); if (err) goto out; - file_update_time(file); + err = file_update_time(file); + if (err) + goto out; written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count); out: diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index c7ee03c..0725e60 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -422,45 +422,46 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, struct ocfs2_blockcheck_stats *stats) { int rc = 0; - struct ocfs2_block_check check; + u32 bc_crc32e; + u16 bc_ecc; u32 crc, ecc; ocfs2_blockcheck_inc_check(stats); - check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); - check.bc_ecc = le16_to_cpu(bc->bc_ecc); + bc_crc32e = le32_to_cpu(bc->bc_crc32e); + bc_ecc = le16_to_cpu(bc->bc_ecc); memset(bc, 0, sizeof(struct ocfs2_block_check)); /* Fast path - if the crc32 validates, we're good to go */ crc = crc32_le(~0, data, blocksize); - if (crc == check.bc_crc32e) + if (crc == bc_crc32e) goto out; ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", - (unsigned int)check.bc_crc32e, (unsigned int)crc); + (unsigned int)bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ ecc = ocfs2_hamming_encode_block(data, blocksize); - ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); + ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc); /* And check the crc32 again */ crc = crc32_le(~0, data, blocksize); - if (crc == check.bc_crc32e) { + if (crc == bc_crc32e) { ocfs2_blockcheck_inc_recover(stats); goto out; } mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", - (unsigned int)check.bc_crc32e, (unsigned int)crc); + (unsigned int)bc_crc32e, (unsigned int)crc); rc = -EIO; out: - bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); - bc->bc_ecc = cpu_to_le16(check.bc_ecc); + bc->bc_crc32e = cpu_to_le32(bc_crc32e); + bc->bc_ecc = cpu_to_le16(bc_ecc); return rc; } @@ -528,7 +529,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, struct ocfs2_blockcheck_stats *stats) { int i, rc = 0; - struct ocfs2_block_check check; + u32 bc_crc32e; + u16 bc_ecc; u32 crc, ecc, fix; BUG_ON(nr < 0); @@ -538,21 +540,21 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, ocfs2_blockcheck_inc_check(stats); - check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); - check.bc_ecc = le16_to_cpu(bc->bc_ecc); + bc_crc32e = le32_to_cpu(bc->bc_crc32e); + bc_ecc = le16_to_cpu(bc->bc_ecc); memset(bc, 0, sizeof(struct ocfs2_block_check)); /* Fast path - if the crc32 validates, we're good to go */ for (i = 0, crc = ~0; i < nr; i++) crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); - if (crc == check.bc_crc32e) + if (crc == bc_crc32e) goto out; ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", - (unsigned int)check.bc_crc32e, (unsigned int)crc); + (unsigned int)bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ for (i = 0, ecc = 0; i < nr; i++) { @@ -565,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, bhs[i]->b_size * 8, bhs[i]->b_size * 8 * i); } - fix = ecc ^ check.bc_ecc; + fix = ecc ^ bc_ecc; for (i = 0; i < nr; i++) { /* * Try the fix against each buffer. It will only affect @@ -578,19 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, /* And check the crc32 again */ for (i = 0, crc = ~0; i < nr; i++) crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); - if (crc == check.bc_crc32e) { + if (crc == bc_crc32e) { ocfs2_blockcheck_inc_recover(stats); goto out; } mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", - (unsigned int)check.bc_crc32e, (unsigned int)crc); + (unsigned int)bc_crc32e, (unsigned int)crc); rc = -EIO; out: - bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); - bc->bc_ecc = cpu_to_le16(check.bc_ecc); + bc->bc_crc32e = cpu_to_le32(bc_crc32e); + bc->bc_ecc = cpu_to_le16(bc_ecc); return rc; } diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 3a3ed4b..fbec0be 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -293,7 +293,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; struct list_head *iter, *head=NULL; - u64 cookie; + __be64 cookie; u32 flags; u8 node; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index a5952ce..de854cc 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -679,7 +679,7 @@ struct dlm_query_join_packet { }; union dlm_query_join_response { - u32 intval; + __be32 intval; struct dlm_query_join_packet packet; }; @@ -755,8 +755,8 @@ struct dlm_query_region { struct dlm_node_info { u8 ni_nodenum; u8 pad1; - u16 ni_ipv4_port; - u32 ni_ipv4_address; + __be16 ni_ipv4_port; + __be32 ni_ipv4_address; }; struct dlm_query_nodeinfo { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 92f2ead..9e89d70 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -818,7 +818,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet, union dlm_query_join_response response; response.packet = *packet; - *wire = cpu_to_be32(response.intval); + *wire = be32_to_cpu(response.intval); } static void dlm_query_join_wire_to_packet(u32 wire, diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 745db42..322216a 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -177,21 +177,23 @@ bail: return parent; } -static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, - int connectable) +static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len, + struct inode *parent) { - struct inode *inode = dentry->d_inode; int len = *max_len; int type = 1; u64 blkno; u32 generation; __le32 *fh = (__force __le32 *) fh_in; +#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION +#error "You go ahead and fix that mess, then. Somehow" trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len, dentry->d_name.name, fh, len, connectable); +#endif - if (connectable && (len < 6)) { + if (parent && (len < 6)) { *max_len = 6; type = 255; goto bail; @@ -211,12 +213,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); fh[2] = cpu_to_le32(generation); - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; - - spin_lock(&dentry->d_lock); - - parent = dentry->d_parent->d_inode; + if (parent) { blkno = OCFS2_I(parent)->ip_blkno; generation = parent->i_generation; @@ -224,8 +221,6 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); fh[5] = cpu_to_le32(generation); - spin_unlock(&dentry->d_lock); - len = 6; type = 2; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 735514c..d89e08a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -273,11 +273,13 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_gid = le32_to_cpu(fe->i_gid); /* Fast symlinks will have i_size but no allocated clusters. */ - if (S_ISLNK(inode->i_mode) && !fe->i_clusters) + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { inode->i_blocks = 0; - else + inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; + } else { inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_mapping->a_ops = &ocfs2_aops; + inode->i_mapping->a_ops = &ocfs2_aops; + } inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); @@ -331,10 +333,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_I(inode)->ip_dir_lock_gen = 1; break; case S_IFLNK: - if (ocfs2_inode_is_fast_symlink(inode)) - inode->i_op = &ocfs2_fast_symlink_inode_operations; - else - inode->i_op = &ocfs2_symlink_inode_operations; + inode->i_op = &ocfs2_symlink_inode_operations; i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index a1a1bfd..d96f7f8 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -864,7 +864,7 @@ int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, if (status) break; - reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; + reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr; if (!reqp) { status = -EINVAL; goto bail; @@ -888,9 +888,11 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct ocfs2_space_resv sr; struct ocfs2_new_group_input input; struct reflink_arguments args; - const char *old_path, *new_path; + const char __user *old_path; + const char __user *new_path; bool preserve; struct ocfs2_info info; + void __user *argp = (void __user *)arg; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -937,17 +939,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return ocfs2_group_add(inode, &input); case OCFS2_IOC_REFLINK: - if (copy_from_user(&args, (struct reflink_arguments *)arg, - sizeof(args))) + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; - old_path = (const char *)(unsigned long)args.old_path; - new_path = (const char *)(unsigned long)args.new_path; + old_path = (const char __user *)(unsigned long)args.old_path; + new_path = (const char __user *)(unsigned long)args.new_path; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); case OCFS2_IOC_INFO: - if (copy_from_user(&info, (struct ocfs2_info __user *)arg, - sizeof(struct ocfs2_info))) + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); @@ -960,22 +960,20 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&range, (struct fstrim_range *)arg, - sizeof(range))) + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; - if (copy_to_user((struct fstrim_range *)arg, &range, - sizeof(range))) + if (copy_to_user(argp, &range, sizeof(range))) return -EFAULT; return 0; } case OCFS2_IOC_MOVE_EXT: - return ocfs2_ioctl_move_extents(filp, (void __user *)arg); + return ocfs2_ioctl_move_extents(filp, argp); default: return -ENOTTY; } @@ -988,6 +986,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct reflink_arguments args; struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_info info; + void __user *argp = (void __user *)arg; switch (cmd) { case OCFS2_IOC32_GETFLAGS: @@ -1006,16 +1005,14 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FITRIM: break; case OCFS2_IOC_REFLINK: - if (copy_from_user(&args, (struct reflink_arguments *)arg, - sizeof(args))) + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); case OCFS2_IOC_INFO: - if (copy_from_user(&info, (struct ocfs2_info __user *)arg, - sizeof(struct ocfs2_info))) + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index b1e3fce..6083432 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1082,8 +1082,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) context->file = filp; if (argp) { - if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, - sizeof(range))) { + if (copy_from_user(&range, argp, sizeof(range))) { status = -EFAULT; goto out; } @@ -1138,8 +1137,7 @@ out: * length and new_offset even if failure happens somewhere. */ if (argp) { - if (copy_to_user((struct ocfs2_move_extents *)argp, &range, - sizeof(range))) + if (copy_to_user(argp, &range, sizeof(range))) status = -EFAULT; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a9856e3..9f39c64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1724,15 +1724,16 @@ static int ocfs2_symlink(struct inode *dir, fe = (struct ocfs2_dinode *) new_fe_bh->b_data; inode->i_rdev = 0; newsize = l - 1; + inode->i_op = &ocfs2_symlink_inode_operations; if (l > ocfs2_fast_symlink_chars(sb)) { u32 offset = 0; - inode->i_op = &ocfs2_symlink_inode_operations; status = dquot_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status) goto bail; did_quota = 1; + inode->i_mapping->a_ops = &ocfs2_aops; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, handle, data_ac, NULL, @@ -1750,7 +1751,7 @@ static int ocfs2_symlink(struct inode *dir, i_size_write(inode, newsize); inode->i_blocks = ocfs2_inode_sector_count(inode); } else { - inode->i_op = &ocfs2_fast_symlink_inode_operations; + inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; memcpy((char *) fe->id2.i_symlink, symname, l); i_size_write(inode, newsize); inode->i_blocks = 0; diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 5d22872..f1fbb4b 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -54,101 +54,40 @@ #include "buffer_head_io.h" -static char *ocfs2_fast_symlink_getlink(struct inode *inode, - struct buffer_head **bh) +static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page) { - int status; - char *link = NULL; + struct inode *inode = page->mapping->host; + struct buffer_head *bh; + int status = ocfs2_read_inode_block(inode, &bh); struct ocfs2_dinode *fe; + const char *link; + void *kaddr; + size_t len; - status = ocfs2_read_inode_block(inode, bh); if (status < 0) { mlog_errno(status); - link = ERR_PTR(status); - goto bail; + return status; } - fe = (struct ocfs2_dinode *) (*bh)->b_data; + fe = (struct ocfs2_dinode *) bh->b_data; link = (char *) fe->id2.i_symlink; -bail: - - return link; -} - -static int ocfs2_readlink(struct dentry *dentry, - char __user *buffer, - int buflen) -{ - int ret; - char *link; - struct buffer_head *bh = NULL; - struct inode *inode = dentry->d_inode; - - link = ocfs2_fast_symlink_getlink(inode, &bh); - if (IS_ERR(link)) { - ret = PTR_ERR(link); - goto out; - } - - /* - * Without vfsmount we can't update atime now, - * but we will update atime here ultimately. - */ - ret = vfs_readlink(dentry, buffer, buflen, link); - + /* will be less than a page size */ + len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb)); + kaddr = kmap_atomic(page); + memcpy(kaddr, link, len + 1); + kunmap_atomic(kaddr); + SetPageUptodate(page); + unlock_page(page); brelse(bh); -out: - if (ret < 0) - mlog_errno(ret); - return ret; + return 0; } -static void *ocfs2_fast_follow_link(struct dentry *dentry, - struct nameidata *nd) -{ - int status = 0; - int len; - char *target, *link = ERR_PTR(-ENOMEM); - struct inode *inode = dentry->d_inode; - struct buffer_head *bh = NULL; - - BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); - target = ocfs2_fast_symlink_getlink(inode, &bh); - if (IS_ERR(target)) { - status = PTR_ERR(target); - mlog_errno(status); - goto bail; - } - - /* Fast symlinks can't be large */ - len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); - link = kzalloc(len + 1, GFP_NOFS); - if (!link) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - memcpy(link, target, len); - -bail: - nd_set_link(nd, status ? ERR_PTR(status) : link); - brelse(bh); - - if (status) - mlog_errno(status); - return NULL; -} - -static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) -{ - char *link = nd_get_link(nd); - if (!IS_ERR(link)) - kfree(link); -} +const struct address_space_operations ocfs2_fast_symlink_aops = { + .readpage = ocfs2_fast_symlink_readpage, +}; const struct inode_operations ocfs2_symlink_inode_operations = { - .readlink = page_readlink, + .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, .getattr = ocfs2_getattr, @@ -159,15 +98,3 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, }; -const struct inode_operations ocfs2_fast_symlink_inode_operations = { - .readlink = ocfs2_readlink, - .follow_link = ocfs2_fast_follow_link, - .put_link = ocfs2_fast_put_link, - .getattr = ocfs2_getattr, - .setattr = ocfs2_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, - .fiemap = ocfs2_fiemap, -}; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h index 65a6c9c..71ee424 100644 --- a/fs/ocfs2/symlink.h +++ b/fs/ocfs2/symlink.h @@ -27,7 +27,7 @@ #define OCFS2_SYMLINK_H extern const struct inode_operations ocfs2_symlink_inode_operations; -extern const struct inode_operations ocfs2_fast_symlink_inode_operations; +extern const struct address_space_operations ocfs2_fast_symlink_aops; /* * Test whether an inode is a fast symlink. @@ -654,10 +654,23 @@ static inline int __get_file_write_access(struct inode *inode, return error; } -static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - struct file *f, - int (*open)(struct inode *, struct file *), - const struct cred *cred) +int open_check_o_direct(struct file *f) +{ + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || + ((!f->f_mapping->a_ops->direct_IO) && + (!f->f_mapping->a_ops->get_xip_mem))) { + return -EINVAL; + } + } + return 0; +} + +static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt, + struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) { static const struct file_operations empty_fops = {}; struct inode *inode; @@ -713,16 +726,6 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { - fput(f); - f = ERR_PTR(-EINVAL); - } - } - return f; cleanup_all: @@ -744,12 +747,29 @@ cleanup_all: f->f_path.dentry = NULL; f->f_path.mnt = NULL; cleanup_file: - put_filp(f); dput(dentry); mntput(mnt); return ERR_PTR(error); } +static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, + struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) +{ + struct file *res = do_dentry_open(dentry, mnt, f, open, cred); + if (!IS_ERR(res)) { + int error = open_check_o_direct(f); + if (error) { + fput(res); + res = ERR_PTR(error); + } + } else { + put_filp(f); + } + return res; +} + /** * lookup_instantiate_filp - instantiates the open intent filp * @nd: pointer to nameidata @@ -804,13 +824,31 @@ struct file *nameidata_to_filp(struct nameidata *nd) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; - nd->intent.open.file = NULL; /* Has the filesystem initialised the file for us? */ - if (filp->f_path.dentry == NULL) { + if (filp->f_path.dentry != NULL) { + nd->intent.open.file = NULL; + } else { + struct file *res; + path_get(&nd->path); - filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, - NULL, cred); + res = do_dentry_open(nd->path.dentry, nd->path.mnt, + filp, NULL, cred); + if (!IS_ERR(res)) { + int error; + + nd->intent.open.file = NULL; + BUG_ON(res != filp); + + error = open_check_o_direct(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } else { + /* Allow nd->intent.open.file to be recycled */ + filp = res; + } } return filp; } @@ -654,8 +654,11 @@ out: wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - if (ret > 0) - file_update_time(filp); + if (ret > 0) { + int err = file_update_time(filp); + if (err) + ret = err; + } return ret; } @@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, prev_src_mnt = child; } out: - br_write_lock(vfsmount_lock); + br_write_lock(&vfsmount_lock); while (!list_empty(&tmp_list)) { child = list_first_entry(&tmp_list, struct mount, mnt_hash); umount_tree(child, 0, &umount_list); } - br_write_unlock(vfsmount_lock); + br_write_unlock(&vfsmount_lock); release_mounts(&umount_list); return ret; } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 1241285..5e289a7 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); - br_read_lock(vfsmount_lock); + br_read_lock(&vfsmount_lock); if (p->m.poll_event != ns->event) { p->m.poll_event = ns->event; res |= POLLERR | POLLPRI; } - br_read_unlock(vfsmount_lock); + br_read_unlock(&vfsmount_lock); return res; } diff --git a/fs/readdir.c b/fs/readdir.c index cc0a822..39e3370 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -108,11 +108,11 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, int error; struct file * file; struct readdir_callback buf; + int fput_needed; - error = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) - goto out; + return -EBADF; buf.result = 0; buf.dirent = dirent; @@ -121,8 +121,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, if (buf.result) error = buf.result; - fput(file); -out: + fput_light(file, fput_needed); return error; } @@ -195,16 +194,15 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, struct file * file; struct linux_dirent __user * lastdirent; struct getdents_callback buf; + int fput_needed; int error; - error = -EFAULT; if (!access_ok(VERIFY_WRITE, dirent, count)) - goto out; + return -EFAULT; - error = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) - goto out; + return -EBADF; buf.current_dir = dirent; buf.previous = NULL; @@ -221,8 +219,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, else error = count - buf.count; } - fput(file); -out: + fput_light(file, fput_needed); return error; } @@ -278,16 +275,15 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, struct file * file; struct linux_dirent64 __user * lastdirent; struct getdents_callback64 buf; + int fput_needed; int error; - error = -EFAULT; if (!access_ok(VERIFY_WRITE, dirent, count)) - goto out; + return -EFAULT; - error = -EBADF; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (!file) - goto out; + return -EBADF; buf.current_dir = dirent; buf.previous = NULL; @@ -305,7 +301,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } - fput(file); -out: + fput_light(file, fput_needed); return error; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 59d0687..a6d4268 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1592,13 +1592,12 @@ struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, (fh_type == 6) ? fid->raw[5] : 0); } -int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, - int need_parent) +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, + struct inode *parent) { - struct inode *inode = dentry->d_inode; int maxlen = *lenp; - if (need_parent && (maxlen < 5)) { + if (parent && (maxlen < 5)) { *lenp = 5; return 255; } else if (maxlen < 3) { @@ -1610,20 +1609,15 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); data[2] = inode->i_generation; *lenp = 3; - /* no room for directory info? return what we've stored so far */ - if (maxlen < 5 || !need_parent) - return 3; - - spin_lock(&dentry->d_lock); - inode = dentry->d_parent->d_inode; - data[3] = inode->i_ino; - data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); - *lenp = 5; - if (maxlen >= 6) { - data[5] = inode->i_generation; - *lenp = 6; - } - spin_unlock(&dentry->d_lock); + if (parent) { + data[3] = parent->i_ino; + data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id); + *lenp = 5; + if (maxlen >= 6) { + data[5] = parent->i_generation; + *lenp = 6; + } + } return *lenp; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index b1a0857..afcadcc 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1923,6 +1923,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, * the workqueue job (flush_async_commit) needs this lock */ reiserfs_write_unlock(sb); + + cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); flush_workqueue(commit_wq); if (!reiserfs_mounted_fs_count) { @@ -3231,8 +3233,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, th->t_trans_id, journal->j_trans_id); } - sb->s_dirt = 1; - prepared = test_clear_buffer_journal_prepared(bh); clear_buffer_journal_restore_dirty(bh); /* already in this transaction, we are done */ @@ -3316,6 +3316,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, journal->j_first = cn; journal->j_last = cn; } + reiserfs_schedule_old_flush(sb); return 0; } @@ -3492,7 +3493,7 @@ static void flush_async_commits(struct work_struct *work) ** flushes any old transactions to disk ** ends the current transaction if it is too old */ -int reiserfs_flush_old_commits(struct super_block *sb) +void reiserfs_flush_old_commits(struct super_block *sb) { time_t now; struct reiserfs_transaction_handle th; @@ -3502,9 +3503,8 @@ int reiserfs_flush_old_commits(struct super_block *sb) /* safety check so we don't flush while we are replaying the log during * mount */ - if (list_empty(&journal->j_journal_list)) { - return 0; - } + if (list_empty(&journal->j_journal_list)) + return; /* check the current transaction. If there are no writers, and it is * too old, finish it, and force the commit blocks to disk @@ -3526,7 +3526,6 @@ int reiserfs_flush_old_commits(struct super_block *sb) do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); } } - return sb->s_dirt; } /* @@ -3955,7 +3954,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, ** it tells us if we should continue with the journal_end, or just return */ if (!check_journal_end(th, sb, nblocks, flags)) { - sb->s_dirt = 1; + reiserfs_schedule_old_flush(sb); wake_queued_writers(sb); reiserfs_async_progress_wait(sb); goto out; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index a59d271..33215f5 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -480,6 +480,11 @@ struct reiserfs_sb_info { struct dentry *priv_root; /* root of /.reiserfs_priv */ struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ int j_errno; + + int work_queued; /* non-zero delayed work is queued */ + struct delayed_work old_work; /* old transactions flush delayed work */ + spinlock_t old_work_lock; /* protects old_work and work_queued */ + #ifdef CONFIG_QUOTA char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; @@ -2452,7 +2457,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); int reiserfs_commit_page(struct inode *inode, struct page *page, unsigned from, unsigned to); -int reiserfs_flush_old_commits(struct super_block *); +void reiserfs_flush_old_commits(struct super_block *); int reiserfs_commit_for_inode(struct inode *); int reiserfs_inode_needs_commit(struct inode *); void reiserfs_update_inode_transaction(struct inode *); @@ -2487,6 +2492,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, unsigned int); +void reiserfs_schedule_old_flush(struct super_block *s); void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate); int remove_save_link(struct inode *inode, int truncate); @@ -2611,8 +2617,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); -int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, - int connectable); +int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, + struct inode *parent); int reiserfs_truncate_file(struct inode *, int update_timestamps); void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset, diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 9a17f63..3ce02cf 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -200,7 +200,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) (bmap_nr_new - bmap_nr))); PUT_SB_BLOCK_COUNT(s, block_count_new); PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); - s->s_dirt = 1; journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c07b7d7..651ce76 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -72,20 +72,58 @@ static int reiserfs_sync_fs(struct super_block *s, int wait) if (!journal_begin(&th, s, 1)) if (!journal_end_sync(&th, s, 1)) reiserfs_flush_old_commits(s); - s->s_dirt = 0; /* Even if it's not true. - * We'll loop forever in sync_supers otherwise */ reiserfs_write_unlock(s); return 0; } -static void reiserfs_write_super(struct super_block *s) +static void flush_old_commits(struct work_struct *work) { + struct reiserfs_sb_info *sbi; + struct super_block *s; + + sbi = container_of(work, struct reiserfs_sb_info, old_work.work); + s = sbi->s_journal->j_work_sb; + + spin_lock(&sbi->old_work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); + reiserfs_sync_fs(s, 1); } +void reiserfs_schedule_old_flush(struct super_block *s) +{ + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + unsigned long delay; + + if (s->s_flags & MS_RDONLY) + return; + + spin_lock(&sbi->old_work_lock); + if (!sbi->work_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + queue_delayed_work(system_long_wq, &sbi->old_work, delay); + sbi->work_queued = 1; + } + spin_unlock(&sbi->old_work_lock); +} + +static void cancel_old_flush(struct super_block *s) +{ + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + spin_lock(&sbi->old_work_lock); + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); +} + static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; + + cancel_old_flush(s); + reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { int err = journal_begin(&th, s, 1); @@ -99,7 +137,6 @@ static int reiserfs_freeze(struct super_block *s) journal_end_sync(&th, s, 1); } } - s->s_dirt = 0; reiserfs_write_unlock(s); return 0; } @@ -483,9 +520,6 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_lock(s); - if (s->s_dirt) - reiserfs_write_super(s); - /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { @@ -692,7 +726,6 @@ static const struct super_operations reiserfs_sops = { .dirty_inode = reiserfs_dirty_inode, .evict_inode = reiserfs_evict_inode, .put_super = reiserfs_put_super, - .write_super = reiserfs_write_super, .sync_fs = reiserfs_sync_fs, .freeze_fs = reiserfs_freeze, .unfreeze_fs = reiserfs_unfreeze, @@ -1400,7 +1433,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) err = journal_end(&th, s, 10); if (err) goto out_err; - s->s_dirt = 0; if (!(*mount_flags & MS_RDONLY)) { dquot_resume(s, -1); @@ -1730,19 +1762,21 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return -ENOMEM; s->s_fs_info = sbi; /* Set default values for options: non-aggressive tails, RO on errors */ - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); + sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO); + sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); /* no preallocation minimum, be smart in reiserfs_file_write instead */ - REISERFS_SB(s)->s_alloc_options.preallocmin = 0; + sbi->s_alloc_options.preallocmin = 0; /* Preallocate by 16 blocks (17-1) at once */ - REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + sbi->s_alloc_options.preallocsize = 17; /* setup default block allocator options */ reiserfs_init_alloc_options(s); - mutex_init(&REISERFS_SB(s)->lock); - REISERFS_SB(s)->lock_depth = -1; + spin_lock_init(&sbi->old_work_lock); + INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits); + mutex_init(&sbi->lock); + sbi->lock_depth = -1; jdev_name = NULL; if (reiserfs_parse_options @@ -1751,8 +1785,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error_unlocked; } if (jdev_name && jdev_name[0]) { - REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); - if (!REISERFS_SB(s)->s_jdev) { + sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL); + if (!sbi->s_jdev) { SWARN(silent, s, "", "Cannot allocate memory for " "journal device name"); goto error; @@ -1810,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) /* make data=ordered the default */ if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && !reiserfs_data_writeback(s)) { - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); } if (reiserfs_data_log(s)) { @@ -2003,6 +2037,8 @@ error_unlocked: reiserfs_write_unlock(s); } + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); diff --git a/fs/signalfd.c b/fs/signalfd.c index 7ae2a57..9f35a37 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -269,12 +269,13 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, if (ufd < 0) kfree(ctx); } else { - struct file *file = fget(ufd); + int fput_needed; + struct file *file = fget_light(ufd, &fput_needed); if (!file) return -EBADF; ctx = file->private_data; if (file->f_op != &signalfd_fops) { - fput(file); + fput_light(file, fput_needed); return -EINVAL; } spin_lock_irq(¤t->sighand->siglock); @@ -282,7 +283,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, spin_unlock_irq(¤t->sighand->siglock); wake_up(¤t->sighand->signalfd_wqh); - fput(file); + fput_light(file, fput_needed); } return ufd; diff --git a/fs/splice.c b/fs/splice.c index 406ef2b..c9f1318 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1003,8 +1003,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ret = file_remove_suid(out); if (!ret) { - file_update_time(out); - ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + ret = file_update_time(out); + if (!ret) + ret = splice_from_pipe_feed(pipe, &sd, + pipe_to_file); } mutex_unlock(&inode->i_mutex); } while (ret > 0); diff --git a/fs/statfs.c b/fs/statfs.c index 43e6b6f..95ad5c0 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -87,11 +87,12 @@ int user_statfs(const char __user *pathname, struct kstatfs *st) int fd_statfs(int fd, struct kstatfs *st) { - struct file *file = fget(fd); + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); int error = -EBADF; if (file) { error = vfs_statfs(&file->f_path, st); - fput(file); + fput_light(file, fput_needed); } return error; } @@ -188,11 +188,12 @@ static int do_fsync(unsigned int fd, int datasync) { struct file *file; int ret = -EBADF; + int fput_needed; - file = fget(fd); + file = fget_light(fd, &fput_needed); if (file) { ret = vfs_fsync(file, datasync); - fput(file); + fput_light(file, fput_needed); } return ret; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 62a2727..a6d42ef 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1127,16 +1127,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - stat->dev = inode->i_sb->s_dev; - stat->ino = inode->i_ino; - stat->mode = inode->i_mode; - stat->nlink = inode->i_nlink; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; - stat->rdev = inode->i_rdev; - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + generic_fillattr(inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index a165c66..1802417 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1260,16 +1260,15 @@ static struct dentry *udf_fh_to_parent(struct super_block *sb, fid->udf.parent_partref, fid->udf.parent_generation); } -static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, - int connectable) +static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp, + struct inode *parent) { int len = *lenp; - struct inode *inode = de->d_inode; struct kernel_lb_addr location = UDF_I(inode)->i_location; struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; - if (connectable && (len < 5)) { + if (parent && (len < 5)) { *lenp = 5; return 255; } else if (len < 3) { @@ -1282,14 +1281,11 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, fid->udf.partref = location.partitionReferenceNum; fid->udf.generation = inode->i_generation; - if (connectable && !S_ISDIR(inode->i_mode)) { - spin_lock(&de->d_lock); - inode = de->d_parent->d_inode; - location = UDF_I(inode)->i_location; + if (parent) { + location = UDF_I(parent)->i_location; fid->udf.parent_block = location.logicalBlockNum; fid->udf.parent_partref = location.partitionReferenceNum; fid->udf.parent_generation = inode->i_generation; - spin_unlock(&de->d_lock); *lenp = 5; type = FILEID_UDF_WITH_PARENT; } diff --git a/fs/utimes.c b/fs/utimes.c index ba653f3..fa4dbe4 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -140,18 +140,19 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times, goto out; if (filename == NULL && dfd != AT_FDCWD) { + int fput_needed; struct file *file; if (flags & AT_SYMLINK_NOFOLLOW) goto out; - file = fget(dfd); + file = fget_light(dfd, &fput_needed); error = -EBADF; if (!file) goto out; error = utimes_common(&file->f_path, times); - fput(file); + fput_light(file, fput_needed); } else { struct path path; int lookup_flags = 0; @@ -399,11 +399,12 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { + int fput_needed; struct file *f; struct dentry *dentry; int error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; dentry = f->f_path.dentry; @@ -413,7 +414,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, error = setxattr(dentry, name, value, size, flags); mnt_drop_write_file(f); } - fput(f); + fput_light(f, fput_needed); return error; } @@ -486,15 +487,16 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { + int fput_needed; struct file *f; ssize_t error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; audit_inode(NULL, f->f_path.dentry); error = getxattr(f->f_path.dentry, name, value, size); - fput(f); + fput_light(f, fput_needed); return error; } @@ -566,15 +568,16 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { + int fput_needed; struct file *f; ssize_t error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; audit_inode(NULL, f->f_path.dentry); error = listxattr(f->f_path.dentry, list, size); - fput(f); + fput_light(f, fput_needed); return error; } @@ -634,11 +637,12 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { + int fput_needed; struct file *f; struct dentry *dentry; int error = -EBADF; - f = fget(fd); + f = fget_light(fd, &fput_needed); if (!f) return error; dentry = f->f_path.dentry; @@ -648,7 +652,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) error = removexattr(dentry, name); mnt_drop_write_file(f); } - fput(f); + fput_light(f, fput_needed); return error; } diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a907de5..4a7286c 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) } void * -kmem_alloc(size_t size, unsigned int __nocast flags) +kmem_alloc(size_t size, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); @@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) } void * -kmem_zalloc(size_t size, unsigned int __nocast flags) +kmem_zalloc(size_t size, xfs_km_flags_t flags) { void *ptr; @@ -87,7 +87,7 @@ kmem_free(const void *ptr) void * kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - unsigned int __nocast flags) + xfs_km_flags_t flags) { void *new; @@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, } void * -kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) +kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); @@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) } void * -kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) { void *ptr; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index ab7c53f..b2f2620 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -27,10 +27,11 @@ * General memory allocation interfaces */ -#define KM_SLEEP 0x0001u -#define KM_NOSLEEP 0x0002u -#define KM_NOFS 0x0004u -#define KM_MAYFAIL 0x0008u +typedef unsigned __bitwise xfs_km_flags_t; +#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) +#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) +#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) +#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) /* * We use a special process flag to avoid recursive callbacks into @@ -38,7 +39,7 @@ * warnings, so we explicitly skip any generic ones (silly of us). */ static inline gfp_t -kmem_flags_convert(unsigned int __nocast flags) +kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; @@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags) return lflags; } -extern void *kmem_alloc(size_t, unsigned int __nocast); -extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); +extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc(size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); static inline void *kmem_zalloc_large(size_t size) @@ -107,7 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone) kmem_cache_destroy(zone); } -extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); -extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); +extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); +extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t); #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 2d25d19..4267922 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -52,19 +52,18 @@ static int xfs_fileid_length(int fileid_type) STATIC int xfs_fs_encode_fh( - struct dentry *dentry, - __u32 *fh, - int *max_len, - int connectable) + struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) { struct fid *fid = (struct fid *)fh; struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; - struct inode *inode = dentry->d_inode; int fileid_type; int len; /* Directories don't need their parent encoded, they have ".." */ - if (S_ISDIR(inode->i_mode) || !connectable) + if (!parent) fileid_type = FILEID_INO32_GEN; else fileid_type = FILEID_INO32_GEN_PARENT; @@ -96,20 +95,16 @@ xfs_fs_encode_fh( switch (fileid_type) { case FILEID_INO32_GEN_PARENT: - spin_lock(&dentry->d_lock); - fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; - fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; - spin_unlock(&dentry->d_lock); + fid->i32.parent_ino = XFS_I(parent)->i_ino; + fid->i32.parent_gen = parent->i_generation; /*FALLTHRU*/ case FILEID_INO32_GEN: fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; break; case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: - spin_lock(&dentry->d_lock); - fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; - fid64->parent_gen = dentry->d_parent->d_inode->i_generation; - spin_unlock(&dentry->d_lock); + fid64->parent_ino = XFS_I(parent)->i_ino; + fid64->parent_gen = parent->i_generation; /*FALLTHRU*/ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: fid64->ino = XFS_I(inode)->i_ino; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8d214b8..9f7ec15 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -586,8 +586,11 @@ restart: * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ - if (likely(!(file->f_mode & FMODE_NOCMTIME))) - file_update_time(file); + if (likely(!(file->f_mode & FMODE_NOCMTIME))) { + error = file_update_time(file); + if (error) + return error; + } /* * If we're writing the file then make sure to clear the setuid and diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6b965bf..f30d980 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3152,7 +3152,7 @@ xlog_ticket_alloc( int cnt, char client, bool permanent, - int alloc_flags) + xfs_km_flags_t alloc_flags) { struct xlog_ticket *tic; uint num_headers; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 735ff1e..5bc3326 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -555,7 +555,7 @@ extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, int count, char client, bool permanent, - int alloc_flags); + xfs_km_flags_t alloc_flags); static inline void diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index cdf896f..fdf3245 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -584,7 +584,7 @@ xfs_trans_t * _xfs_trans_alloc( xfs_mount_t *mp, uint type, - uint memflags) + xfs_km_flags_t memflags) { xfs_trans_t *tp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 7ab99e1..7c37b53 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -443,7 +443,7 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces. */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); diff --git a/include/asm-generic/posix_types.h b/include/asm-generic/posix_types.h index 91d44bd..fe74fcc 100644 --- a/include/asm-generic/posix_types.h +++ b/include/asm-generic/posix_types.h @@ -23,10 +23,6 @@ typedef __kernel_ulong_t __kernel_ino_t; typedef unsigned int __kernel_mode_t; #endif -#ifndef __kernel_nlink_t -typedef __kernel_ulong_t __kernel_nlink_t; -#endif - #ifndef __kernel_pid_t typedef int __kernel_pid_t; #endif diff --git a/include/linux/errno.h b/include/linux/errno.h index 2d09bfa..e0de516 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -17,6 +17,7 @@ #define ENOIOCTLCMD 515 /* No ioctl command */ #define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */ #define EPROBE_DEFER 517 /* Driver requests probe retry */ +#define EOPENSTALE 518 /* open found a stale dentry */ /* Defined for the NFSv3 protocol */ #define EBADHANDLE 521 /* Illegal NFS file handle */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3a4cef5..12291a7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -165,8 +165,8 @@ struct fid { */ struct export_operations { - int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len, - int connectable); + int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent); struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid, diff --git a/include/linux/fs.h b/include/linux/fs.h index 40887af..51978ed 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1692,6 +1692,7 @@ struct inode_operations { int (*removexattr) (struct dentry *, const char *); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); + int (*update_time)(struct inode *, struct timespec *, int); } ____cacheline_aligned; struct seq_file; @@ -1850,6 +1851,13 @@ static inline void inode_inc_iversion(struct inode *inode) spin_unlock(&inode->i_lock); } +enum file_time_flags { + S_ATIME = 1, + S_MTIME = 2, + S_CTIME = 4, + S_VERSION = 8, +}; + extern void touch_atime(struct path *); static inline void file_accessed(struct file *file) { @@ -2583,7 +2591,7 @@ extern int inode_change_ok(const struct inode *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); extern void setattr_copy(struct inode *inode, const struct iattr *attr); -extern void file_update_time(struct file *file); +extern int file_update_time(struct file *file); extern int generic_show_options(struct seq_file *m, struct dentry *root); extern void save_mount_options(struct super_block *sb, char *options); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 91d0e0a3..63d966d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -60,7 +60,7 @@ #define FS_EVENTS_POSS_ON_CHILD (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\ FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\ - FS_DELETE) + FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM) #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 912c30a..f334c7f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -31,6 +31,7 @@ #include <linux/mutex.h> #include <linux/timer.h> #include <linux/slab.h> +#include <crypto/hash.h> #endif #define journal_oom_retry 1 @@ -147,12 +148,24 @@ typedef struct journal_header_s #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 +#define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: + * + * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* + * fields are used to store a checksum of the descriptor and data blocks. + * + * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum + * field is used to store crc32c(uuid+commit_block). Each journal metadata + * block gets its own checksum, and data block checksums are stored in + * journal_block_tag (in the descriptor). The other h_chksum* fields are + * not used. + * + * Checksum v1 and v2 are mutually exclusive features. */ struct commit_header { __be32 h_magic; @@ -175,13 +188,19 @@ struct commit_header { typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ - __be32 t_flags; /* See below */ + __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ + __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; #define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high)) #define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t)) +/* Tail of descriptor block, for checksumming */ +struct jbd2_journal_block_tail { + __be32 t_checksum; /* crc32c(uuid+descr_block) */ +}; + /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log @@ -192,6 +211,10 @@ typedef struct jbd2_journal_revoke_header_s __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; +/* Tail of revoke block, for checksumming */ +struct jbd2_journal_revoke_tail { + __be32 r_checksum; /* crc32c(uuid+revoke_block) */ +}; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ @@ -241,7 +264,10 @@ typedef struct journal_superblock_s __be32 s_max_trans_data; /* Limit of data blocks per trans. */ /* 0x0050 */ - __u32 s_padding[44]; + __u8 s_checksum_type; /* checksum type */ + __u8 s_padding2[3]; + __u32 s_padding[42]; + __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ @@ -263,13 +289,15 @@ typedef struct journal_superblock_s #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 +#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ + JBD2_FEATURE_INCOMPAT_CSUM_V2) #ifdef __KERNEL__ @@ -939,6 +967,12 @@ struct journal_s * superblock pointer here */ void *j_private; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *j_chksum_driver; + + /* Precomputed journal UUID checksum for seeding other checksums */ + __u32 j_csum_seed; }; /* @@ -1268,6 +1302,25 @@ static inline int jbd_space_needed(journal_t *journal) extern int jbd_blocks_per_page(struct inode *inode); +static inline u32 jbd2_chksum(journal_t *journal, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; + } desc; + int err; + + desc.shash.tfm = journal->j_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 6230f85..6133679 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h @@ -12,6 +12,7 @@ enum jbd_state_bits { BH_State, /* Pins most journal_head state */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ + BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; @@ -24,6 +25,7 @@ TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) +BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 87f402c..f01e5f6 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -23,28 +23,17 @@ #include <linux/lockdep.h> #include <linux/percpu.h> #include <linux/cpu.h> +#include <linux/notifier.h> /* can make br locks by using local lock for read side, global lock for write */ -#define br_lock_init(name) name##_lock_init() -#define br_read_lock(name) name##_local_lock() -#define br_read_unlock(name) name##_local_unlock() -#define br_write_lock(name) name##_global_lock_online() -#define br_write_unlock(name) name##_global_unlock_online() +#define br_lock_init(name) lg_lock_init(name, #name) +#define br_read_lock(name) lg_local_lock(name) +#define br_read_unlock(name) lg_local_unlock(name) +#define br_write_lock(name) lg_global_lock(name) +#define br_write_unlock(name) lg_global_unlock(name) -#define DECLARE_BRLOCK(name) DECLARE_LGLOCK(name) #define DEFINE_BRLOCK(name) DEFINE_LGLOCK(name) - -#define lg_lock_init(name) name##_lock_init() -#define lg_local_lock(name) name##_local_lock() -#define lg_local_unlock(name) name##_local_unlock() -#define lg_local_lock_cpu(name, cpu) name##_local_lock_cpu(cpu) -#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu) -#define lg_global_lock(name) name##_global_lock() -#define lg_global_unlock(name) name##_global_unlock() -#define lg_global_lock_online(name) name##_global_lock_online() -#define lg_global_unlock_online(name) name##_global_unlock_online() - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define LOCKDEP_INIT_MAP lockdep_init_map @@ -59,142 +48,26 @@ #define DEFINE_LGLOCK_LOCKDEP(name) #endif - -#define DECLARE_LGLOCK(name) \ - extern void name##_lock_init(void); \ - extern void name##_local_lock(void); \ - extern void name##_local_unlock(void); \ - extern void name##_local_lock_cpu(int cpu); \ - extern void name##_local_unlock_cpu(int cpu); \ - extern void name##_global_lock(void); \ - extern void name##_global_unlock(void); \ - extern void name##_global_lock_online(void); \ - extern void name##_global_unlock_online(void); \ +struct lglock { + arch_spinlock_t __percpu *lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lock_key; + struct lockdep_map lock_dep_map; +#endif +}; #define DEFINE_LGLOCK(name) \ - \ - DEFINE_SPINLOCK(name##_cpu_lock); \ - cpumask_t name##_cpus __read_mostly; \ - DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ - DEFINE_LGLOCK_LOCKDEP(name); \ - \ - static int \ - name##_lg_cpu_callback(struct notifier_block *nb, \ - unsigned long action, void *hcpu) \ - { \ - switch (action & ~CPU_TASKS_FROZEN) { \ - case CPU_UP_PREPARE: \ - spin_lock(&name##_cpu_lock); \ - cpu_set((unsigned long)hcpu, name##_cpus); \ - spin_unlock(&name##_cpu_lock); \ - break; \ - case CPU_UP_CANCELED: case CPU_DEAD: \ - spin_lock(&name##_cpu_lock); \ - cpu_clear((unsigned long)hcpu, name##_cpus); \ - spin_unlock(&name##_cpu_lock); \ - } \ - return NOTIFY_OK; \ - } \ - static struct notifier_block name##_lg_cpu_notifier = { \ - .notifier_call = name##_lg_cpu_callback, \ - }; \ - void name##_lock_init(void) { \ - int i; \ - LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ - } \ - register_hotcpu_notifier(&name##_lg_cpu_notifier); \ - get_online_cpus(); \ - for_each_online_cpu(i) \ - cpu_set(i, name##_cpus); \ - put_online_cpus(); \ - } \ - EXPORT_SYMBOL(name##_lock_init); \ - \ - void name##_local_lock(void) { \ - arch_spinlock_t *lock; \ - preempt_disable(); \ - rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \ - lock = &__get_cpu_var(name##_lock); \ - arch_spin_lock(lock); \ - } \ - EXPORT_SYMBOL(name##_local_lock); \ - \ - void name##_local_unlock(void) { \ - arch_spinlock_t *lock; \ - rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \ - lock = &__get_cpu_var(name##_lock); \ - arch_spin_unlock(lock); \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_local_unlock); \ - \ - void name##_local_lock_cpu(int cpu) { \ - arch_spinlock_t *lock; \ - preempt_disable(); \ - rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \ - lock = &per_cpu(name##_lock, cpu); \ - arch_spin_lock(lock); \ - } \ - EXPORT_SYMBOL(name##_local_lock_cpu); \ - \ - void name##_local_unlock_cpu(int cpu) { \ - arch_spinlock_t *lock; \ - rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \ - lock = &per_cpu(name##_lock, cpu); \ - arch_spin_unlock(lock); \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_local_unlock_cpu); \ - \ - void name##_global_lock_online(void) { \ - int i; \ - spin_lock(&name##_cpu_lock); \ - rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_cpu(i, &name##_cpus) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_lock(lock); \ - } \ - } \ - EXPORT_SYMBOL(name##_global_lock_online); \ - \ - void name##_global_unlock_online(void) { \ - int i; \ - rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_cpu(i, &name##_cpus) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_unlock(lock); \ - } \ - spin_unlock(&name##_cpu_lock); \ - } \ - EXPORT_SYMBOL(name##_global_unlock_online); \ - \ - void name##_global_lock(void) { \ - int i; \ - preempt_disable(); \ - rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_lock(lock); \ - } \ - } \ - EXPORT_SYMBOL(name##_global_lock); \ - \ - void name##_global_unlock(void) { \ - int i; \ - rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_possible_cpu(i) { \ - arch_spinlock_t *lock; \ - lock = &per_cpu(name##_lock, i); \ - arch_spin_unlock(lock); \ - } \ - preempt_enable(); \ - } \ - EXPORT_SYMBOL(name##_global_unlock); + DEFINE_LGLOCK_LOCKDEP(name); \ + DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ + = __ARCH_SPIN_LOCK_UNLOCKED; \ + struct lglock name = { .lock = &name ## _lock } + +void lg_lock_init(struct lglock *lg, char *name); +void lg_local_lock(struct lglock *lg); +void lg_local_unlock(struct lglock *lg); +void lg_local_lock_cpu(struct lglock *lg, int cpu); +void lg_local_unlock_cpu(struct lglock *lg, int cpu); +void lg_global_lock(struct lglock *lg); +void lg_global_unlock(struct lglock *lg); + #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index ce26716..b36d08c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1392,7 +1392,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap(struct file *, unsigned long, +extern unsigned long do_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern int do_munmap(struct mm_struct *, unsigned long, size_t); diff --git a/include/linux/security.h b/include/linux/security.h index ab0e091..4e5a73c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -86,9 +86,9 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name, extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); -extern int cap_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only); +extern int cap_mmap_addr(unsigned long addr); +extern int cap_mmap_file(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags); extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); @@ -586,15 +586,17 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * simple integer value. When @arg represents a user space pointer, it * should never be used by the security module. * Return 0 if permission is granted. - * @file_mmap : + * @mmap_addr : + * Check permissions for a mmap operation at @addr. + * @addr contains virtual address that will be used for the operation. + * Return 0 if permission is granted. + * @mmap_file : * Check permissions for a mmap operation. The @file may be NULL, e.g. * if mapping anonymous memory. * @file contains the file structure for file to map (may be NULL). * @reqprot contains the protection requested by the application. * @prot contains the protection that will be applied by the kernel. * @flags contains the operational flags. - * @addr contains virtual address that will be used for the operation. - * @addr_only contains a boolean: 0 if file-backed VMA, otherwise 1. * Return 0 if permission is granted. * @file_mprotect: * Check permissions before changing memory access permissions. @@ -1481,10 +1483,10 @@ struct security_operations { void (*file_free_security) (struct file *file); int (*file_ioctl) (struct file *file, unsigned int cmd, unsigned long arg); - int (*file_mmap) (struct file *file, + int (*mmap_addr) (unsigned long addr); + int (*mmap_file) (struct file *file, unsigned long reqprot, unsigned long prot, - unsigned long flags, unsigned long addr, - unsigned long addr_only); + unsigned long flags); int (*file_mprotect) (struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot); @@ -1743,9 +1745,9 @@ int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_free(struct file *file); int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int security_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only); +int security_mmap_file(struct file *file, unsigned long prot, + unsigned long flags); +int security_mmap_addr(unsigned long addr); int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot); int security_file_lock(struct file *file, unsigned int cmd); @@ -2181,13 +2183,15 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd, return 0; } -static inline int security_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, - unsigned long flags, - unsigned long addr, - unsigned long addr_only) +static inline int security_mmap_file(struct file *file, unsigned long prot, + unsigned long flags) +{ + return 0; +} + +static inline int security_mmap_addr(unsigned long addr) { - return cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); + return cap_mmap_addr(addr); } static inline int security_file_mprotect(struct vm_area_struct *vma, diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 51b29ac..40e0a27 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -232,7 +232,6 @@ struct svc_rqst { struct svc_pool * rq_pool; /* thread pool */ struct svc_procedure * rq_procinfo; /* procedure info */ struct auth_ops * rq_authop; /* authentication flavour */ - u32 rq_flavor; /* pseudoflavor */ struct svc_cred rq_cred; /* auth info */ void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ @@ -416,6 +415,7 @@ struct svc_procedure { */ int svc_rpcb_setup(struct svc_serv *serv, struct net *net); void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); +int svc_bind(struct svc_serv *serv, struct net *net); struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *, struct net *net)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 2c54683..dd74084 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -15,13 +15,23 @@ #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/cache.h> #include <linux/hash.h> +#include <linux/cred.h> struct svc_cred { uid_t cr_uid; gid_t cr_gid; struct group_info *cr_group_info; + u32 cr_flavor; /* pseudoflavor */ + char *cr_principal; /* for gss */ }; +static inline void free_svc_cred(struct svc_cred *cred) +{ + if (cred->cr_group_info) + put_group_info(cred->cr_group_info); + kfree(cred->cr_principal); +} + struct svc_rqst; /* forward decl */ struct in6_addr; diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index 7c32daa..726aff1 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -22,7 +22,6 @@ int gss_svc_init_net(struct net *net); void gss_svc_shutdown_net(struct net *net); int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); u32 svcauth_gss_flavor(struct auth_domain *dom); -char *svc_gss_principal(struct svc_rqst *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ diff --git a/include/linux/types.h b/include/linux/types.h index 7f480db..9c1bd53 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -25,7 +25,7 @@ typedef __kernel_dev_t dev_t; typedef __kernel_ino_t ino_t; typedef __kernel_mode_t mode_t; typedef unsigned short umode_t; -typedef __kernel_nlink_t nlink_t; +typedef __u32 nlink_t; typedef __kernel_off_t off_t; typedef __kernel_pid_t pid_t; typedef __kernel_daddr_t daddr_t; @@ -1036,6 +1036,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) sfd->file = shp->shm_file; sfd->vm_ops = NULL; + err = security_mmap_file(file, prot, flags); + if (err) + goto out_fput; + down_write(¤t->mm->mmap_sem); if (addr && !(shmflg & SHM_REMAP)) { err = -EINVAL; @@ -1050,7 +1054,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) goto invalid; } - user_addr = do_mmap (file, addr, size, prot, flags, 0); + user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); *raddr = user_addr; err = 0; if (IS_ERR_VALUE(user_addr)) @@ -1058,6 +1062,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) invalid: up_write(¤t->mm->mmap_sem); +out_fput: fput(file); out_nattch: diff --git a/kernel/Makefile b/kernel/Makefile index 6f3d0ae..c0cc67a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ - async.o range.o groups.o + async.o range.o groups.o lglock.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/lglock.c b/kernel/lglock.c new file mode 100644 index 0000000..6535a66 --- /dev/null +++ b/kernel/lglock.c @@ -0,0 +1,89 @@ +/* See include/linux/lglock.h for description */ +#include <linux/module.h> +#include <linux/lglock.h> +#include <linux/cpu.h> +#include <linux/string.h> + +/* + * Note there is no uninit, so lglocks cannot be defined in + * modules (but it's fine to use them from there) + * Could be added though, just undo lg_lock_init + */ + +void lg_lock_init(struct lglock *lg, char *name) +{ + LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); +} +EXPORT_SYMBOL(lg_lock_init); + +void lg_local_lock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock); + +void lg_local_unlock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock); + +void lg_local_lock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + preempt_disable(); + rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock_cpu); + +void lg_local_unlock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock_cpu); + +void lg_global_lock(struct lglock *lg) +{ + int i; + + preempt_disable(); + rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_lock(lock); + } +} +EXPORT_SYMBOL(lg_global_lock); + +void lg_global_unlock(struct lglock *lg) +{ + int i; + + rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_unlock(lock); + } + preempt_enable(); +} +EXPORT_SYMBOL(lg_global_unlock); diff --git a/mm/cleancache.c b/mm/cleancache.c index 5646c74..32e6f41 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); static int cleancache_get_key(struct inode *inode, struct cleancache_filekey *key) { - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); int len = 0, maxlen = CLEANCACHE_KEY_MAX; struct super_block *sb = inode->i_sb; @@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode, if (sb->s_export_op != NULL) { fhfn = sb->s_export_op->encode_fh; if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); + len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); if (len <= 0 || len == 255) return -1; if (maxlen > CLEANCACHE_KEY_MAX) diff --git a/mm/filemap.c b/mm/filemap.c index 64b48f9..a4a5260 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1899,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page); -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - umode_t mode = dentry->d_inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - -static int __remove_suid(struct dentry *dentry, int kill) -{ - struct iattr newattrs; - - newattrs.ia_valid = ATTR_FORCE | kill; - return notify_change(dentry, &newattrs); -} - -int file_remove_suid(struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - int killsuid; - int killpriv; - int error = 0; - - /* Fast path for nothing security related */ - if (IS_NOSEC(inode)) - return 0; - - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; - - return error; -} -EXPORT_SYMBOL(file_remove_suid); - static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { @@ -2489,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - file_update_time(file); + err = file_update_time(file); + if (err) + goto out; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index a4eb311..213ca1f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (ret) goto out_backing; - file_update_time(filp); + ret = file_update_time(filp); + if (ret) + goto out_backing; ret = __xip_file_write (filp, buf, count, pos, ppos); diff --git a/mm/internal.h b/mm/internal.h index 4194ab9..5cbb781 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -350,3 +350,7 @@ extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; + +extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); @@ -971,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint) * The caller must hold down_write(¤t->mm->mmap_sem). */ -static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { struct mm_struct * mm = current->mm; struct inode *inode; vm_flags_t vm_flags; - int error; - unsigned long reqprot = prot; /* * Does the application expect PROT_READ to imply PROT_EXEC? @@ -1101,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } } - error = security_file_mmap(file, reqprot, prot, flags, addr, 0); - if (error) - return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff); } -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - if (unlikely(offset + PAGE_ALIGN(len) < offset)) - return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) - return -EINVAL; - return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -} -EXPORT_SYMBOL(do_mmap); - -unsigned long vm_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flag, offset); - up_write(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(vm_mmap); - SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) @@ -1165,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); out: @@ -1629,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr & ~PAGE_MASK) return -EINVAL; - return arch_rebalance_pgtables(addr, len); + addr = arch_rebalance_pgtables(addr, len); + error = security_mmap_addr(addr); + return error ? error : addr; } EXPORT_SYMBOL(get_unmapped_area); @@ -1819,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; address &= PAGE_MASK; - error = security_file_mmap(NULL, 0, 0, 0, address, 1); + error = security_mmap_addr(address); if (error) return error; @@ -2159,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) return 0; } -EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long start, size_t len) { @@ -2207,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) if (!len) return addr; - error = security_file_mmap(NULL, 0, 0, 0, addr, 1); - if (error) - return error; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); @@ -2563,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; - ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); - if (ret) - goto out; - ret = insert_vm_struct(mm, vma); if (ret) goto out; diff --git a/mm/mremap.c b/mm/mremap.c index db8d983..21fed20 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - ret = do_munmap(mm, new_addr, new_len); if (ret) goto out; @@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise * This option implies MREMAP_MAYMOVE. */ -unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; + down_write(¤t->mm->mmap_sem); + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) goto out; @@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr, goto out; } - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); - return ret; -} - -SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, - unsigned long, new_len, unsigned long, flags, - unsigned long, new_addr) -{ - unsigned long ret; - - down_write(¤t->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); up_write(¤t->mm->mmap_sem); return ret; } @@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file, unsigned long *_capabilities) { unsigned long capabilities, rlen; - unsigned long reqprot = prot; int ret; /* do the simple checks first */ @@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file, } /* allow the security API to have its say */ - ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); + ret = security_mmap_addr(addr); if (ret < 0) return ret; @@ -1233,7 +1232,7 @@ enomem: /* * handle mapping creation for uClinux */ -static unsigned long do_mmap_pgoff(struct file *file, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1471,32 +1470,6 @@ error_getting_region: return -ENOMEM; } -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - if (unlikely(offset + PAGE_ALIGN(len) < offset)) - return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) - return -EINVAL; - return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -} -EXPORT_SYMBOL(do_mmap); - -unsigned long vm_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flag, offset); - up_write(&mm->mmap_sem); - return ret; -} -EXPORT_SYMBOL(vm_mmap); - SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) @@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); + ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); @@ -2439,11 +2439,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, return dentry; } -static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, - int connectable) +static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, + struct inode *parent) { - struct inode *inode = dentry->d_inode; - if (*len < 3) { *len = 3; return 255; @@ -4,6 +4,7 @@ #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/security.h> #include <asm/uaccess.h> #include "internal.h" @@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); +unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) +{ + unsigned long ret; + struct mm_struct *mm = current->mm; + + ret = security_mmap_file(file, prot, flag); + if (!ret) { + down_write(&mm->mmap_sem); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); + up_write(&mm->mmap_sem); + } + return ret; +} + +unsigned long vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + if (unlikely(offset + PAGE_ALIGN(len) < offset)) + return -EINVAL; + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + + return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +} +EXPORT_SYMBOL(vm_mmap); + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 8522a47..ca8e0a5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -16,8 +16,6 @@ #include <net/netlink.h> #include <net/pkt_sched.h> -extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ - /* * The ATM queuing discipline provides a framework for invoking classifiers * (aka "filters"), which in turn select classes of this queuing discipline. diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 38f388c..107c452 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -381,21 +381,53 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) } /* - * We cannot currently handle tokens with rotated data. We need a - * generalized routine to rotate the data in place. It is anticipated - * that we won't encounter rotated data in the general case. + * We can shift data by up to LOCAL_BUF_LEN bytes in a pass. If we need + * to do more than that, we shift repeatedly. Kevin Coffman reports + * seeing 28 bytes as the value used by Microsoft clients and servers + * with AES, so this constant is chosen to allow handling 28 in one pass + * without using too much stack space. + * + * If that proves to a problem perhaps we could use a more clever + * algorithm. */ -static u32 -rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc) +#define LOCAL_BUF_LEN 32u + +static void rotate_buf_a_little(struct xdr_buf *buf, unsigned int shift) { - unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN); + char head[LOCAL_BUF_LEN]; + char tmp[LOCAL_BUF_LEN]; + unsigned int this_len, i; + + BUG_ON(shift > LOCAL_BUF_LEN); - if (realrrc == 0) - return 0; + read_bytes_from_xdr_buf(buf, 0, head, shift); + for (i = 0; i + shift < buf->len; i += LOCAL_BUF_LEN) { + this_len = min(LOCAL_BUF_LEN, buf->len - (i + shift)); + read_bytes_from_xdr_buf(buf, i+shift, tmp, this_len); + write_bytes_to_xdr_buf(buf, i, tmp, this_len); + } + write_bytes_to_xdr_buf(buf, buf->len - shift, head, shift); +} - dprintk("%s: cannot process token with rotated data: " - "rrc %u, realrrc %u\n", __func__, rrc, realrrc); - return 1; +static void _rotate_left(struct xdr_buf *buf, unsigned int shift) +{ + int shifted = 0; + int this_shift; + + shift %= buf->len; + while (shifted < shift) { + this_shift = min(shift - shifted, LOCAL_BUF_LEN); + rotate_buf_a_little(buf, this_shift); + shifted += this_shift; + } +} + +static void rotate_left(u32 base, struct xdr_buf *buf, unsigned int shift) +{ + struct xdr_buf subbuf; + + xdr_buf_subsegment(buf, &subbuf, base, buf->len - base); + _rotate_left(&subbuf, shift); } static u32 @@ -495,11 +527,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf) seqnum = be64_to_cpup((__be64 *)(ptr + 8)); - if (rrc != 0) { - err = rotate_left(kctx, offset, buf, rrc); - if (err) - return GSS_S_FAILURE; - } + if (rrc != 0) + rotate_left(offset + 16, buf, rrc); err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf, &headskip, &tailskip); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 3089de3..73e9573 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -336,7 +336,6 @@ struct rsc { struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; - char *client_name; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); @@ -347,9 +346,7 @@ static void rsc_free(struct rsc *rsci) kfree(rsci->handle.data); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); - if (rsci->cred.cr_group_info) - put_group_info(rsci->cred.cr_group_info); - kfree(rsci->client_name); + free_svc_cred(&rsci->cred); } static void rsc_put(struct kref *ref) @@ -387,7 +384,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp) tmp->handle.data = NULL; new->mechctx = NULL; new->cred.cr_group_info = NULL; - new->client_name = NULL; + new->cred.cr_principal = NULL; } static void @@ -402,8 +399,8 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp) spin_lock_init(&new->seqdata.sd_lock); new->cred = tmp->cred; tmp->cred.cr_group_info = NULL; - new->client_name = tmp->client_name; - tmp->client_name = NULL; + new->cred.cr_principal = tmp->cred.cr_principal; + tmp->cred.cr_principal = NULL; } static struct cache_head * @@ -501,8 +498,8 @@ static int rsc_parse(struct cache_detail *cd, /* get client name */ len = qword_get(&mesg, buf, mlen); if (len > 0) { - rsci.client_name = kstrdup(buf, GFP_KERNEL); - if (!rsci.client_name) + rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL); + if (!rsci.cred.cr_principal) goto out; } @@ -932,16 +929,6 @@ struct gss_svc_data { struct rsc *rsci; }; -char *svc_gss_principal(struct svc_rqst *rqstp) -{ - struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data; - - if (gd && gd->rsci) - return gd->rsci->client_name; - return NULL; -} -EXPORT_SYMBOL_GPL(svc_gss_principal); - static int svcauth_gss_set_client(struct svc_rqst *rqstp) { @@ -1220,7 +1207,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) } svcdata->rsci = rsci; cache_get(&rsci->h); - rqstp->rq_flavor = gss_svc_to_pseudoflavor( + rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( rsci->mechctx->mech_type, gc->gc_svc); ret = SVC_OK; goto out; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 3c06534..92509ff 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -180,14 +180,16 @@ void rpcb_put_local(struct net *net) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt = sn->rpcb_local_clnt; struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4; - int shutdown; + int shutdown = 0; spin_lock(&sn->rpcb_clnt_lock); - if (--sn->rpcb_users == 0) { - sn->rpcb_local_clnt = NULL; - sn->rpcb_local_clnt4 = NULL; + if (sn->rpcb_users) { + if (--sn->rpcb_users == 0) { + sn->rpcb_local_clnt = NULL; + sn->rpcb_local_clnt4 = NULL; + } + shutdown = !sn->rpcb_users; } - shutdown = !sn->rpcb_users; spin_unlock(&sn->rpcb_clnt_lock); if (shutdown) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 017c011..7e9baaa 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -407,6 +407,14 @@ static int svc_uses_rpcbind(struct svc_serv *serv) return 0; } +int svc_bind(struct svc_serv *serv, struct net *net) +{ + if (!svc_uses_rpcbind(serv)) + return 0; + return svc_rpcb_setup(serv, net); +} +EXPORT_SYMBOL_GPL(svc_bind); + /* * Create an RPC service */ @@ -471,15 +479,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, spin_lock_init(&pool->sp_lock); } - if (svc_uses_rpcbind(serv)) { - if (svc_rpcb_setup(serv, current->nsproxy->net_ns) < 0) { - kfree(serv->sv_pools); - kfree(serv); - return NULL; - } - if (!serv->sv_shutdown) - serv->sv_shutdown = svc_rpcb_cleanup; - } + if (svc_uses_rpcbind(serv) && (!serv->sv_shutdown)) + serv->sv_shutdown = svc_rpcb_cleanup; return serv; } @@ -536,8 +537,6 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net); void svc_destroy(struct svc_serv *serv) { - struct net *net = current->nsproxy->net_ns; - dprintk("svc: svc_destroy(%s, %d)\n", serv->sv_program->pg_name, serv->sv_nrthreads); @@ -552,8 +551,6 @@ svc_destroy(struct svc_serv *serv) del_timer_sync(&serv->sv_temptimer); - svc_shutdown_net(serv, net); - /* * The last user is gone and thus all sockets have to be destroyed to * the point. Check this. diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b98ee35..88f2bf6 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -598,6 +598,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) /* now allocate needed pages. If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; + BUG_ON(pages >= RPCSVC_MAXPAGES); for (i = 0; i < pages ; i++) while (rqstp->rq_pages[i] == NULL) { struct page *p = alloc_page(GFP_KERNEL); @@ -612,7 +613,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) rqstp->rq_pages[i] = p; } rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ - BUG_ON(pages >= RPCSVC_MAXPAGES); /* Make arg->head point to first page and arg->pages point to rest */ arg = &rqstp->rq_arg; @@ -973,7 +973,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net) svc_clear_pools(serv, net); /* * At this point the sp_sockets lists will stay empty, since - * svc_enqueue will not add new entries without taking the + * svc_xprt_enqueue will not add new entries without taking the * sp_lock and checking XPT_BUSY. */ svc_clear_list(&serv->sv_tempsocks, net); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 6138c92..2777fa8 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -746,6 +746,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) struct svc_cred *cred = &rqstp->rq_cred; cred->cr_group_info = NULL; + cred->cr_principal = NULL; rqstp->rq_client = NULL; if (argv->iov_len < 3*4) @@ -773,7 +774,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) svc_putnl(resv, RPC_AUTH_NULL); svc_putnl(resv, 0); - rqstp->rq_flavor = RPC_AUTH_NULL; + rqstp->rq_cred.cr_flavor = RPC_AUTH_NULL; return SVC_OK; } @@ -811,6 +812,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) int len = argv->iov_len; cred->cr_group_info = NULL; + cred->cr_principal = NULL; rqstp->rq_client = NULL; if ((len -= 3*4) < 0) @@ -847,7 +849,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) svc_putnl(resv, RPC_AUTH_NULL); svc_putnl(resv, 0); - rqstp->rq_flavor = RPC_AUTH_UNIX; + rqstp->rq_cred.cr_flavor = RPC_AUTH_UNIX; return SVC_OK; badcred: diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 032daab..8ea39aa 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -490,17 +490,9 @@ static int common_mmap(int op, struct file *file, unsigned long prot, return common_file_perm(op, file, mask); } -static int apparmor_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only) +static int apparmor_mmap_file(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags) { - int rc = 0; - - /* do DAC check */ - rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); - if (rc || addr_only) - return rc; - return common_mmap(OP_FMMAP, file, prot, flags); } @@ -646,7 +638,8 @@ static struct security_operations apparmor_ops = { .file_permission = apparmor_file_permission, .file_alloc_security = apparmor_file_alloc_security, .file_free_security = apparmor_file_free_security, - .file_mmap = apparmor_file_mmap, + .mmap_file = apparmor_mmap_file, + .mmap_addr = cap_mmap_addr, .file_mprotect = apparmor_file_mprotect, .file_lock = apparmor_file_lock, diff --git a/security/capability.c b/security/capability.c index fca8896..61095df 100644 --- a/security/capability.c +++ b/security/capability.c @@ -949,7 +949,8 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_alloc_security); set_to_cap_if_null(ops, file_free_security); set_to_cap_if_null(ops, file_ioctl); - set_to_cap_if_null(ops, file_mmap); + set_to_cap_if_null(ops, mmap_addr); + set_to_cap_if_null(ops, mmap_file); set_to_cap_if_null(ops, file_mprotect); set_to_cap_if_null(ops, file_lock); set_to_cap_if_null(ops, file_fcntl); diff --git a/security/commoncap.c b/security/commoncap.c index e771cb1..6dbae46 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -958,22 +958,15 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) } /* - * cap_file_mmap - check if able to map given addr - * @file: unused - * @reqprot: unused - * @prot: unused - * @flags: unused + * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped - * @addr_only: unused * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. Returns 0 if this mapping should be allowed * -EPERM if not. */ -int cap_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only) +int cap_mmap_addr(unsigned long addr) { int ret = 0; @@ -986,3 +979,9 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, } return ret; } + +int cap_mmap_file(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags) +{ + return 0; +} diff --git a/security/security.c b/security/security.c index 5497a57..3efc9b1 100644 --- a/security/security.c +++ b/security/security.c @@ -20,6 +20,9 @@ #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsnotify.h> +#include <linux/mman.h> +#include <linux/mount.h> +#include <linux/personality.h> #include <net/flow.h> #define MAX_LSM_EVM_XATTR 2 @@ -657,18 +660,56 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return security_ops->file_ioctl(file, cmd, arg); } -int security_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only) +static inline unsigned long mmap_prot(struct file *file, unsigned long prot) { - int ret; + /* + * Does we have PROT_READ and does the application expect + * it to imply PROT_EXEC? If not, nothing to talk about... + */ + if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ) + return prot; + if (!(current->personality & READ_IMPLIES_EXEC)) + return prot; + /* + * if that's an anonymous mapping, let it. + */ + if (!file) + return prot | PROT_EXEC; + /* + * ditto if it's not on noexec mount, except that on !MMU we need + * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case + */ + if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { +#ifndef CONFIG_MMU + unsigned long caps = 0; + struct address_space *mapping = file->f_mapping; + if (mapping && mapping->backing_dev_info) + caps = mapping->backing_dev_info->capabilities; + if (!(caps & BDI_CAP_EXEC_MAP)) + return prot; +#endif + return prot | PROT_EXEC; + } + /* anything on noexec mount won't get PROT_EXEC */ + return prot; +} - ret = security_ops->file_mmap(file, reqprot, prot, flags, addr, addr_only); +int security_mmap_file(struct file *file, unsigned long prot, + unsigned long flags) +{ + int ret; + ret = security_ops->mmap_file(file, prot, + mmap_prot(file, prot), flags); if (ret) return ret; return ima_file_mmap(file, prot); } +int security_mmap_addr(unsigned long addr) +{ + return security_ops->mmap_addr(addr); +} + int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index fa2341b..372ec65 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3083,9 +3083,7 @@ error: return rc; } -static int selinux_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only) +static int selinux_mmap_addr(unsigned long addr) { int rc = 0; u32 sid = current_sid(); @@ -3104,10 +3102,12 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, } /* do DAC check on address space usage */ - rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); - if (rc || addr_only) - return rc; + return cap_mmap_addr(addr); +} +static int selinux_mmap_file(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags) +{ if (selinux_checkreqprot) prot = reqprot; @@ -5570,7 +5570,8 @@ static struct security_operations selinux_ops = { .file_alloc_security = selinux_file_alloc_security, .file_free_security = selinux_file_free_security, .file_ioctl = selinux_file_ioctl, - .file_mmap = selinux_file_mmap, + .mmap_file = selinux_mmap_file, + .mmap_addr = selinux_mmap_addr, .file_mprotect = selinux_file_mprotect, .file_lock = selinux_file_lock, .file_fcntl = selinux_file_fcntl, diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 4e93f9e..3ad2902 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1259,12 +1259,8 @@ static int sel_make_bools(void) if (!inode) goto out; - ret = -EINVAL; - len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]); - if (len < 0) - goto out; - ret = -ENAMETOOLONG; + len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]); if (len >= PAGE_SIZE) goto out; @@ -1557,19 +1553,10 @@ static inline u32 sel_ino_to_perm(unsigned long ino) static ssize_t sel_read_class(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - ssize_t rc, len; - char *page; unsigned long ino = file->f_path.dentry->d_inode->i_ino; - - page = (char *)__get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_class(ino)); - rc = simple_read_from_buffer(buf, count, ppos, page, len); - free_page((unsigned long)page); - - return rc; + char res[TMPBUFLEN]; + ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_class(ino)); + return simple_read_from_buffer(buf, count, ppos, res, len); } static const struct file_operations sel_class_ops = { @@ -1580,19 +1567,10 @@ static const struct file_operations sel_class_ops = { static ssize_t sel_read_perm(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - ssize_t rc, len; - char *page; unsigned long ino = file->f_path.dentry->d_inode->i_ino; - - page = (char *)__get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_perm(ino)); - rc = simple_read_from_buffer(buf, count, ppos, page, len); - free_page((unsigned long)page); - - return rc; + char res[TMPBUFLEN]; + ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_perm(ino)); + return simple_read_from_buffer(buf, count, ppos, res, len); } static const struct file_operations sel_perm_ops = { diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index d583c05..ee0bb57 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1171,7 +1171,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, } /** - * smack_file_mmap : + * smack_mmap_file : * Check permissions for a mmap operation. The @file may be NULL, e.g. * if mapping anonymous memory. * @file contains the file structure for file to map (may be NULL). @@ -1180,10 +1180,9 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, * @flags contains the operational flags. * Return 0 if permission is granted. */ -static int smack_file_mmap(struct file *file, +static int smack_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, - unsigned long flags, unsigned long addr, - unsigned long addr_only) + unsigned long flags) { struct smack_known *skp; struct smack_rule *srp; @@ -1198,11 +1197,6 @@ static int smack_file_mmap(struct file *file, int tmay; int rc; - /* do DAC check on address space usage */ - rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); - if (rc || addr_only) - return rc; - if (file == NULL || file->f_dentry == NULL) return 0; @@ -3482,7 +3476,8 @@ struct security_operations smack_ops = { .file_ioctl = smack_file_ioctl, .file_lock = smack_file_lock, .file_fcntl = smack_file_fcntl, - .file_mmap = smack_file_mmap, + .mmap_file = smack_mmap_file, + .mmap_addr = cap_mmap_addr, .file_set_fowner = smack_file_set_fowner, .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, |