diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-07-26 15:30:40 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-07-26 15:30:40 +0200 |
commit | 1503af661947b7a4a09355cc2ae6aa0d43f16776 (patch) | |
tree | 5bfcfadf2dd2d98c2ad251c96d7ee43a6903819a /fs | |
parent | a31863168660c6b6f6c7ffe05bb6a38e97803326 (diff) | |
parent | 024e8ac04453b3525448c31ef39848cf675ba6db (diff) | |
download | op-kernel-dev-1503af661947b7a4a09355cc2ae6aa0d43f16776.zip op-kernel-dev-1503af661947b7a4a09355cc2ae6aa0d43f16776.tar.gz |
Merge branch 'linus' into x86/header-guards
Conflicts:
include/asm-x86/gpio.h
include/asm-x86/ide.h
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'fs')
116 files changed, 3022 insertions, 1810 deletions
@@ -902,65 +902,7 @@ endif # BLOCK menu "Pseudo filesystems" -config PROC_FS - bool "/proc file system support" if EMBEDDED - default y - help - This is a virtual file system providing information about the status - of the system. "Virtual" means that it doesn't take up any space on - your hard disk: the files are created on the fly by the kernel when - you try to access them. Also, you cannot read the files with older - version of the program less: you need to use more or cat. - - It's totally cool; for example, "cat /proc/interrupts" gives - information about what the different IRQs are used for at the moment - (there is a small number of Interrupt ReQuest lines in your computer - that are used by the attached devices to gain the CPU's attention -- - often a source of trouble if two devices are mistakenly configured - to use the same IRQ). The program procinfo to display some - information about your system gathered from the /proc file system. - - Before you can use the /proc file system, it has to be mounted, - meaning it has to be given a location in the directory hierarchy. - That location should be /proc. A command such as "mount -t proc proc - /proc" or the equivalent line in /etc/fstab does the job. - - The /proc file system is explained in the file - <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage - ("man 5 proc"). - - This option will enlarge your kernel by about 67 KB. Several - programs depend on this, so everyone should say Y here. - -config PROC_KCORE - bool "/proc/kcore support" if !ARM - depends on PROC_FS && MMU - -config PROC_VMCORE - bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && CRASH_DUMP - default y - help - Exports the dump image of crashed kernel in ELF format. - -config PROC_SYSCTL - bool "Sysctl support (/proc/sys)" if EMBEDDED - depends on PROC_FS - select SYSCTL - default y - ---help--- - The sysctl interface provides a means of dynamically changing - certain kernel parameters and variables on the fly without requiring - a recompile of the kernel or reboot of the system. The primary - interface is through /proc/sys. If you say Y here a tree of - modifiable sysctl entries will be generated beneath the - /proc/sys directory. They are explained in the files - in <file:Documentation/sysctl/>. Note that enabling this - option will enlarge the kernel by at least 8 KB. - - As it is generally a good thing, you should say Y here unless - building a kernel for install/rescue disks or your system is very - limited in memory. +source "fs/proc/Kconfig" config SYSFS bool "sysfs file system support" if EMBEDDED @@ -2093,20 +2035,6 @@ config CODA_FS To compile the coda client support as a module, choose M here: the module will be called coda. -config CODA_FS_OLD_API - bool "Use 96-bit Coda file identifiers" - depends on CODA_FS - help - A new kernel-userspace API had to be introduced for Coda v6.0 - to support larger 128-bit file identifiers as needed by the - new realms implementation. - - However this new API is not backward compatible with older - clients. If you really need to run the old Coda userspace - cache manager then say Y. - - For most cases you probably want to say N. - config AFS_FS tristate "Andrew File System support (AFS) (EXPERIMENTAL)" depends on INET && EXPERIMENTAL @@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); tsk->mm = mm; @@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - tsk->flags &= ~PF_BORROWED_MM; tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 977ef20..3662dd4 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -58,8 +58,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * of the file * * @name: [in] name of the "class" of the new file - * @fops [in] file operations for the new file - * @priv [in] private data for the new file (will be file's private_data) + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. @@ -68,7 +69,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * setup. Returns new descriptor or -error. */ int anon_inode_getfd(const char *name, const struct file_operations *fops, - void *priv) + void *priv, int flags) { struct qstr this; struct dentry *dentry; @@ -78,7 +79,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, if (IS_ERR(anon_inode_inode)) return -ENODEV; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) return error; fd = error; @@ -115,7 +116,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR; + file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_version = 0; file->private_data = priv; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c3d352d..69a2f5c 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -52,7 +52,10 @@ struct autofs_info { int flags; - struct list_head rehash; + struct completion expire_complete; + + struct list_head active; + struct list_head expiring; struct autofs_sb_info *sbi; unsigned long last_used; @@ -68,15 +71,14 @@ struct autofs_info { }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */ struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ - unsigned int hash; - unsigned int len; - char *name; + struct qstr name; u32 dev; u64 ino; uid_t uid; @@ -85,7 +87,7 @@ struct autofs_wait_queue { pid_t tgid; /* This is for status reporting upon return */ int status; - atomic_t wait_ctr; + unsigned int wait_ctr; }; #define AUTOFS_SBI_MAGIC 0x6d4a556d @@ -112,8 +114,9 @@ struct autofs_sb_info { struct mutex wq_mutex; spinlock_t fs_lock; struct autofs_wait_queue *queues; /* Wait queue pointer */ - spinlock_t rehash_lock; - struct list_head rehash_list; + spinlock_t lookup_lock; + struct list_head active_list; + struct list_head expiring_list; }; static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) @@ -138,18 +141,14 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { static inline int autofs4_ispending(struct dentry *dentry) { struct autofs_info *inf = autofs4_dentry_ino(dentry); - int pending = 0; if (dentry->d_flags & DCACHE_AUTOFS_PENDING) return 1; - if (inf) { - spin_lock(&inf->sbi->fs_lock); - pending = inf->flags & AUTOFS_INF_EXPIRING; - spin_unlock(&inf->sbi->fs_lock); - } + if (inf->flags & AUTOFS_INF_EXPIRING) + return 1; - return pending; + return 0; } static inline void autofs4_copy_atime(struct file *src, struct file *dst) @@ -164,6 +163,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); +int autofs4_expire_wait(struct dentry *dentry); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 894fee5..cdabb79 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -259,13 +259,15 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - /* Lock the tree as we must expire as a whole */ spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); - - /* Set this flag early to catch sys_chdir and the like */ + if (d_mountpoint(root)) { + ino->flags |= AUTOFS_INF_MOUNTPOINT; + root->d_mounted--; + } ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } @@ -292,6 +294,8 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, struct list_head *next; int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino; + unsigned int ino_count; if (!root) return NULL; @@ -316,6 +320,9 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = dget(dentry); spin_unlock(&dcache_lock); + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + /* * Case 1: (i) indirect mount or top level pseudo direct mount * (autofs-4.1). @@ -326,6 +333,11 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, DPRINTK("checking mountpoint %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 2; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; + /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) goto next; @@ -343,23 +355,25 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, /* Case 2: tree mount, expire iff entire tree is not busy */ if (!exp_leaves) { - /* Lock the tree as we must expire as a whole */ - spin_lock(&sbi->fs_lock); - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { - struct autofs_info *inf = autofs4_dentry_ino(dentry); + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; - /* Set this flag early to catch sys_chdir and the like */ - inf->flags |= AUTOFS_INF_EXPIRING; - spin_unlock(&sbi->fs_lock); + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { expired = dentry; goto found; } - spin_unlock(&sbi->fs_lock); /* * Case 3: pseudo direct mount, expire individual leaves * (autofs-4.1). */ } else { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (atomic_read(&dentry->d_count) > ino_count) + goto next; + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); if (expired) { dput(dentry); @@ -367,6 +381,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: + spin_unlock(&sbi->fs_lock); dput(dentry); spin_lock(&dcache_lock); next = next->next; @@ -377,12 +392,45 @@ next: found: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); spin_lock(&dcache_lock); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } +int autofs4_expire_wait(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + int status; + + /* Block on any pending expire */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + DPRINTK("waiting for expire %p name=%.*s", + dentry, dentry->d_name.len, dentry->d_name.name); + + status = autofs4_wait(sbi, dentry, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + DPRINTK("expire done status=%d", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + /* Perform an expiry operation */ int autofs4_expire_run(struct super_block *sb, struct vfsmount *mnt, @@ -390,7 +438,9 @@ int autofs4_expire_run(struct super_block *sb, struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; + struct autofs_info *ino; struct dentry *dentry; + int ret = 0; memset(&pkt,0,sizeof pkt); @@ -406,9 +456,15 @@ int autofs4_expire_run(struct super_block *sb, dput(dentry); if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) ) - return -EFAULT; + ret = -EFAULT; - return 0; + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(dentry); + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + + return ret; } /* Call repeatedly until it returns -EAGAIN, meaning there's nothing @@ -433,9 +489,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, /* This is synchronous because it makes the daemon a little easier */ - ino->flags |= AUTOFS_INF_EXPIRING; ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_MOUNTPOINT) { + sb->s_root->d_mounted++; + ino->flags &= ~AUTOFS_INF_MOUNTPOINT; + } ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); dput(dentry); } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 2fdcf5e..7bb3e5b 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -24,8 +24,10 @@ static void ino_lnkfree(struct autofs_info *ino) { - kfree(ino->u.symlink); - ino->u.symlink = NULL; + if (ino->u.symlink) { + kfree(ino->u.symlink); + ino->u.symlink = NULL; + } } struct autofs_info *autofs4_init_ino(struct autofs_info *ino, @@ -41,16 +43,18 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, if (ino == NULL) return NULL; - ino->flags = 0; - ino->mode = mode; - ino->inode = NULL; - ino->dentry = NULL; - ino->size = 0; - - INIT_LIST_HEAD(&ino->rehash); + if (!reinit) { + ino->flags = 0; + ino->inode = NULL; + ino->dentry = NULL; + ino->size = 0; + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + atomic_set(&ino->count, 0); + } + ino->mode = mode; ino->last_used = jiffies; - atomic_set(&ino->count, 0); ino->sbi = sbi; @@ -159,8 +163,8 @@ void autofs4_kill_sb(struct super_block *sb) if (!sbi) goto out_kill_sb; - if (!sbi->catatonic) - autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */ + /* Free wait queues, close pipe */ + autofs4_catatonic_mode(sbi); /* Clean up and release dangling references */ autofs4_force_release(sbi); @@ -338,8 +342,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) mutex_init(&sbi->wq_mutex); spin_lock_init(&sbi->fs_lock); sbi->queues = NULL; - spin_lock_init(&sbi->rehash_lock); - INIT_LIST_HEAD(&sbi->rehash_list); + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); + INIT_LIST_HEAD(&sbi->expiring_list); s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index edf5b6b..bcfb2dc 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -25,25 +25,25 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); static int autofs4_dir_open(struct inode *inode, struct file *file); -static int autofs4_dir_close(struct inode *inode, struct file *file); -static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir); -static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); static void *autofs4_follow_link(struct dentry *, struct nameidata *); +#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) +#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE) + const struct file_operations autofs4_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, - .readdir = autofs4_root_readdir, + .readdir = dcache_readdir, .ioctl = autofs4_root_ioctl, }; const struct file_operations autofs4_dir_operations = { .open = autofs4_dir_open, - .release = autofs4_dir_close, + .release = dcache_dir_close, .read = generic_read_dir, - .readdir = autofs4_dir_readdir, + .readdir = dcache_readdir, }; const struct inode_operations autofs4_indirect_root_inode_operations = { @@ -70,42 +70,10 @@ const struct inode_operations autofs4_dir_inode_operations = { .rmdir = autofs4_dir_rmdir, }; -static int autofs4_root_readdir(struct file *file, void *dirent, - filldir_t filldir) -{ - struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb); - int oz_mode = autofs4_oz_mode(sbi); - - DPRINTK("called, filp->f_pos = %lld", file->f_pos); - - /* - * Don't set reghost flag if: - * 1) f_pos is larger than zero -- we've already been here. - * 2) we haven't even enabled reghosting in the 1st place. - * 3) this is the daemon doing a readdir - */ - if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled) - sbi->needs_reghost = 1; - - DPRINTK("needs_reghost = %d", sbi->needs_reghost); - - return dcache_readdir(file, dirent, filldir); -} - static int autofs4_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct vfsmount *mnt = file->f_path.mnt; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor; - int status; - - status = dcache_dir_open(inode, file); - if (status) - goto out; - - cursor = file->private_data; - cursor->d_fsdata = NULL; DPRINTK("file=%p dentry=%p %.*s", file, dentry, dentry->d_name.len, dentry->d_name.name); @@ -113,159 +81,32 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (autofs4_oz_mode(sbi)) goto out; - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - dcache_dir_close(inode, file); - status = -EBUSY; - goto out; - } - - status = -ENOENT; - if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) { - struct nameidata nd; - int empty, ret; - - /* In case there are stale directory dentrys from a failed mount */ - spin_lock(&dcache_lock); - empty = list_empty(&dentry->d_subdirs); + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&dcache_lock); + if (!d_mountpoint(dentry) && __simple_empty(dentry)) { spin_unlock(&dcache_lock); - - if (!empty) - d_invalidate(dentry); - - nd.flags = LOOKUP_DIRECTORY; - ret = (dentry->d_op->d_revalidate)(dentry, &nd); - - if (ret <= 0) { - if (ret < 0) - status = ret; - dcache_dir_close(inode, file); - goto out; - } + return -ENOENT; } + spin_unlock(&dcache_lock); - if (d_mountpoint(dentry)) { - struct file *fp = NULL; - struct path fp_path = { .dentry = dentry, .mnt = mnt }; - - path_get(&fp_path); - - if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) { - path_put(&fp_path); - dcache_dir_close(inode, file); - goto out; - } - - fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags); - status = PTR_ERR(fp); - if (IS_ERR(fp)) { - dcache_dir_close(inode, file); - goto out; - } - cursor->d_fsdata = fp; - } - return 0; -out: - return status; -} - -static int autofs4_dir_close(struct inode *inode, struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor = file->private_data; - int status = 0; - - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); - - if (autofs4_oz_mode(sbi)) - goto out; - - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - status = -EBUSY; - goto out; - } - - if (d_mountpoint(dentry)) { - struct file *fp = cursor->d_fsdata; - if (!fp) { - status = -ENOENT; - goto out; - } - filp_close(fp, current->files); - } -out: - dcache_dir_close(inode, file); - return status; -} - -static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct dentry *cursor = file->private_data; - int status; - - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); - - if (autofs4_oz_mode(sbi)) - goto out; - - if (autofs4_ispending(dentry)) { - DPRINTK("dentry busy"); - return -EBUSY; - } - - if (d_mountpoint(dentry)) { - struct file *fp = cursor->d_fsdata; - - if (!fp) - return -ENOENT; - - if (!fp->f_op || !fp->f_op->readdir) - goto out; - - status = vfs_readdir(fp, filldir, dirent); - file->f_pos = fp->f_pos; - if (status) - autofs4_copy_atime(file, fp); - return status; - } out: - return dcache_readdir(file, dirent, filldir); + return dcache_dir_open(inode, file); } static int try_to_fill_dentry(struct dentry *dentry, int flags) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - struct dentry *new; int status; - /* Block on any pending expiry here; invalidate the dentry - when expiration is done to trigger mount request with a new - dentry */ - if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK("waiting for expire %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_NONE); - - DPRINTK("expire done status=%d", status); - - /* - * If the directory still exists the mount request must - * continue otherwise it can't be followed at the right - * time during the walk. - */ - status = d_invalidate(dentry); - if (status != -EBUSY) - return -EAGAIN; - } - DPRINTK("dentry=%p %.*s ino=%p", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); @@ -292,7 +133,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) return status; } /* Trigger mount for path component or follow link */ - } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) || + } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING || + flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) || current->link_count) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); @@ -320,26 +162,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - /* - * The dentry that is passed in from lookup may not be the one - * we end up using, as mkdir can create a new one. If this - * happens, and another process tries the lookup at the same time, - * it will set the PENDING flag on this new dentry, but add itself - * to our waitq. Then, if after the lookup succeeds, the first - * process that requested the mount performs another lookup of the - * same directory, it will show up as still pending! So, we need - * to redo the lookup here and clear pending on that dentry. - */ - if (d_unhashed(dentry)) { - new = d_lookup(dentry->d_parent, &dentry->d_name); - if (new) { - spin_lock(&new->d_lock); - new->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&new->d_lock); - dput(new); - } - } - return 0; } @@ -355,51 +177,63 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d", dentry, dentry->d_name.len, dentry->d_name.name, oz_mode, nd->flags); - - /* If it's our master or we shouldn't trigger a mount we're done */ - lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY); - if (oz_mode || !lookup_type) + /* + * For an expire of a covered direct or offset mount we need + * to beeak out of follow_down() at the autofs mount trigger + * (d_mounted--), so we can see the expiring flag, and manage + * the blocking and following here until the expire is completed. + */ + if (oz_mode) { + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + /* Follow down to our covering mount. */ + if (!follow_down(&nd->path.mnt, &nd->path.dentry)) + goto done; + goto follow; + } + spin_unlock(&sbi->fs_lock); goto done; + } - /* If an expire request is pending wait for it. */ - if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK("waiting for active request %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); - - status = autofs4_wait(sbi, dentry, NFY_NONE); + /* If an expire request is pending everyone must wait. */ + autofs4_expire_wait(dentry); - DPRINTK("request done status=%d", status); - } + /* We trigger a mount for almost all flags */ + lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS); + if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING)) + goto follow; /* - * If the dentry contains directories then it is an - * autofs multi-mount with no root mount offset. So - * don't try to mount it again. + * If the dentry contains directories then it is an autofs + * multi-mount with no root mount offset. So don't try to + * mount it again. */ spin_lock(&dcache_lock); - if (!d_mountpoint(dentry) && __simple_empty(dentry)) { + if (dentry->d_flags & DCACHE_AUTOFS_PENDING || + (!d_mountpoint(dentry) && __simple_empty(dentry))) { spin_unlock(&dcache_lock); status = try_to_fill_dentry(dentry, 0); if (status) goto out_error; - /* - * The mount succeeded but if there is no root mount - * it must be an autofs multi-mount with no root offset - * so we don't need to follow the mount. - */ - if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path.mnt, - &nd->path.dentry)) { - status = -ENOENT; - goto out_error; - } - } - - goto done; + goto follow; } spin_unlock(&dcache_lock); +follow: + /* + * If there is no root mount it must be an autofs + * multi-mount with no root offset so we don't need + * to follow it. + */ + if (d_mountpoint(dentry)) { + if (!autofs4_follow_mount(&nd->path.mnt, + &nd->path.dentry)) { + status = -ENOENT; + goto out_error; + } + } done: return NULL; @@ -424,12 +258,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) int status = 1; /* Pending dentry */ + spin_lock(&sbi->fs_lock); if (autofs4_ispending(dentry)) { /* The daemon never causes a mount to trigger */ + spin_unlock(&sbi->fs_lock); + if (oz_mode) return 1; /* + * If the directory has gone away due to an expire + * we have been called as ->d_revalidate() and so + * we need to return false and proceed to ->lookup(). + */ + if (autofs4_expire_wait(dentry) == -EAGAIN) + return 0; + + /* * A zero status is success otherwise we have a * negative error code. */ @@ -437,17 +282,9 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) if (status == 0) return 1; - /* - * A status of EAGAIN here means that the dentry has gone - * away while waiting for an expire to complete. If we are - * racing with expire lookup will wait for it so this must - * be a revalidate and we need to send it to lookup. - */ - if (status == -EAGAIN) - return 0; - return status; } + spin_unlock(&sbi->fs_lock); /* Negative dentry.. invalidate if "old" */ if (dentry->d_inode == NULL) @@ -461,6 +298,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); + /* The daemon never causes a mount to trigger */ if (oz_mode) return 1; @@ -493,10 +331,12 @@ void autofs4_dentry_release(struct dentry *de) struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); if (sbi) { - spin_lock(&sbi->rehash_lock); - if (!list_empty(&inf->rehash)) - list_del(&inf->rehash); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&inf->active)) + list_del(&inf->active); + if (!list_empty(&inf->expiring)) + list_del(&inf->expiring); + spin_unlock(&sbi->lookup_lock); } inf->dentry = NULL; @@ -518,7 +358,7 @@ static struct dentry_operations autofs4_dentry_operations = { .d_release = autofs4_dentry_release, }; -static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -526,14 +366,66 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct struct list_head *p, *head; spin_lock(&dcache_lock); - spin_lock(&sbi->rehash_lock); - head = &sbi->rehash_list; + spin_lock(&sbi->lookup_lock); + head = &sbi->active_list; list_for_each(p, head) { struct autofs_info *ino; struct dentry *dentry; struct qstr *qstr; - ino = list_entry(p, struct autofs_info, rehash); + ino = list_entry(p, struct autofs_info, active); + dentry = ino->dentry; + + spin_lock(&dentry->d_lock); + + /* Already gone? */ + if (atomic_read(&dentry->d_count) == 0) + goto next; + + qstr = &dentry->d_name; + + if (dentry->d_name.hash != hash) + goto next; + if (dentry->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(dentry)) { + dget(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + return dentry; + } +next: + spin_unlock(&dentry->d_lock); + } + spin_unlock(&sbi->lookup_lock); + spin_unlock(&dcache_lock); + + return NULL; +} + +static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name) +{ + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + spin_lock(&dcache_lock); + spin_lock(&sbi->lookup_lock); + head = &sbi->expiring_list; + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *dentry; + struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, expiring); dentry = ino->dentry; spin_lock(&dentry->d_lock); @@ -555,33 +447,16 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct goto next; if (d_unhashed(dentry)) { - struct inode *inode = dentry->d_inode; - - ino = autofs4_dentry_ino(dentry); - list_del_init(&ino->rehash); dget(dentry); - /* - * Make the rehashed dentry negative so the VFS - * behaves as it should. - */ - if (inode) { - dentry->d_inode = NULL; - list_del_init(&dentry->d_alias); - spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->rehash_lock); - spin_unlock(&dcache_lock); - iput(inode); - return dentry; - } spin_unlock(&dentry->d_lock); - spin_unlock(&sbi->rehash_lock); + spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); return dentry; } next: spin_unlock(&dentry->d_lock); } - spin_unlock(&sbi->rehash_lock); + spin_unlock(&sbi->lookup_lock); spin_unlock(&dcache_lock); return NULL; @@ -591,7 +466,8 @@ next: static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; - struct dentry *unhashed; + struct autofs_info *ino; + struct dentry *expiring, *unhashed; int oz_mode; DPRINTK("name = %.*s", @@ -607,8 +483,26 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d", current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode); - unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name); - if (!unhashed) { + expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name); + if (expiring) { + /* + * If we are racing with expire the request might not + * be quite complete but the directory has been removed + * so it must have been successful, so just wait for it. + */ + ino = autofs4_dentry_ino(expiring); + autofs4_expire_wait(expiring); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + dput(expiring); + } + + unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name); + if (unhashed) + dentry = unhashed; + else { /* * Mark the dentry incomplete but don't hash it. We do this * to serialize our inode creation operations (symlink and @@ -622,39 +516,34 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s */ dentry->d_op = &autofs4_root_dentry_operations; - dentry->d_fsdata = NULL; - d_instantiate(dentry, NULL); - } else { - struct autofs_info *ino = autofs4_dentry_ino(unhashed); - DPRINTK("rehash %p with %p", dentry, unhashed); /* - * If we are racing with expire the request might not - * be quite complete but the directory has been removed - * so it must have been successful, so just wait for it. - * We need to ensure the AUTOFS_INF_EXPIRING flag is clear - * before continuing as revalidate may fail when calling - * try_to_fill_dentry (returning EAGAIN) if we don't. + * And we need to ensure that the same dentry is used for + * all following lookup calls until it is hashed so that + * the dentry flags are persistent throughout the request. */ - while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { - DPRINTK("wait for incomplete expire %p name=%.*s", - unhashed, unhashed->d_name.len, - unhashed->d_name.name); - autofs4_wait(sbi, unhashed, NFY_NONE); - DPRINTK("request completed"); - } - dentry = unhashed; + ino = autofs4_init_ino(NULL, sbi, 0555); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + spin_lock(&sbi->lookup_lock); + list_add(&ino->active, &sbi->active_list); + spin_unlock(&sbi->lookup_lock); + + d_instantiate(dentry, NULL); } if (!oz_mode) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - } - - if (dentry->d_op && dentry->d_op->d_revalidate) { - mutex_unlock(&dir->i_mutex); - (dentry->d_op->d_revalidate)(dentry, nd); - mutex_lock(&dir->i_mutex); + if (dentry->d_op && dentry->d_op->d_revalidate) { + mutex_unlock(&dir->i_mutex); + (dentry->d_op->d_revalidate)(dentry, nd); + mutex_lock(&dir->i_mutex); + } } /* @@ -673,9 +562,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s return ERR_PTR(-ERESTARTNOINTR); } } - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; - spin_unlock(&dentry->d_lock); + if (!oz_mode) { + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; + spin_unlock(&dentry->d_lock); + } } /* @@ -706,7 +597,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s } if (unhashed) - return dentry; + return unhashed; return NULL; } @@ -728,20 +619,31 @@ static int autofs4_dir_symlink(struct inode *dir, return -EACCES; ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555); - if (ino == NULL) - return -ENOSPC; + if (!ino) + return -ENOMEM; - ino->size = strlen(symname); - ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL); + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); - if (cp == NULL) { - kfree(ino); - return -ENOSPC; + ino->size = strlen(symname); + cp = kmalloc(ino->size + 1, GFP_KERNEL); + if (!cp) { + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; } strcpy(cp, symname); inode = autofs4_get_inode(dir->i_sb, ino); + if (!inode) { + kfree(cp); + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } d_add(dentry, inode); if (dir == dir->i_sb->s_root->d_inode) @@ -757,6 +659,7 @@ static int autofs4_dir_symlink(struct inode *dir, atomic_inc(&p_ino->count); ino->inode = inode; + ino->u.symlink = cp; dir->i_mtime = CURRENT_TIME; return 0; @@ -769,9 +672,8 @@ static int autofs4_dir_symlink(struct inode *dir, * that the file no longer exists. However, doing that means that the * VFS layer can turn the dentry into a negative dentry. We don't want * this, because the unlink is probably the result of an expire. - * We simply d_drop it and add it to a rehash candidates list in the - * super block, which allows the dentry lookup to reuse it retaining - * the flags, such as expire in progress, in case we're racing with expire. + * We simply d_drop it and add it to a expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. * * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. @@ -801,9 +703,10 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; spin_lock(&dcache_lock); - spin_lock(&sbi->rehash_lock); - list_add(&ino->rehash, &sbi->rehash_list); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -829,9 +732,10 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&dcache_lock); return -ENOTEMPTY; } - spin_lock(&sbi->rehash_lock); - list_add(&ino->rehash, &sbi->rehash_list); - spin_unlock(&sbi->rehash_lock); + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -866,10 +770,20 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode) dentry, dentry->d_name.len, dentry->d_name.name); ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555); - if (ino == NULL) - return -ENOSPC; + if (!ino) + return -ENOMEM; + + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + spin_unlock(&sbi->lookup_lock); inode = autofs4_get_inode(dir->i_sb, ino); + if (!inode) { + if (!dentry->d_fsdata) + kfree(ino); + return -ENOMEM; + } d_add(dentry, inode); if (dir == dir->i_sb->s_root->d_inode) @@ -922,44 +836,6 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user } /* - * Tells the daemon whether we need to reghost or not. Also, clears - * the reghost_needed flag. - */ -static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p) -{ - int status; - - DPRINTK("returning %d", sbi->needs_reghost); - - status = put_user(sbi->needs_reghost, p); - if (status) - return status; - - sbi->needs_reghost = 0; - return 0; -} - -/* - * Enable / Disable reghosting ioctl() operation - */ -static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int __user *p) -{ - int status; - int val; - - status = get_user(val, p); - - DPRINTK("reghost = %d", val); - - if (status) - return status; - - /* turn on/off reghosting, with the val */ - sbi->reghost_enabled = val; - return 0; -} - -/* * Tells the daemon whether it can umount the autofs mount. */ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) @@ -1023,11 +899,6 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, case AUTOFS_IOC_SETTIMEOUT: return autofs4_get_set_timeout(sbi, p); - case AUTOFS_IOC_TOGGLEREGHOST: - return autofs4_toggle_reghost(sbi, p); - case AUTOFS_IOC_ASKREGHOST: - return autofs4_ask_reghost(sbi, p); - case AUTOFS_IOC_ASKUMOUNT: return autofs4_ask_umount(filp->f_path.mnt, p); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 75e5955..35216d1 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -28,6 +28,12 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + DPRINTK("entering catatonic mode"); sbi->catatonic = 1; @@ -36,13 +42,18 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name); - wq->name = NULL; + if (wq->name.name) { + kfree(wq->name.name); + wq->name.name = NULL; + } + wq->wait_ctr--; wake_up_interruptible(&wq->queue); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); } static int autofs4_write(struct file *file, const void *addr, int bytes) @@ -89,10 +100,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, union autofs_packet_union v4_pkt; union autofs_v5_packet_union v5_pkt; } pkt; + struct file *pipe = NULL; size_t pktsz; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", - wq->wait_queue_token, wq->len, wq->name, type); + wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt,0,sizeof pkt); /* For security reasons */ @@ -107,9 +119,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); mp->wait_queue_token = wq->wait_queue_token; - mp->len = wq->len; - memcpy(mp->name, wq->name, wq->len); - mp->name[wq->len] = '\0'; + mp->len = wq->name.len; + memcpy(mp->name, wq->name.name, wq->name.len); + mp->name[wq->name.len] = '\0'; break; } case autofs_ptype_expire_multi: @@ -119,9 +131,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); ep->wait_queue_token = wq->wait_queue_token; - ep->len = wq->len; - memcpy(ep->name, wq->name, wq->len); - ep->name[wq->len] = '\0'; + ep->len = wq->name.len; + memcpy(ep->name, wq->name.name, wq->name.len); + ep->name[wq->name.len] = '\0'; break; } /* @@ -138,9 +150,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); packet->wait_queue_token = wq->wait_queue_token; - packet->len = wq->len; - memcpy(packet->name, wq->name, wq->len); - packet->name[wq->len] = '\0'; + packet->len = wq->name.len; + memcpy(packet->name, wq->name.name, wq->name.len); + packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; packet->uid = wq->uid; @@ -154,8 +166,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, return; } - if (autofs4_write(sbi->pipe, &pkt, pktsz)) - autofs4_catatonic_mode(sbi); + /* Check if we have become catatonic */ + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + pipe = sbi->pipe; + get_file(pipe); + } + mutex_unlock(&sbi->wq_mutex); + + if (pipe) { + if (autofs4_write(pipe, &pkt, pktsz)) + autofs4_catatonic_mode(sbi); + fput(pipe); + } } static int autofs4_getpath(struct autofs_sb_info *sbi, @@ -191,58 +214,55 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, - char *name, unsigned int hash, unsigned int len) +autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr) { struct autofs_wait_queue *wq; for (wq = sbi->queues; wq; wq = wq->next) { - if (wq->hash == hash && - wq->len == len && - wq->name && !memcmp(wq->name, name, len)) + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; } -int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, - enum autofs_notify notify) +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to idicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + struct qstr *qstr, + struct dentry*dentry, enum autofs_notify notify) { - struct autofs_info *ino; struct autofs_wait_queue *wq; - char *name; - unsigned int len = 0; - unsigned int hash = 0; - int status, type; - - /* In catatonic mode, we don't wait for nobody */ - if (sbi->catatonic) - return -ENOENT; - - name = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name) - return -ENOMEM; + struct autofs_info *ino; - /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) - len = sprintf(name, "%p", dentry); - else { - len = autofs4_getpath(sbi, dentry, &name); - if (!len) { - kfree(name); - return -ENOENT; - } + /* Wait in progress, continue; */ + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; } - hash = full_name_hash(name, len); - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(name); - return -EINTR; - } + *wait = NULL; - wq = autofs4_find_wait(sbi, name, hash, len); + /* If we don't yet have any info this is a new request */ ino = autofs4_dentry_ino(dentry); - if (!wq && ino && notify == NFY_NONE) { + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { /* * Either we've betean the pending expire to post it's * wait or it finished while we waited on the mutex. @@ -253,13 +273,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, while (ino->flags & AUTOFS_INF_EXPIRING) { mutex_unlock(&sbi->wq_mutex); schedule_timeout_interruptible(HZ/10); - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(name); + if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; + + wq = autofs4_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; } - wq = autofs4_find_wait(sbi, name, hash, len); - if (wq) - break; } /* @@ -267,18 +288,96 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * cases where we wait on NFY_NONE neither depend on the * return status of the wait. */ - if (!wq) { + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + /* + * If the dentry isn't hashed just go ahead and try the + * mount again with a new wait (not much else we can do). + */ + if (!d_unhashed(dentry)) { + /* + * But if the dentry is hashed, that means that we + * got here through the revalidate path. Thus, we + * need to check if the dentry has been mounted + * while we waited on the wq_mutex. If it has, + * simply return success. + */ + if (d_mountpoint(dentry)) + return 0; + } + } + + return 1; +} + +int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, + enum autofs_notify notify) +{ + struct autofs_wait_queue *wq; + struct qstr qstr; + char *name; + int status, ret, type; + + /* In catatonic mode, we don't wait for nobody */ + if (sbi->catatonic) + return -ENOENT; + + if (!dentry->d_inode) { + /* + * A wait for a negative dentry is invalid for certain + * cases. A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentrys + * in the root of the autofs file system may be negative. + */ + if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) + qstr.len = sprintf(name, "%p", dentry); + else { + qstr.len = autofs4_getpath(sbi, dentry, &name); + if (!qstr.len) { kfree(name); - mutex_unlock(&sbi->wq_mutex); - return 0; + return -ENOENT; } } + qstr.name = name; + qstr.hash = full_name_hash(name, qstr.len); + + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); + return -EINTR; + } + + ret = validate_request(&wq, sbi, &qstr, dentry, notify); + if (ret <= 0) { + if (ret == 0) + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + return ret; + } if (!wq) { /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); if (!wq) { - kfree(name); + kfree(qstr.name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -289,9 +388,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); - wq->hash = hash; - wq->name = name; - wq->len = len; + memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); wq->uid = current->uid; @@ -299,7 +396,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->pid = current->pid; wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ - atomic_set(&wq->wait_ctr, 2); + wq->wait_ctr = 2; mutex_unlock(&sbi->wq_mutex); if (sbi->version < 5) { @@ -319,28 +416,25 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, } DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); } else { - atomic_inc(&wq->wait_ctr); + wq->wait_ctr++; mutex_unlock(&sbi->wq_mutex); - kfree(name); + kfree(qstr.name); DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); - } - - /* wq->name is NULL if and only if the lock is already released */ - - if (sbi->catatonic) { - /* We might have slept, so check again for catatonic mode */ - wq->status = -ENOENT; - kfree(wq->name); - wq->name = NULL; + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); } - if (wq->name) { + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ + if (wq->name.name) { /* Block all but "shutdown" signals while waiting */ sigset_t oldset; unsigned long irqflags; @@ -351,7 +445,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, irqflags); - wait_event_interruptible(wq->queue, wq->name == NULL); + wait_event_interruptible(wq->queue, wq->name.name == NULL); spin_lock_irqsave(¤t->sighand->siglock, irqflags); current->blocked = oldset; @@ -364,8 +458,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, status = wq->status; /* Are we the last process to need status? */ - if (atomic_dec_and_test(&wq->wait_ctr)) + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) kfree(wq); + mutex_unlock(&sbi->wq_mutex); return status; } @@ -387,16 +483,13 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok } *wql = wq->next; /* Unlink from chain */ - mutex_unlock(&sbi->wq_mutex); - kfree(wq->name); - wq->name = NULL; /* Do not wait on this queue */ - + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; - - if (atomic_dec_and_test(&wq->wait_ctr)) /* Is anyone still waiting for this guy? */ + wake_up_interruptible(&wq->queue); + if (!--wq->wait_ctr) kfree(wq); - else - wake_up_interruptible(&wq->queue); + mutex_unlock(&sbi->wq_mutex); return 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d48ff5f3..3b6ff85 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss) #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) #endif +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *envp; elf_addr_t __user *sp; elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -204,10 +228,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_GID, tsk->gid); NEW_AUX_ENT(AT_EGID, tsk->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); } @@ -1477,7 +1506,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, const struct user_regset_view *view = task_user_regset_view(dump_task); struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; - struct task_struct *g, *p; + struct core_thread *ct; unsigned int i; info->size = 0; @@ -1516,31 +1545,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - rcu_read_lock(); - do_each_thread(g, p) - if (p->mm == dump_task->mm) { - t = kzalloc(offsetof(struct elf_thread_core_info, - notes[info->thread_notes]), - GFP_ATOMIC); - if (unlikely(!t)) { - rcu_read_unlock(); - return 0; - } - t->task = p; - if (p == dump_task || !info->thread) { - t->next = info->thread; - info->thread = t; - } else { - /* - * Make sure to keep the original task at - * the head of the list. - */ - t->next = info->thread->next; - info->thread->next = t; - } + for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), + GFP_KERNEL); + if (unlikely(!t)) + return 0; + + t->task = ct->task; + if (ct->task == dump_task || !info->thread) { + t->next = info->thread; + info->thread = t; + } else { + /* + * Make sure to keep the original task at + * the head of the list. + */ + t->next = info->thread->next; + info->thread->next = t; } - while_each_thread(g, p); - rcu_read_unlock(); + } /* * Now fill in each thread's information. @@ -1687,7 +1711,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, { #define NUM_NOTES 6 struct list_head *t; - struct task_struct *g, *p; info->notes = NULL; info->prstatus = NULL; @@ -1719,20 +1742,19 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread_status_size = 0; if (signr) { + struct core_thread *ct; struct elf_thread_status *ets; - rcu_read_lock(); - do_each_thread(g, p) - if (current->mm == p->mm && current != p) { - ets = kzalloc(sizeof(*ets), GFP_ATOMIC); - if (!ets) { - rcu_read_unlock(); - return 0; - } - ets->thread = p; - list_add(&ets->list, &info->thread_list); - } - while_each_thread(g, p); - rcu_read_unlock(); + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + ets = kzalloc(sizeof(*ets), GFP_KERNEL); + if (!ets) + return 0; + + ets->thread = ct->task; + list_add(&ets->list, &info->thread_list); + } + list_for_each(t, &info->thread_list) { int sz; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d051a32..1b59b1e 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1573,7 +1573,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ - struct task_struct *g, *p; LIST_HEAD(thread_list); struct list_head *t; elf_fpregset_t *fpu = NULL; @@ -1622,20 +1621,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, #endif if (signr) { + struct core_thread *ct; struct elf_thread_status *tmp; - rcu_read_lock(); - do_each_thread(g,p) - if (current->mm == p->mm && current != p) { - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); - if (!tmp) { - rcu_read_unlock(); - goto cleanup; - } - tmp->thread = p; - list_add(&tmp->list, &thread_list); - } - while_each_thread(g,p); - rcu_read_unlock(); + + for (ct = current->mm->core_state->dumper.next; + ct; ct = ct->next) { + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + goto cleanup; + + tmp->thread = ct->task; + list_add(&tmp->list, &thread_list); + } + list_for_each(t, &thread_list) { struct elf_thread_status *tmp; int sz; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 7191306..7562053 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/syscalls.h> +#include <linux/fs.h> #include <asm/uaccess.h> @@ -535,31 +536,16 @@ static ssize_t bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) { Node *e = file->f_path.dentry->d_inode->i_private; - loff_t pos = *ppos; ssize_t res; char *page; - int len; if (!(page = (char*) __get_free_page(GFP_KERNEL))) return -ENOMEM; entry_status(e, page); - len = strlen(page); - res = -EINVAL; - if (pos < 0) - goto out; - res = 0; - if (pos >= len) - goto out; - if (len < pos + nbytes) - nbytes = len - pos; - res = -EFAULT; - if (copy_to_user(buf, page + pos, nbytes)) - goto out; - *ppos = pos + nbytes; - res = nbytes; -out: + res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page)); + free_page((unsigned long) page); return res; } diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index e1c8548..bf4a3fd 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -28,11 +28,9 @@ int coda_fake_statfs; char * coda_f2s(struct CodaFid *f) { static char s[60]; -#ifdef CONFIG_CODA_FS_OLD_API - sprintf(s, "(%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2]); -#else + sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); -#endif + return s; } diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index e3eb355..0d9b80e 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,8 +362,9 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i); + device_create_drvdata(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), + NULL, "cfs%d", i); coda_sysctl_init(); goto out; @@ -377,11 +378,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -#ifdef CONFIG_CODA_FS_OLD_API -MODULE_VERSION("5.3.21"); -#else MODULE_VERSION("6.6"); -#endif static int __init init_coda(void) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 359e531..ce432bc 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -52,12 +52,8 @@ static void *alloc_upcall(int opcode, int size) inp->ih.opcode = opcode; inp->ih.pid = current->pid; inp->ih.pgid = task_pgrp_nr(current); -#ifdef CONFIG_CODA_FS_OLD_API - memset(&inp->ih.cred, 0, sizeof(struct coda_cred)); - inp->ih.cred.cr_fsuid = current->fsuid; -#else inp->ih.uid = current->fsuid; -#endif + return (void*)inp; } @@ -166,20 +162,11 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, union inputArgs *inp; union outputArgs *outp; int insize, outsize, error; -#ifdef CONFIG_CODA_FS_OLD_API - struct coda_cred cred = { 0, }; - cred.cr_fsuid = uid; -#endif insize = SIZE(release); UPARG(CODA_CLOSE); -#ifdef CONFIG_CODA_FS_OLD_API - memcpy(&(inp->ih.cred), &cred, sizeof(cred)); -#else inp->ih.uid = uid; -#endif - inp->coda_close.VFid = *fid; inp->coda_close.flags = flags; diff --git a/fs/compat.c b/fs/compat.c index ed43e17..106eba2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -197,8 +197,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * { if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & - 0xffffffff00000000ULL) + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -271,8 +271,8 @@ out: static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) & - 0xffffffff00000000ULL) + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -2131,9 +2131,9 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, #ifdef CONFIG_SIGNALFD -asmlinkage long compat_sys_signalfd(int ufd, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) { compat_sigset_t ss32; sigset_t tmp; @@ -2148,9 +2148,15 @@ asmlinkage long compat_sys_signalfd(int ufd, if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; - return sys_signalfd(ufd, ksigmask, sizeof(sigset_t)); + return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); } +asmlinkage long compat_sys_signalfd(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); +} #endif /* CONFIG_SIGNALFD */ #ifdef CONFIG_TIMERFD diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 7b3a03c..5235c67 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -25,7 +25,6 @@ #include <linux/slab.h> #include <linux/raid/md.h> #include <linux/kd.h> -#include <linux/dirent.h> #include <linux/route.h> #include <linux/in6.h> #include <linux/ipv6_route.h> @@ -2297,8 +2296,6 @@ COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE) COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI) COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER) -COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST) -COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST) COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT) /* Raw devices */ COMPATIBLE_IOCTL(RAW_SETBIND) diff --git a/fs/dcache.c b/fs/dcache.c index 6068c25..3818d6a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -61,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly; static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; static struct hlist_head *dentry_hashtable __read_mostly; -static LIST_HEAD(dentry_unused); /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { @@ -96,14 +95,6 @@ static void d_free(struct dentry *dentry) call_rcu(&dentry->d_u.d_rcu, d_callback); } -static void dentry_lru_remove(struct dentry *dentry) -{ - if (!list_empty(&dentry->d_lru)) { - list_del_init(&dentry->d_lru); - dentry_stat.nr_unused--; - } -} - /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. @@ -130,6 +121,41 @@ static void dentry_iput(struct dentry * dentry) } } +/* + * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; +} + +static void dentry_lru_add_tail(struct dentry *dentry) +{ + list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); + dentry->d_sb->s_nr_dentry_unused++; + dentry_stat.nr_unused++; +} + +static void dentry_lru_del(struct dentry *dentry) +{ + if (!list_empty(&dentry->d_lru)) { + list_del(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; + } +} + +static void dentry_lru_del_init(struct dentry *dentry) +{ + if (likely(!list_empty(&dentry->d_lru))) { + list_del_init(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; + } +} + /** * d_kill - kill dentry and return parent * @dentry: dentry to kill @@ -212,8 +238,7 @@ repeat: goto kill_it; if (list_empty(&dentry->d_lru)) { dentry->d_flags |= DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + dentry_lru_add(dentry); } spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); @@ -222,7 +247,8 @@ repeat: unhash_it: __d_drop(dentry); kill_it: - dentry_lru_remove(dentry); + /* if dentry was on the d_lru list delete it from there */ + dentry_lru_del(dentry); dentry = d_kill(dentry); if (dentry) goto repeat; @@ -290,7 +316,7 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); return dentry; } @@ -406,133 +432,167 @@ static void prune_one_dentry(struct dentry * dentry) if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); __d_drop(dentry); dentry = d_kill(dentry); spin_lock(&dcache_lock); } } -/** - * prune_dcache - shrink the dcache - * @count: number of entries to try and free - * @sb: if given, ignore dentries for other superblocks - * which are being unmounted. - * - * Shrink the dcache. This is done when we need - * more memory, or simply when we need to unmount - * something (at which point we need to unuse - * all dentries). - * - * This function may fail to free any resources if - * all the dentries are in use. +/* + * Shrink the dentry LRU on a given superblock. + * @sb : superblock to shrink dentry LRU. + * @count: If count is NULL, we prune all dentries on superblock. + * @flags: If flags is non-zero, we need to do special processing based on + * which flags are set. This means we don't need to maintain multiple + * similar copies of this loop. */ - -static void prune_dcache(int count, struct super_block *sb) +static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) { - spin_lock(&dcache_lock); - for (; count ; count--) { - struct dentry *dentry; - struct list_head *tmp; - struct rw_semaphore *s_umount; - - cond_resched_lock(&dcache_lock); + LIST_HEAD(referenced); + LIST_HEAD(tmp); + struct dentry *dentry; + int cnt = 0; - tmp = dentry_unused.prev; - if (sb) { - /* Try to find a dentry for this sb, but don't try - * too hard, if they aren't near the tail they will - * be moved down again soon + BUG_ON(!sb); + BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); + spin_lock(&dcache_lock); + if (count != NULL) + /* called from prune_dcache() and shrink_dcache_parent() */ + cnt = *count; +restart: + if (count == NULL) + list_splice_init(&sb->s_dentry_lru, &tmp); + else { + while (!list_empty(&sb->s_dentry_lru)) { + dentry = list_entry(sb->s_dentry_lru.prev, + struct dentry, d_lru); + BUG_ON(dentry->d_sb != sb); + + spin_lock(&dentry->d_lock); + /* + * If we are honouring the DCACHE_REFERENCED flag and + * the dentry has this flag set, don't free it. Clear + * the flag and put it back on the LRU. */ - int skip = count; - while (skip && tmp != &dentry_unused && - list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { - skip--; - tmp = tmp->prev; + if ((flags & DCACHE_REFERENCED) + && (dentry->d_flags & DCACHE_REFERENCED)) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_move_tail(&dentry->d_lru, &referenced); + spin_unlock(&dentry->d_lock); + } else { + list_move_tail(&dentry->d_lru, &tmp); + spin_unlock(&dentry->d_lock); + cnt--; + if (!cnt) + break; } } - if (tmp == &dentry_unused) - break; - list_del_init(tmp); - prefetch(dentry_unused.prev); - dentry_stat.nr_unused--; - dentry = list_entry(tmp, struct dentry, d_lru); - - spin_lock(&dentry->d_lock); + } + while (!list_empty(&tmp)) { + dentry = list_entry(tmp.prev, struct dentry, d_lru); + dentry_lru_del_init(dentry); + spin_lock(&dentry->d_lock); /* * We found an inuse dentry which was not removed from - * dentry_unused because of laziness during lookup. Do not free - * it - just keep it off the dentry_unused list. + * the LRU because of laziness during lookup. Do not free + * it - just keep it off the LRU list. */ - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); continue; } - /* If the dentry was recently referenced, don't free it. */ - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - spin_unlock(&dentry->d_lock); + prune_one_dentry(dentry); + /* dentry->d_lock was dropped in prune_one_dentry() */ + cond_resched_lock(&dcache_lock); + } + if (count == NULL && !list_empty(&sb->s_dentry_lru)) + goto restart; + if (count != NULL) + *count = cnt; + if (!list_empty(&referenced)) + list_splice(&referenced, &sb->s_dentry_lru); + spin_unlock(&dcache_lock); +} + +/** + * prune_dcache - shrink the dcache + * @count: number of entries to try to free + * + * Shrink the dcache. This is done when we need more memory, or simply when we + * need to unmount something (at which point we need to unuse all dentries). + * + * This function may fail to free any resources if all the dentries are in use. + */ +static void prune_dcache(int count) +{ + struct super_block *sb; + int w_count; + int unused = dentry_stat.nr_unused; + int prune_ratio; + int pruned; + + if (unused == 0 || count == 0) + return; + spin_lock(&dcache_lock); +restart: + if (count >= unused) + prune_ratio = 1; + else + prune_ratio = unused / count; + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_nr_dentry_unused == 0) continue; - } - /* - * If the dentry is not DCACHED_REFERENCED, it is time - * to remove it from the dcache, provided the super block is - * NULL (which means we are trying to reclaim memory) - * or this dentry belongs to the same super block that - * we want to shrink. - */ - /* - * If this dentry is for "my" filesystem, then I can prune it - * without taking the s_umount lock (I already hold it). + sb->s_count++; + /* Now, we reclaim unused dentrins with fairness. + * We reclaim them same percentage from each superblock. + * We calculate number of dentries to scan on this sb + * as follows, but the implementation is arranged to avoid + * overflows: + * number of dentries to scan on this sb = + * count * (number of dentries on this sb / + * number of dentries in the machine) */ - if (sb && dentry->d_sb == sb) { - prune_one_dentry(dentry); - continue; - } + spin_unlock(&sb_lock); + if (prune_ratio != 1) + w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; + else + w_count = sb->s_nr_dentry_unused; + pruned = w_count; /* - * ...otherwise we need to be sure this filesystem isn't being - * unmounted, otherwise we could race with - * generic_shutdown_super(), and end up holding a reference to - * an inode while the filesystem is unmounted. - * So we try to get s_umount, and make sure s_root isn't NULL. - * (Take a local copy of s_umount to avoid a use-after-free of - * `dentry'). + * We need to be sure this filesystem isn't being unmounted, + * otherwise we could race with generic_shutdown_super(), and + * end up holding a reference to an inode while the filesystem + * is unmounted. So we try to get s_umount, and make sure + * s_root isn't NULL. */ - s_umount = &dentry->d_sb->s_umount; - if (down_read_trylock(s_umount)) { - if (dentry->d_sb->s_root != NULL) { - prune_one_dentry(dentry); - up_read(s_umount); - continue; + if (down_read_trylock(&sb->s_umount)) { + if ((sb->s_root != NULL) && + (!list_empty(&sb->s_dentry_lru))) { + spin_unlock(&dcache_lock); + __shrink_dcache_sb(sb, &w_count, + DCACHE_REFERENCED); + pruned -= w_count; + spin_lock(&dcache_lock); } - up_read(s_umount); + up_read(&sb->s_umount); } - spin_unlock(&dentry->d_lock); + spin_lock(&sb_lock); + count -= pruned; /* - * Insert dentry at the head of the list as inserting at the - * tail leads to a cycle. + * restart only when sb is no longer on the list and + * we have more work to do. */ - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + if (__put_super_and_need_restart(sb) && count > 0) { + spin_unlock(&sb_lock); + goto restart; + } } + spin_unlock(&sb_lock); spin_unlock(&dcache_lock); } -/* - * Shrink the dcache for the specified super block. - * This allows us to unmount a device without disturbing - * the dcache for the other devices. - * - * This implementation makes just two traversals of the - * unused list. On the first pass we move the selected - * dentries to the most recent end, and on the second - * pass we free them. The second pass must restart after - * each dput(), but since the target dentries are all at - * the end, it's really just a single traversal. - */ - /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -541,44 +601,9 @@ static void prune_dcache(int count, struct super_block *sb) * is used to free the dcache before unmounting a file * system */ - void shrink_dcache_sb(struct super_block * sb) { - struct list_head *tmp, *next; - struct dentry *dentry; - - /* - * Pass one ... move the dentries for the specified - * superblock to the most recent end of the unused list. - */ - spin_lock(&dcache_lock); - list_for_each_prev_safe(tmp, next, &dentry_unused) { - dentry = list_entry(tmp, struct dentry, d_lru); - if (dentry->d_sb != sb) - continue; - list_move_tail(tmp, &dentry_unused); - } - - /* - * Pass two ... free the dentries for this superblock. - */ -repeat: - list_for_each_prev_safe(tmp, next, &dentry_unused) { - dentry = list_entry(tmp, struct dentry, d_lru); - if (dentry->d_sb != sb) - continue; - dentry_stat.nr_unused--; - list_del_init(tmp); - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { - spin_unlock(&dentry->d_lock); - continue; - } - prune_one_dentry(dentry); - cond_resched_lock(&dcache_lock); - goto repeat; - } - spin_unlock(&dcache_lock); + __shrink_dcache_sb(sb, NULL, 0); } /* @@ -595,7 +620,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) /* detach this root from the system */ spin_lock(&dcache_lock); - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); __d_drop(dentry); spin_unlock(&dcache_lock); @@ -609,7 +634,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) spin_lock(&dcache_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { - dentry_lru_remove(loop); + dentry_lru_del_init(loop); __d_drop(loop); cond_resched_lock(&dcache_lock); } @@ -791,14 +816,13 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - dentry_lru_remove(dentry); + dentry_lru_del_init(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add_tail(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; + dentry_lru_add_tail(dentry); found++; } @@ -840,10 +864,11 @@ out: void shrink_dcache_parent(struct dentry * parent) { + struct super_block *sb = parent->d_sb; int found; while ((found = select_parent(parent)) != 0) - prune_dcache(found, parent->d_sb); + __shrink_dcache_sb(sb, &found, 0); } /* @@ -863,7 +888,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask) if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr, NULL); + prune_dcache(nr); } return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } @@ -1215,7 +1240,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while * lookup is going on. * - * dentry_unused list is not updated even if lookup finds the required dentry + * The dentry unused LRU is not updated even if lookup finds the required dentry * in there. It is updated in places such as prune_dcache, shrink_dcache_sb, * select_parent and __dget_locked. This laziness saves lookup from dcache_lock * acquisition. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e9602d8..08e28c9 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -309,6 +309,31 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, } EXPORT_SYMBOL_GPL(debugfs_create_symlink); +static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) +{ + int ret = 0; + + if (debugfs_positive(dentry)) { + if (dentry->d_inode) { + dget(dentry); + switch (dentry->d_inode->i_mode & S_IFMT) { + case S_IFDIR: + ret = simple_rmdir(parent->d_inode, dentry); + break; + case S_IFLNK: + kfree(dentry->d_inode->i_private); + /* fall through */ + default: + simple_unlink(parent->d_inode, dentry); + break; + } + if (!ret) + d_delete(dentry); + dput(dentry); + } + } +} + /** * debugfs_remove - removes a file or directory from the debugfs filesystem * @dentry: a pointer to a the dentry of the file or directory to be @@ -325,7 +350,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_symlink); void debugfs_remove(struct dentry *dentry) { struct dentry *parent; - int ret = 0; if (!dentry) return; @@ -335,29 +359,83 @@ void debugfs_remove(struct dentry *dentry) return; mutex_lock(&parent->d_inode->i_mutex); - if (debugfs_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - break; - case S_IFLNK: - kfree(dentry->d_inode->i_private); - /* fall through */ - default: - simple_unlink(parent->d_inode, dentry); + __debugfs_remove(dentry, parent); + mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); +} +EXPORT_SYMBOL_GPL(debugfs_remove); + +/** + * debugfs_remove_recursive - recursively removes a directory + * @dentry: a pointer to a the dentry of the directory to be removed. + * + * This function recursively removes a directory tree in debugfs that + * was previously created with a call to another debugfs function + * (like debugfs_create_file() or variants thereof.) + * + * This function is required to be called in order for the file to be + * removed, no automatic cleanup of files will happen when a module is + * removed, you are responsible here. + */ +void debugfs_remove_recursive(struct dentry *dentry) +{ + struct dentry *child; + struct dentry *parent; + + if (!dentry) + return; + + parent = dentry->d_parent; + if (!parent || !parent->d_inode) + return; + + parent = dentry; + mutex_lock(&parent->d_inode->i_mutex); + + while (1) { + /* + * When all dentries under "parent" has been removed, + * walk up the tree until we reach our starting point. + */ + if (list_empty(&parent->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + if (parent == dentry) break; - } - if (!ret) - d_delete(dentry); - dput(dentry); + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + } + child = list_entry(parent->d_subdirs.next, struct dentry, + d_u.d_child); + + /* + * If "child" isn't empty, walk down the tree and + * remove all its descendants first. + */ + if (!list_empty(&child->d_subdirs)) { + mutex_unlock(&parent->d_inode->i_mutex); + parent = child; + mutex_lock(&parent->d_inode->i_mutex); + continue; } + __debugfs_remove(child, parent); + if (parent->d_subdirs.next == &child->d_u.d_child) { + /* + * Avoid infinite loop if we fail to remove + * one dentry. + */ + mutex_unlock(&parent->d_inode->i_mutex); + break; + } + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } + + parent = dentry->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + __debugfs_remove(dentry, parent); mutex_unlock(&parent->d_inode->i_mutex); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove); +EXPORT_SYMBOL_GPL(debugfs_remove_recursive); /** * debugfs_rename - rename a file/directory in the debugfs filesystem diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 78878c5..eba87ff 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -116,7 +116,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, if (xop->callback == NULL) wait_event(recv_wq, (op->done != 0)); else { - rv = -EINPROGRESS; + rv = FILE_LOCK_DEFERRED; goto out; } @@ -562,6 +562,8 @@ static struct shrinker dqcache_shrinker = { */ static void dqput(struct dquot *dquot) { + int ret; + if (!dquot) return; #ifdef __DQUOT_PARANOIA @@ -594,7 +596,19 @@ we_slept: if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ - dquot->dq_sb->dq_op->write_dquot(dquot); + ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + printk(KERN_ERR "VFS: cannot write quota structure on " + "device %s (error %d). Quota may get out of " + "sync!\n", dquot->dq_sb->s_id, ret); + /* + * We clear dirty bit anyway, so that we avoid + * infinite loop here + */ + spin_lock(&dq_list_lock); + clear_dquot_dirty(dquot); + spin_unlock(&dq_list_lock); + } goto we_slept; } /* Clear flag in case dquot was inactive (something bad happened) */ @@ -875,7 +889,10 @@ static void print_warning(struct dquot *dquot, const int warntype) char *msg = NULL; struct tty_struct *tty; - if (!need_print_warning(dquot)) + if (warntype == QUOTA_NL_IHARDBELOW || + warntype == QUOTA_NL_ISOFTBELOW || + warntype == QUOTA_NL_BHARDBELOW || + warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) return; mutex_lock(&tty_mutex); @@ -1083,6 +1100,35 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war return QUOTA_OK; } +static int info_idq_free(struct dquot *dquot, ulong inodes) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_NOWARN; + + if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) + return QUOTA_NL_ISOFTBELOW; + if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && + dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit) + return QUOTA_NL_IHARDBELOW; + return QUOTA_NL_NOWARN; +} + +static int info_bdq_free(struct dquot *dquot, qsize_t space) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_NOWARN; + + if (toqb(dquot->dq_dqb.dqb_curspace - space) <= + dquot->dq_dqb.dqb_bsoftlimit) + return QUOTA_NL_BSOFTBELOW; + if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && + toqb(dquot->dq_dqb.dqb_curspace - space) < + dquot->dq_dqb.dqb_bhardlimit) + return QUOTA_NL_BHARDBELOW; + return QUOTA_NL_NOWARN; +} /* * Initialize quota pointers in inode * Transaction must be started at entry @@ -1139,6 +1185,28 @@ int dquot_drop(struct inode *inode) return 0; } +/* Wrapper to remove references to quota structures from inode */ +void vfs_dq_drop(struct inode *inode) +{ + /* Here we can get arbitrary inode from clear_inode() so we have + * to be careful. OTOH we don't need locking as quota operations + * are allowed to change only at mount time */ + if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op + && inode->i_sb->dq_op->drop) { + int cnt; + /* Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (inode->i_dquot[cnt] != NODQUOT) + break; + if (cnt < MAXQUOTAS) + inode->i_sb->dq_op->drop(inode); + } +} + /* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) @@ -1248,6 +1316,7 @@ warn_put_all: int dquot_free_space(struct inode *inode, qsize_t number) { unsigned int cnt; + char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1256,6 +1325,7 @@ out_sub: inode_sub_bytes(inode, number); return QUOTA_OK; } + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { @@ -1266,6 +1336,7 @@ out_sub: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; + warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); dquot_decr_space(inode->i_dquot[cnt], number); } inode_sub_bytes(inode, number); @@ -1274,6 +1345,7 @@ out_sub: for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); + flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } @@ -1284,11 +1356,13 @@ out_sub: int dquot_free_inode(const struct inode *inode, unsigned long number) { unsigned int cnt; + char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) return QUOTA_OK; + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { @@ -1299,6 +1373,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] == NODQUOT) continue; + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); @@ -1306,6 +1381,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (inode->i_dquot[cnt]) mark_dquot_dirty(inode->i_dquot[cnt]); + flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } @@ -1323,7 +1399,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) struct dquot *transfer_to[MAXQUOTAS]; int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; - char warntype[MAXQUOTAS]; + char warntype_to[MAXQUOTAS]; + char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ @@ -1332,7 +1409,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* Clear the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_to[cnt] = transfer_from[cnt] = NODQUOT; - warntype[cnt] = QUOTA_NL_NOWARN; + warntype_to[cnt] = QUOTA_NL_NOWARN; } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Now recheck reliably when holding dqptr_sem */ @@ -1364,8 +1441,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (transfer_to[cnt] == NODQUOT) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA || - check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA) + if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == + NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, + warntype_to + cnt) == NO_QUOTA) goto warn_put_all; } @@ -1381,6 +1459,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { + warntype_from_inodes[cnt] = + info_idq_free(transfer_from[cnt], 1); + warntype_from_space[cnt] = + info_bdq_free(transfer_from[cnt], space); dquot_decr_inodes(transfer_from[cnt], 1); dquot_decr_space(transfer_from[cnt], space); } @@ -1400,7 +1482,9 @@ warn_put_all: if (transfer_to[cnt]) mark_dquot_dirty(transfer_to[cnt]); } - flush_warnings(transfer_to, warntype); + flush_warnings(transfer_to, warntype_to); + flush_warnings(transfer_from, warntype_from_inodes); + flush_warnings(transfer_from, warntype_from_space); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) @@ -1412,6 +1496,18 @@ warn_put_all: return ret; } +/* Wrapper for transferring ownership of an inode */ +int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +{ + if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { + vfs_dq_init(inode); + if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) + return 1; + } + return 0; +} + + /* * Write info of quota file to disk */ @@ -1752,6 +1848,22 @@ out: return error; } +/* Wrapper to turn on quotas when remounting rw */ +int vfs_dq_quota_on_remount(struct super_block *sb) +{ + int cnt; + int ret = 0, err; + + if (!sb->s_qcop || !sb->s_qcop->quota_on) + return -ENOSYS; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); + if (err < 0 && !ret) + ret = err; + } + return ret; +} + /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) { @@ -2087,8 +2199,11 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); +EXPORT_SYMBOL(vfs_dq_drop); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); EXPORT_SYMBOL(dquot_free_inode); EXPORT_SYMBOL(dquot_transfer); +EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(vfs_dq_quota_on_remount); diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 1e34a7f..b4755a8 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e2832bc..7b99917f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -33,6 +33,7 @@ #include <linux/crypto.h> #include <linux/file.h> #include <linux/scatterlist.h> +#include <asm/unaligned.h> #include "ecryptfs_kernel.h" static int @@ -1032,10 +1033,8 @@ static int contains_ecryptfs_marker(char *data) { u32 m_1, m_2; - memcpy(&m_1, data, 4); - m_1 = be32_to_cpu(m_1); - memcpy(&m_2, (data + 4), 4); - m_2 = be32_to_cpu(m_2); + m_1 = get_unaligned_be32(data); + m_2 = get_unaligned_be32(data + 4); if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) return 1; ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " @@ -1073,8 +1072,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, int i; u32 flags; - memcpy(&flags, page_virt, 4); - flags = be32_to_cpu(flags); + flags = get_unaligned_be32(page_virt); for (i = 0; i < ((sizeof(ecryptfs_flag_map) / sizeof(struct ecryptfs_flag_map_elem))); i++) if (flags & ecryptfs_flag_map[i].file_flag) { @@ -1100,11 +1098,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written) get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); - m_1 = cpu_to_be32(m_1); - memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); - m_2 = cpu_to_be32(m_2); - memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2, - (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + put_unaligned_be32(m_1, page_virt); + page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2); + put_unaligned_be32(m_2, page_virt); (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; } @@ -1121,8 +1117,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); - flags = cpu_to_be32(flags); - memcpy(page_virt, &flags, 4); + put_unaligned_be32(flags, page_virt); (*written) = 4; } @@ -1238,11 +1233,9 @@ ecryptfs_write_header_metadata(char *virt, num_header_extents_at_front = (u16)(crypt_stat->num_header_bytes_at_front / crypt_stat->extent_size); - header_extent_size = cpu_to_be32(header_extent_size); - memcpy(virt, &header_extent_size, 4); + put_unaligned_be32(header_extent_size, virt); virt += 4; - num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front); - memcpy(virt, &num_header_extents_at_front, 2); + put_unaligned_be16(num_header_extents_at_front, virt); (*written) = 6; } @@ -1410,15 +1403,13 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, u32 header_extent_size; u16 num_header_extents_at_front; - memcpy(&header_extent_size, virt, sizeof(u32)); - header_extent_size = be32_to_cpu(header_extent_size); - virt += sizeof(u32); - memcpy(&num_header_extents_at_front, virt, sizeof(u16)); - num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front); + header_extent_size = get_unaligned_be32(virt); + virt += sizeof(__be32); + num_header_extents_at_front = get_unaligned_be16(virt); crypt_stat->num_header_bytes_at_front = (((size_t)num_header_extents_at_front * (size_t)header_extent_size)); - (*bytes_read) = (sizeof(u32) + sizeof(u16)); + (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) && (crypt_stat->num_header_bytes_at_front < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c15c257..b73fb75 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -559,10 +559,25 @@ extern struct kmem_cache *ecryptfs_key_record_cache; extern struct kmem_cache *ecryptfs_key_sig_cache; extern struct kmem_cache *ecryptfs_global_auth_tok_cache; extern struct kmem_cache *ecryptfs_key_tfm_cache; +extern struct kmem_cache *ecryptfs_open_req_cache; +struct ecryptfs_open_req { +#define ECRYPTFS_REQ_PROCESSED 0x00000001 +#define ECRYPTFS_REQ_DROPPED 0x00000002 +#define ECRYPTFS_REQ_ZOMBIE 0x00000004 + u32 flags; + struct file **lower_file; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + wait_queue_head_t wait; + struct mutex mux; + struct list_head kthread_ctl_list; +}; + +#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, - int flag); + u32 flags); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, const char *name, int length, @@ -690,5 +705,11 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_init_kthread(void); +void ecryptfs_destroy_kthread(void); +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt); +int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 24749bf..9244d65 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -192,6 +192,23 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } + if (!ecryptfs_inode_to_private(inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c92cc1c..d755455 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -31,6 +31,7 @@ #include <linux/mount.h> #include <linux/crypto.h> #include <linux/fs_stack.h> +#include <asm/unaligned.h> #include "ecryptfs_kernel.h" static struct dentry *lock_parent(struct dentry *dentry) @@ -188,6 +189,16 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) "context; rc = [%d]\n", rc); goto out; } + if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + ecryptfs_dentry->d_name.name, rc); + goto out; + } + } rc = ecryptfs_write_metadata(ecryptfs_dentry); if (rc) { printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); @@ -307,10 +318,11 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, NULL); goto out; } - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1); + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, + ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { ecryptfs_printk(KERN_ERR, "Error interposing\n"); - goto out_dput; + goto out; } if (S_ISDIR(lower_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); @@ -336,11 +348,21 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, rc = -ENOMEM; ecryptfs_printk(KERN_ERR, "Cannot ecryptfs_kmalloc a page\n"); - goto out_dput; + goto out; } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) ecryptfs_set_default_sizes(crypt_stat); + if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(dentry); + if (rc) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the persistent file for the dentry with name " + "[%s]; rc = [%d]\n", __func__, + dentry->d_name.name, rc); + goto out; + } + } rc = ecryptfs_read_and_validate_header_region(page_virt, dentry->d_inode); if (rc) { @@ -364,8 +386,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, else file_size = i_size_read(lower_dentry->d_inode); } else { - memcpy(&file_size, page_virt, sizeof(file_size)); - file_size = be64_to_cpu(file_size); + file_size = get_unaligned_be64(page_virt); } i_size_write(dentry->d_inode, (loff_t)file_size); kmem_cache_free(ecryptfs_header_cache_2, page_virt); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index e82b457..f5b76a3 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -44,15 +44,15 @@ static int process_request_key_err(long err_code) int rc = 0; switch (err_code) { - case ENOKEY: + case -ENOKEY: ecryptfs_printk(KERN_WARNING, "No key\n"); rc = -ENOENT; break; - case EKEYEXPIRED: + case -EKEYEXPIRED: ecryptfs_printk(KERN_WARNING, "Key expired\n"); rc = -ETIME; break; - case EKEYREVOKED: + case -EKEYREVOKED: ecryptfs_printk(KERN_WARNING, "Key revoked\n"); rc = -EINVAL; break; @@ -963,8 +963,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) { printk(KERN_ERR "Could not find key with description: [%s]\n", sig); - process_request_key_err(PTR_ERR(*auth_tok_key)); - rc = -EINVAL; + rc = process_request_key_err(PTR_ERR(*auth_tok_key)); goto out; } (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c new file mode 100644 index 0000000..c440c6b --- /dev/null +++ b/fs/ecryptfs/kthread.c @@ -0,0 +1,203 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/wait.h> +#include <linux/mount.h> +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_open_req_cache; + +static struct ecryptfs_kthread_ctl { +#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001 + u32 flags; + struct mutex mux; + struct list_head req_list; + wait_queue_head_t wait; +} ecryptfs_kthread_ctl; + +static struct task_struct *ecryptfs_kthread; + +/** + * ecryptfs_threadfn + * @ignored: ignored + * + * The eCryptfs kernel thread that has the responsibility of getting + * the lower persistent file with RW permissions. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_threadfn(void *ignored) +{ + set_freezable(); + while (1) { + struct ecryptfs_open_req *req; + + wait_event_freezable( + ecryptfs_kthread_ctl.wait, + (!list_empty(&ecryptfs_kthread_ctl.req_list) + || kthread_should_stop())); + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + mutex_unlock(&ecryptfs_kthread_ctl.mux); + goto out; + } + while (!list_empty(&ecryptfs_kthread_ctl.req_list)) { + req = list_first_entry(&ecryptfs_kthread_ctl.req_list, + struct ecryptfs_open_req, + kthread_ctl_list); + mutex_lock(&req->mux); + list_del(&req->kthread_ctl_list); + if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) { + dget(req->lower_dentry); + mntget(req->lower_mnt); + (*req->lower_file) = dentry_open( + req->lower_dentry, req->lower_mnt, + (O_RDWR | O_LARGEFILE)); + req->flags |= ECRYPTFS_REQ_PROCESSED; + } + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + } +out: + return 0; +} + +int ecryptfs_init_kthread(void) +{ + int rc = 0; + + mutex_init(&ecryptfs_kthread_ctl.mux); + init_waitqueue_head(&ecryptfs_kthread_ctl.wait); + INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list); + ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL, + "ecryptfs-kthread"); + if (IS_ERR(ecryptfs_kthread)) { + rc = PTR_ERR(ecryptfs_kthread); + printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]" + "\n", __func__, rc); + } + return rc; +} + +void ecryptfs_destroy_kthread(void) +{ + struct ecryptfs_open_req *req; + + mutex_lock(&ecryptfs_kthread_ctl.mux); + ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; + list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { + mutex_lock(&req->mux); + req->flags |= ECRYPTFS_REQ_ZOMBIE; + wake_up(&req->wait); + mutex_unlock(&req->mux); + } + mutex_unlock(&ecryptfs_kthread_ctl.mux); + kthread_stop(ecryptfs_kthread); + wake_up(&ecryptfs_kthread_ctl.wait); +} + +/** + * ecryptfs_privileged_open + * @lower_file: Result of dentry_open by root on lower dentry + * @lower_dentry: Lower dentry for file to open + * @lower_mnt: Lower vfsmount for file to open + * + * This function gets a r/w file opened againt the lower dentry. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_privileged_open(struct file **lower_file, + struct dentry *lower_dentry, + struct vfsmount *lower_mnt) +{ + struct ecryptfs_open_req *req; + int rc = 0; + + /* Corresponding dput() and mntput() are done when the + * persistent file is fput() when the eCryptfs inode is + * destroyed. */ + dget(lower_dentry); + mntget(lower_mnt); + (*lower_file) = dentry_open(lower_dentry, lower_mnt, + (O_RDWR | O_LARGEFILE)); + if (!IS_ERR(*lower_file)) + goto out; + req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + goto out; + } + mutex_init(&req->mux); + req->lower_file = lower_file; + req->lower_dentry = lower_dentry; + req->lower_mnt = lower_mnt; + init_waitqueue_head(&req->wait); + req->flags = 0; + mutex_lock(&ecryptfs_kthread_ctl.mux); + if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) { + rc = -EIO; + mutex_unlock(&ecryptfs_kthread_ctl.mux); + printk(KERN_ERR "%s: We are in the middle of shutting down; " + "aborting privileged request to open lower file\n", + __func__); + goto out_free; + } + list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list); + mutex_unlock(&ecryptfs_kthread_ctl.mux); + wake_up(&ecryptfs_kthread_ctl.wait); + wait_event(req->wait, (req->flags != 0)); + mutex_lock(&req->mux); + BUG_ON(req->flags == 0); + if (req->flags & ECRYPTFS_REQ_DROPPED + || req->flags & ECRYPTFS_REQ_ZOMBIE) { + rc = -EIO; + printk(KERN_WARNING "%s: Privileged open request dropped\n", + __func__); + goto out_unlock; + } + if (IS_ERR(*req->lower_file)) { + rc = PTR_ERR(*req->lower_file); + dget(lower_dentry); + mntget(lower_mnt); + (*lower_file) = dentry_open(lower_dentry, lower_mnt, + (O_RDONLY | O_LARGEFILE)); + if (IS_ERR(*lower_file)) { + rc = PTR_ERR(*req->lower_file); + (*lower_file) = NULL; + printk(KERN_WARNING "%s: Error attempting privileged " + "open of lower file with either RW or RO " + "perms; rc = [%d]. Giving up.\n", + __func__, rc); + } + } +out_unlock: + mutex_unlock(&req->mux); +out_free: + kmem_cache_free(ecryptfs_open_req_cache, req); +out: + return rc; +} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d603631..6f403cf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -117,7 +117,7 @@ void __ecryptfs_printk(const char *fmt, ...) * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) +int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) { struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); @@ -130,26 +130,12 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); - /* Corresponding dput() and mntput() are done when the - * persistent file is fput() when the eCryptfs inode - * is destroyed. */ - dget(lower_dentry); - mntget(lower_mnt); - inode_info->lower_file = dentry_open(lower_dentry, - lower_mnt, - (O_RDWR | O_LARGEFILE)); - if (IS_ERR(inode_info->lower_file)) { - dget(lower_dentry); - mntget(lower_mnt); - inode_info->lower_file = dentry_open(lower_dentry, - lower_mnt, - (O_RDONLY - | O_LARGEFILE)); - } - if (IS_ERR(inode_info->lower_file)) { + rc = ecryptfs_privileged_open(&inode_info->lower_file, + lower_dentry, lower_mnt); + if (rc || IS_ERR(inode_info->lower_file)) { printk(KERN_ERR "Error opening lower persistent file " - "for lower_dentry [0x%p] and lower_mnt [0x%p]\n", - lower_dentry, lower_mnt); + "for lower_dentry [0x%p] and lower_mnt [0x%p]; " + "rc = [%d]\n", lower_dentry, lower_mnt, rc); rc = PTR_ERR(inode_info->lower_file); inode_info->lower_file = NULL; } @@ -163,14 +149,14 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) * @lower_dentry: Existing dentry in the lower filesystem * @dentry: ecryptfs' dentry * @sb: ecryptfs's super_block - * @flag: If set to true, then d_add is called, else d_instantiate is called + * @flags: flags to govern behavior of interpose procedure * * Interposes upper and lower dentries. * * Returns zero on success; non-zero otherwise */ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, - struct super_block *sb, int flag) + struct super_block *sb, u32 flags) { struct inode *lower_inode; struct inode *inode; @@ -207,7 +193,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - if (flag) + if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) d_add(dentry, inode); else d_instantiate(dentry, inode); @@ -215,13 +201,6 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); - rc = ecryptfs_init_persistent_file(dentry); - if (rc) { - printk(KERN_ERR "%s: Error attempting to initialize the " - "persistent file for the dentry with name [%s]; " - "rc = [%d]\n", __func__, dentry->d_name.name, rc); - goto out; - } out: return rc; } @@ -262,10 +241,11 @@ static int ecryptfs_init_global_auth_toks( "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; - rc = 0; + goto out; } else global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; } +out: return rc; } @@ -314,7 +294,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) char *cipher_name_dst; char *cipher_name_src; char *cipher_key_bytes_src; - int cipher_name_len; if (!options) { rc = -EINVAL; @@ -395,17 +374,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) goto out; } if (!cipher_name_set) { - cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - if (unlikely(cipher_name_len - >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) { - rc = -EINVAL; - BUG(); - goto out; - } - memcpy(mount_crypt_stat->global_default_cipher_name, - ECRYPTFS_DEFAULT_CIPHER, cipher_name_len); - mount_crypt_stat->global_default_cipher_name[cipher_name_len] - = '\0'; + int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + + BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + + strcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER); } if (!cipher_key_bytes_set) { mount_crypt_stat->global_default_cipher_key_size = 0; @@ -430,7 +404,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); } - rc = 0; out: return rc; } @@ -679,6 +652,11 @@ static struct ecryptfs_cache_info { .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, + { + .cache = &ecryptfs_open_req_cache, + .name = "ecryptfs_open_req_cache", + .size = sizeof(struct ecryptfs_open_req), + }, }; static void ecryptfs_free_kmem_caches(void) @@ -795,11 +773,17 @@ static int __init ecryptfs_init(void) printk(KERN_ERR "sysfs registration failed\n"); goto out_unregister_filesystem; } + rc = ecryptfs_init_kthread(); + if (rc) { + printk(KERN_ERR "%s: kthread initialization failed; " + "rc = [%d]\n", __func__, rc); + goto out_do_sysfs_unregistration; + } rc = ecryptfs_init_messaging(ecryptfs_transport); if (rc) { - ecryptfs_printk(KERN_ERR, "Failure occured while attempting to " + printk(KERN_ERR "Failure occured while attempting to " "initialize the eCryptfs netlink socket\n"); - goto out_do_sysfs_unregistration; + goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { @@ -814,6 +798,8 @@ static int __init ecryptfs_init(void) goto out; out_release_messaging: ecryptfs_release_messaging(ecryptfs_transport); +out_destroy_kthread: + ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_unregister_filesystem: @@ -833,6 +819,7 @@ static void __exit ecryptfs_exit(void) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 09a4522..b484792 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -358,46 +358,6 @@ out_unlock_daemon: } /** - * ecryptfs_miscdev_helo - * @euid: effective user id of miscdevess sending helo packet - * @user_ns: The namespace in which @euid applies - * @pid: miscdevess id of miscdevess sending helo packet - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - int rc; - - rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns, - pid); - if (rc) - printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_miscdev_quit - * @euid: effective user id of miscdevess sending quit packet - * @user_ns: The namespace in which @euid applies - * @pid: miscdevess id of miscdevess sending quit packet - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - int rc; - - rc = ecryptfs_process_quit(euid, user_ns, pid); - if (rc) - printk(KERN_WARNING - "Error processing QUIT message; rc = [%d]\n", rc); - return rc; -} - -/** * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon * @data: Bytes comprising struct ecryptfs_message * @data_size: sizeof(struct ecryptfs_message) + data len @@ -512,26 +472,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, __func__, rc); break; case ECRYPTFS_MSG_HELO: - rc = ecryptfs_miscdev_helo(current->euid, - current->nsproxy->user_ns, - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to process " - "helo from pid [0x%p]; rc = [%d]\n", __func__, - task_pid(current), rc); - goto out_free; - } - break; case ECRYPTFS_MSG_QUIT: - rc = ecryptfs_miscdev_quit(current->euid, - current->nsproxy->user_ns, - task_pid(current)); - if (rc) { - printk(KERN_ERR "%s: Error attempting to process " - "quit from pid [0x%p]; rc = [%d]\n", __func__, - task_pid(current), rc); - goto out_free; - } break; default: ecryptfs_printk(KERN_WARNING, "Dropping miscdev " diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2b6fe1e..245c2dc 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -32,6 +32,7 @@ #include <linux/file.h> #include <linux/crypto.h> #include <linux/scatterlist.h> +#include <asm/unaligned.h> #include "ecryptfs_kernel.h" /** @@ -372,7 +373,6 @@ out: */ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) { - u64 file_size; char *file_size_virt; int rc; @@ -381,9 +381,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - file_size = (u64)i_size_read(ecryptfs_inode); - file_size = cpu_to_be64(file_size); - memcpy(file_size_virt, &file_size, sizeof(u64)); + put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt); rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0, sizeof(u64)); kfree(file_size_virt); @@ -403,7 +401,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) struct dentry *lower_dentry = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; struct inode *lower_inode = lower_dentry->d_inode; - u64 file_size; int rc; if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { @@ -424,9 +421,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) xattr_virt, PAGE_CACHE_SIZE); if (size < 0) size = 8; - file_size = (u64)i_size_read(ecryptfs_inode); - file_size = cpu_to_be64(file_size); - memcpy(xattr_virt, &file_size, sizeof(u64)); + put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); mutex_unlock(&lower_inode->i_mutex); diff --git a/fs/eventfd.c b/fs/eventfd.c index 343942d..08bf558 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -198,11 +198,18 @@ struct file *eventfd_fget(int fd) return file; } -asmlinkage long sys_eventfd(unsigned int count) +asmlinkage long sys_eventfd2(unsigned int count, int flags) { int fd; struct eventfd_ctx *ctx; + /* Check the EFD_* constants for consistency. */ + BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK)) + return -EINVAL; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; @@ -214,9 +221,15 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx); + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, + flags & (O_CLOEXEC | O_NONBLOCK)); if (fd < 0) kfree(ctx); return fd; } +asmlinkage long sys_eventfd(unsigned int count) +{ + return sys_eventfd2(count, 0); +} + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 990c01d..0c87474 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1046,20 +1046,25 @@ retry: * RB tree. With the current implementation, the "size" parameter is ignored * (besides sanity checks). */ -asmlinkage long sys_epoll_create(int size) +asmlinkage long sys_epoll_create1(int flags) { int error, fd = -1; struct eventpoll *ep; + /* Check the EPOLL_* constant for consistency. */ + BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); + + if (flags & ~EPOLL_CLOEXEC) + return -EINVAL; + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", - current, size)); + current, flags)); /* - * Sanity check on the size parameter, and create the internal data - * structure ( "struct eventpoll" ). + * Create the internal data structure ( "struct eventpoll" ). */ - error = -EINVAL; - if (size <= 0 || (error = ep_alloc(&ep)) < 0) { + error = ep_alloc(&ep); + if (error < 0) { fd = error; goto error_return; } @@ -1068,17 +1073,26 @@ asmlinkage long sys_epoll_create(int size) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep); + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + flags & O_CLOEXEC); if (fd < 0) ep_free(ep); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, size, fd)); + current, flags, fd)); return fd; } +asmlinkage long sys_epoll_create(int size) +{ + if (size < 0) + return -EINVAL; + + return sys_epoll_create1(0); +} + /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of @@ -25,19 +25,18 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/mman.h> +#include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/smp_lock.h> +#include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> -#include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> -#include <linux/swap.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> @@ -47,7 +46,6 @@ #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> -#include <linux/rmap.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> @@ -541,7 +539,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(&tlb, new_end, old_end, new_end, + free_pgd_range(tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -550,7 +548,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(&tlb, old_start, old_end, new_end, + free_pgd_range(tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } tlb_finish_mmu(tlb, new_end, old_end); @@ -724,12 +722,10 @@ static int exec_mmap(struct mm_struct *mm) * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going * through with the exec. We must hold mmap_sem around - * checking core_waiters and changing tsk->mm. The - * core-inducing thread will increment core_waiters for - * each thread whose ->mm == old_mm. + * checking core_state and changing tsk->mm. */ down_read(&old_mm->mmap_sem); - if (unlikely(old_mm->core_waiters)) { + if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); return -EINTR; } @@ -1328,6 +1324,7 @@ int do_execve(char * filename, if (retval < 0) goto out; + current->flags &= ~PF_KTHREAD; retval = search_binary_handler(bprm,regs); if (retval >= 0) { /* execve success */ @@ -1382,17 +1379,14 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, const char *pattern, long signr) +static int format_corename(char *corename, int nr_threads, long signr) { - const char *pat_ptr = pattern; + const char *pat_ptr = core_pattern; + int ispipe = (*pat_ptr == '|'); char *out_ptr = corename; char *const out_end = corename + CORENAME_MAX_SIZE; int rc; int pid_in_pattern = 0; - int ispipe = 0; - - if (*pattern == '|') - ispipe = 1; /* Repeat as long as we have more pattern to process and more output space */ @@ -1493,7 +1487,7 @@ static int format_corename(char *corename, const char *pattern, long signr) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ if (!ispipe && !pid_in_pattern - && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { + && (core_uses_pid || nr_threads)) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1505,9 +1499,10 @@ out: return ispipe; } -static void zap_process(struct task_struct *start) +static int zap_process(struct task_struct *start) { struct task_struct *t; + int nr = 0; start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_stop_count = 0; @@ -1515,72 +1510,99 @@ static void zap_process(struct task_struct *start) t = start; do { if (t != current && t->mm) { - t->mm->core_waiters++; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); + nr++; } - } while ((t = next_thread(t)) != start); + } while_each_thread(start, t); + + return nr; } static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, - int exit_code) + struct core_state *core_state, int exit_code) { struct task_struct *g, *p; unsigned long flags; - int err = -EAGAIN; + int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { + mm->core_state = core_state; tsk->signal->group_exit_code = exit_code; - zap_process(tsk); - err = 0; + nr = zap_process(tsk); } spin_unlock_irq(&tsk->sighand->siglock); - if (err) - return err; + if (unlikely(nr < 0)) + return nr; - if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + if (atomic_read(&mm->mm_users) == nr + 1) goto done; - + /* + * We should find and kill all tasks which use this mm, and we should + * count them correctly into ->nr_threads. We don't take tasklist + * lock, but this is safe wrt: + * + * fork: + * None of sub-threads can fork after zap_process(leader). All + * processes which were created before this point should be + * visible to zap_threads() because copy_process() adds the new + * process to the tail of init_task.tasks list, and lock/unlock + * of ->siglock provides a memory barrier. + * + * do_exit: + * The caller holds mm->mmap_sem. This means that the task which + * uses this mm can't pass exit_mm(), so it can't exit or clear + * its ->mm. + * + * de_thread: + * It does list_replace_rcu(&leader->tasks, ¤t->tasks), + * we must see either old or new leader, this does not matter. + * However, it can change p->sighand, so lock_task_sighand(p) + * must be used. Since p->mm != NULL and we hold ->mmap_sem + * it can't fail. + * + * Note also that "g" can be the old leader with ->mm == NULL + * and already unhashed and thus removed from ->thread_group. + * This is OK, __unhash_process()->list_del_rcu() does not + * clear the ->next pointer, we will find the new leader via + * next_thread(). + */ rcu_read_lock(); for_each_process(g) { if (g == tsk->group_leader) continue; - + if (g->flags & PF_KTHREAD) + continue; p = g; do { if (p->mm) { - if (p->mm == mm) { - /* - * p->sighand can't disappear, but - * may be changed by de_thread() - */ + if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - zap_process(p); + nr += zap_process(p); unlock_task_sighand(p, &flags); } break; } - } while ((p = next_thread(p)) != g); + } while_each_thread(g, p); } rcu_read_unlock(); done: - return mm->core_waiters; + atomic_set(&core_state->nr_threads, nr); + return nr; } -static int coredump_wait(int exit_code) +static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion startup_done; struct completion *vfork_done; int core_waiters; - init_completion(&mm->core_done); - init_completion(&startup_done); - mm->core_startup_done = &startup_done; - - core_waiters = zap_threads(tsk, mm, exit_code); + init_completion(&core_state->startup); + core_state->dumper.task = tsk; + core_state->dumper.next = NULL; + core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); if (unlikely(core_waiters < 0)) @@ -1597,12 +1619,32 @@ static int coredump_wait(int exit_code) } if (core_waiters) - wait_for_completion(&startup_done); + wait_for_completion(&core_state->startup); fail: - BUG_ON(mm->core_waiters); return core_waiters; } +static void coredump_finish(struct mm_struct *mm) +{ + struct core_thread *curr, *next; + struct task_struct *task; + + next = mm->core_state->dumper.next; + while ((curr = next) != NULL) { + next = curr->next; + task = curr->task; + /* + * see exit_mm(), curr->task must not see + * ->task == NULL before we read ->next. + */ + smp_mb(); + curr->task = NULL; + wake_up_process(task); + } + + mm->core_state = NULL; +} + /* * set_dumpable converts traditional three-value dumpable to two flags and * stores them into mm->flags. It modifies lower two bits of mm->flags, but @@ -1654,6 +1696,7 @@ int get_dumpable(struct mm_struct *mm) int do_coredump(long signr, int exit_code, struct pt_regs * regs) { + struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; @@ -1677,7 +1720,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_waiters || !get_dumpable(mm)) { + if (mm->core_state || !get_dumpable(mm)) { up_write(&mm->mmap_sem); goto fail; } @@ -1692,7 +1735,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) current->fsuid = 0; /* Dump root private */ } - retval = coredump_wait(exit_code); + retval = coredump_wait(exit_code, &core_state); if (retval < 0) goto fail; @@ -1707,7 +1750,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, core_pattern, signr); + ispipe = format_corename(corename, retval, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points @@ -1786,7 +1829,7 @@ fail_unlock: argv_free(helper_argv); current->fsuid = fsuid; - complete_all(&mm->core_done); + coredump_finish(mm); fail: return retval; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ef50cbc..31308a3 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -31,6 +31,7 @@ #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/log2.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext2.h" #include "xattr.h" diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index eaa23d2..70c0dbd 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -14,7 +14,7 @@ static size_t ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (list && total_len <= list_size) { diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 83ee149..e8219f8 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -12,13 +12,11 @@ #include <linux/ext2_fs.h> #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f383e7c..92495d2 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -11,13 +11,11 @@ #include "ext2.h" #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 8ca3bfd..2eea96e 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -272,7 +272,7 @@ static void free_rb_tree_fname(struct rb_root *root) while (n) { /* Do the node's children first */ - if ((n)->rb_left) { + if (n->rb_left) { n = n->rb_left; continue; } @@ -301,24 +301,18 @@ static void free_rb_tree_fname(struct rb_root *root) parent->rb_right = NULL; n = parent; } - root->rb_node = NULL; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; p->curr_hash = pos2maj_hash(pos); p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; return p; } @@ -433,7 +427,7 @@ static int ext3_dx_readdir(struct file * filp, int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext3_htree_create_dir_info(filp->f_pos); if (!info) return -ENOMEM; filp->private_data = info; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 7712682..47b678d 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -669,6 +669,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext3_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -690,6 +698,7 @@ bad_orphan: printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 6ae4ecf..3bf07d7 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2127,7 +2127,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, if (this_bh) { BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, this_bh); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext3_journal_dirty_metadata(handle, this_bh); + else + ext3_error(inode->i_sb, "ext3_free_data", + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)this_bh->b_blocknr); } } @@ -2253,6 +2267,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, } } +int ext3_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext3_inode_is_fast_symlink(inode); + return 0; +} + /* * ext3_truncate() * @@ -2297,12 +2324,7 @@ void ext3_truncate(struct inode *inode) unsigned blocksize = inode->i_sb->s_blocksize; struct page *page; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext3_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!ext3_can_truncate(inode)) return; /* @@ -2513,6 +2535,16 @@ static int __ext3_get_inode_loc(struct inode *inode, } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0b8cf80..de13e91 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -240,13 +240,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - EXT3_DIR_REC_LEN(2) - infosize; - return 0? 20: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit (struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + return entry_space / sizeof(struct dx_entry); } /* @@ -991,19 +991,21 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, de = (struct ext3_dir_entry_2 *) bh->b_data; top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - EXT3_DIR_REC_LEN(0)); - for (; de < top; de = ext3_next_entry(de)) - if (ext3_match (namelen, name, de)) { - if (!ext3_check_dir_entry("ext3_find_entry", - dir, de, bh, - (block<<EXT3_BLOCK_SIZE_BITS(sb)) - +((char *)de - bh->b_data))) { - brelse (bh); + for (; de < top; de = ext3_next_entry(de)) { + int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; } - *res_dir = de; - dx_release (frames); - return bh; + + if (ext3_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } } brelse (bh); /* Check to see if we should continue to search */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2845425..615788c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -842,7 +842,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype; + int qtype, qfmt; char *qname; #endif @@ -1018,9 +1018,11 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + !sbi->s_qf_names[qtype]) { printk(KERN_ERR - "EXT3-fs: Cannot change journalled " + "EXT3-fs: Cannot change journaled " "quota options when quota turned on.\n"); return 0; } @@ -1056,9 +1058,11 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change " - "journalled quota options when " + "journaled quota options when " "quota turned on.\n"); return 0; } @@ -1069,10 +1073,20 @@ clear_qf_name: sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; + qfmt = QFMT_VFS_OLD; + goto set_qf_format; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + qfmt = QFMT_VFS_V0; +set_qf_format: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_jquota_fmt != qfmt) { + printk(KERN_ERR "EXT3-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + sbi->s_jquota_fmt = qfmt; break; case Opt_quota: case Opt_usrquota: @@ -1084,7 +1098,8 @@ clear_qf_name: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb)) { + if (sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) { printk(KERN_ERR "EXT3-fs: Cannot change quota " "options when quota turned on.\n"); return 0; @@ -1169,14 +1184,14 @@ clear_qf_name: } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " + printk(KERN_ERR "EXT3-fs: journaled quota format " "not specified.\n"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT3-fs: journalled quota format " - "specified with no journalling " + printk(KERN_ERR "EXT3-fs: journaled quota format " + "specified with no journaling " "enabled.\n"); return 0; } @@ -1370,7 +1385,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, int ret = ext3_quota_on_mount(sb, i); if (ret < 0) printk(KERN_ERR - "EXT3-fs: Cannot turn on journalled " + "EXT3-fs: Cannot turn on journaled " "quota: error %d\n", ret); } } @@ -2712,7 +2727,7 @@ static int ext3_release_dquot(struct dquot *dquot) static int ext3_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); @@ -2759,23 +2774,42 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota or remount? */ - if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] && - !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + /* When remounting, no checks are needed and in fact, path is NULL */ + if (remount) return vfs_quota_on(sb, type, format_id, path, remount); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; + /* Quotafile not on the same filesystem? */ if (nd.path.mnt->mnt_sb != sb) { path_put(&nd.path); return -EXDEV; } - /* Quotafile not in fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT3-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); + /* Journaling quota? */ + if (EXT3_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT3-fs: Quota file not on filesystem root. " + "Journaled quota will not work.\n"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext3_should_journal_data(nd.path.dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + } + path_put(&nd.path); return vfs_quota_on(sb, type, format_id, path, remount); } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 821efaf..37b8109 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -15,7 +15,7 @@ static size_t ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 0327497..c7c41a4 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -13,13 +13,11 @@ #include <linux/ext3_fs.h> #include "xattr.h" -#define XATTR_TRUSTED_PREFIX "trusted." - static size_t ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 1abd8f9..430fe63 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -12,13 +12,11 @@ #include <linux/ext3_fs.h> #include "xattr.h" -#define XATTR_USER_PREFIX "user." - static size_t ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; + const size_t prefix_len = XATTR_USER_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; if (!test_opt(inode->i_sb, XATTR_USER)) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 34541d0..cd4a016 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -17,7 +17,6 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/msdos_fs.h> -#include <linux/dirent.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/compat.h> @@ -124,10 +123,11 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. */ -static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, +static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, int uni_xlate, struct nls_table *nls) { - wchar_t *ip, ec; + const wchar_t *ip; + wchar_t ec; unsigned char *op, nc; int charlen; int k; @@ -167,6 +167,16 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, return (op - ascii); } +static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, + unsigned char *buf, int size) +{ + if (sbi->options.utf8) + return utf8_wcstombs(buf, uni, size); + else + return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, + sbi->nls_io); +} + static inline int fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) { @@ -227,6 +237,19 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, return len; } +static inline int fat_name_match(struct msdos_sb_info *sbi, + const unsigned char *a, int a_len, + const unsigned char *b, int b_len) +{ + if (a_len != b_len) + return 0; + + if (sbi->options.name_check != 's') + return !nls_strnicmp(sbi->nls_io, a, b, a_len); + else + return !memcmp(a, b, a_len); +} + enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; /** @@ -302,6 +325,19 @@ parse_long: } /* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + +/* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. */ @@ -312,29 +348,20 @@ int fat_search_long(struct inode *inode, const unsigned char *name, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh = NULL; struct msdos_dir_entry *de; - struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; - wchar_t bufuname[14]; unsigned char nr_slots; - int xlate_len; + wchar_t bufuname[14]; wchar_t *unicode = NULL; unsigned char work[MSDOS_NAME]; - unsigned char *bufname = NULL; - int uni_xlate = sbi->options.unicode_xlate; - int utf8 = sbi->options.utf8; - int anycase = (sbi->options.name_check != 's'); + unsigned char bufname[FAT_MAX_SHORT_SIZE]; unsigned short opt_shortname = sbi->options.shortname; loff_t cpos = 0; - int chl, i, j, last_u, err; - - bufname = __getname(); - if (!bufname) - return -ENOMEM; + int chl, i, j, last_u, err, len; err = -ENOENT; - while(1) { + while (1) { if (fat_get_entry(inode, &cpos, &bh, &de) == -1) - goto EODir; + goto end_of_dir; parse_record: nr_slots = 0; if (de->name[0] == DELETED_FLAG) @@ -353,7 +380,7 @@ parse_record: else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) - goto EODir; + goto end_of_dir; } memcpy(work, de->name, sizeof(de->name)); @@ -394,30 +421,24 @@ parse_record: if (!last_u) continue; + /* Compare shortname */ bufuname[last_u] = 0x0000; - xlate_len = utf8 - ?utf8_wcstombs(bufname, bufuname, PATH_MAX) - :uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io); - if (xlate_len == name_len) - if ((!anycase && !memcmp(name, bufname, xlate_len)) || - (anycase && !nls_strnicmp(nls_io, name, bufname, - xlate_len))) - goto Found; + len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + if (fat_name_match(sbi, name, name_len, bufname, len)) + goto found; if (nr_slots) { - xlate_len = utf8 - ?utf8_wcstombs(bufname, unicode, PATH_MAX) - :uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io); - if (xlate_len != name_len) - continue; - if ((!anycase && !memcmp(name, bufname, xlate_len)) || - (anycase && !nls_strnicmp(nls_io, name, bufname, - xlate_len))) - goto Found; + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + + /* Compare longname */ + len = fat_uni_to_x8(sbi, unicode, longname, size); + if (fat_name_match(sbi, name, name_len, longname, len)) + goto found; } } -Found: +found: nr_slots++; /* include the de */ sinfo->slot_off = cpos - nr_slots * sizeof(*de); sinfo->nr_slots = nr_slots; @@ -425,9 +446,7 @@ Found: sinfo->bh = bh; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); err = 0; -EODir: - if (bufname) - __putname(bufname); +end_of_dir: if (unicode) __putname(unicode); @@ -453,23 +472,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *de; - struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; - unsigned char long_slots; - const char *fill_name; - int fill_len; + unsigned char nr_slots; wchar_t bufuname[14]; wchar_t *unicode = NULL; - unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname; - unsigned long lpos, dummy, *furrfu = &lpos; - int uni_xlate = sbi->options.unicode_xlate; + unsigned char c, work[MSDOS_NAME]; + unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; + unsigned short opt_shortname = sbi->options.shortname; int isvfat = sbi->options.isvfat; - int utf8 = sbi->options.utf8; int nocase = sbi->options.nocase; - unsigned short opt_shortname = sbi->options.shortname; + const char *fill_name = NULL; unsigned long inum; - int chi, chl, i, i2, j, last, last_u, dotoffset = 0; + unsigned long lpos, dummy, *furrfu = &lpos; loff_t cpos; + int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; int ret = 0; lock_super(sb); @@ -489,43 +505,58 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, cpos = 0; } } - if (cpos & (sizeof(struct msdos_dir_entry)-1)) { + if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { ret = -ENOENT; goto out; } bh = NULL; -GetNew: +get_new: if (fat_get_entry(inode, &cpos, &bh, &de) == -1) - goto EODir; + goto end_of_dir; parse_record: - long_slots = 0; - /* Check for long filename entry */ - if (isvfat) { + nr_slots = 0; + /* + * Check for long filename entry, but if short_only, we don't + * need to parse long filename. + */ + if (isvfat && !short_only) { if (de->name[0] == DELETED_FLAG) - goto RecEnd; + goto record_end; if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) - goto RecEnd; + goto record_end; if (de->attr != ATTR_EXT && IS_FREE(de->name)) - goto RecEnd; + goto record_end; } else { if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) - goto RecEnd; + goto record_end; } if (isvfat && de->attr == ATTR_EXT) { int status = fat_parse_long(inode, &cpos, &bh, &de, - &unicode, &long_slots); + &unicode, &nr_slots); if (status < 0) { filp->f_pos = cpos; ret = status; goto out; } else if (status == PARSE_INVALID) - goto RecEnd; + goto record_end; else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) - goto EODir; + goto end_of_dir; + + if (nr_slots) { + void *longname = unicode + FAT_MAX_UNI_CHARS; + int size = PATH_MAX - FAT_MAX_UNI_SIZE; + int len = fat_uni_to_x8(sbi, unicode, longname, size); + + fill_name = longname; + fill_len = len; + /* !both && !short_only, so we don't need shortname. */ + if (!both) + goto start_filldir; + } } if (sbi->options.dotsOK) { @@ -587,12 +618,32 @@ parse_record: } } if (!last) - goto RecEnd; + goto record_end; i = last + dotoffset; j = last_u; - lpos = cpos - (long_slots+1)*sizeof(struct msdos_dir_entry); + if (isvfat) { + bufuname[j] = 0x0000; + i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + } + if (nr_slots) { + /* hack for fat_ioctl_filldir() */ + struct fat_ioctl_filldir_callback *p = dirent; + + p->longname = fill_name; + p->long_len = fill_len; + p->shortname = bufname; + p->short_len = i; + fill_name = NULL; + fill_len = 0; + } else { + fill_name = bufname; + fill_len = i; + } + +start_filldir: + lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) inum = inode->i_ino; else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { @@ -607,49 +658,17 @@ parse_record: inum = iunique(sb, MSDOS_ROOT_INO); } - if (isvfat) { - bufuname[j] = 0x0000; - i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname)) - : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io); - } - - fill_name = bufname; - fill_len = i; - if (!short_only && long_slots) { - /* convert the unicode long name. 261 is maximum size - * of unicode buffer. (13 * slots + nul) */ - void *longname = unicode + 261; - int buf_size = PATH_MAX - (261 * sizeof(unicode[0])); - int long_len = utf8 - ? utf8_wcstombs(longname, unicode, buf_size) - : uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io); - - if (!both) { - fill_name = longname; - fill_len = long_len; - } else { - /* hack for fat_ioctl_filldir() */ - struct fat_ioctl_filldir_callback *p = dirent; - - p->longname = longname; - p->long_len = long_len; - p->shortname = bufname; - p->short_len = i; - fill_name = NULL; - fill_len = 0; - } - } if (filldir(dirent, fill_name, fill_len, *furrfu, inum, (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) - goto FillFailed; + goto fill_failed; -RecEnd: +record_end: furrfu = &lpos; filp->f_pos = cpos; - goto GetNew; -EODir: + goto get_new; +end_of_dir: filp->f_pos = cpos; -FillFailed: +fill_failed: brelse(bh); if (unicode) __putname(unicode); @@ -715,7 +734,7 @@ efault: \ return -EFAULT; \ } -FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent) +FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) static int fat_ioctl_readdir(struct inode *inode, struct file *filp, void __user *dirent, filldir_t filldir, @@ -741,7 +760,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, static int fat_dir_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct dirent __user *d1 = (struct dirent __user *)arg; + struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; switch (cmd) { @@ -757,7 +776,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, return fat_generic_ioctl(inode, filp, cmd, arg); } - if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2]))) + if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old @@ -1082,7 +1101,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) goto error_free; } - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de = (struct msdos_dir_entry *)bhs[0]->b_data; /* filling the new directory slots ("." and ".." entries) */ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 46a4508..23676f9 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -382,17 +382,20 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; inode->i_mtime.tv_sec = - date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date)); + date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date), + sbi->options.tz_utc); inode->i_mtime.tv_nsec = 0; if (sbi->options.isvfat) { int secs = de->ctime_cs / 100; int csecs = de->ctime_cs % 100; inode->i_ctime.tv_sec = date_dos2unix(le16_to_cpu(de->ctime), - le16_to_cpu(de->cdate)) + secs; + le16_to_cpu(de->cdate), + sbi->options.tz_utc) + secs; inode->i_ctime.tv_nsec = csecs * 10000000; inode->i_atime.tv_sec = - date_dos2unix(0, le16_to_cpu(de->adate)); + date_dos2unix(0, le16_to_cpu(de->adate), + sbi->options.tz_utc); inode->i_atime.tv_nsec = 0; } else inode->i_ctime = inode->i_atime = inode->i_mtime; @@ -591,11 +594,14 @@ retry: raw_entry->attr = fat_attr(inode); raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); - fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date); + fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, + &raw_entry->date, sbi->options.tz_utc); if (sbi->options.isvfat) { __le16 atime; - fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate); - fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate); + fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime, + &raw_entry->cdate, sbi->options.tz_utc); + fat_date_unix2dos(inode->i_atime.tv_sec, &atime, + &raw_entry->adate, sbi->options.tz_utc); raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 + inode->i_ctime.tv_nsec / 10000000; } @@ -836,6 +842,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) } if (sbi->options.flush) seq_puts(m, ",flush"); + if (opts->tz_utc) + seq_puts(m, ",tz=UTC"); return 0; } @@ -848,7 +856,7 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_flush, Opt_err, + Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, }; static match_table_t fat_tokens = { @@ -883,6 +891,7 @@ static match_table_t fat_tokens = { {Opt_obsolate, "cvf_options=%100s"}, {Opt_obsolate, "posix"}, {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, {Opt_err, NULL}, }; static match_table_t msdos_tokens = { @@ -947,10 +956,11 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->utf8 = opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; + opts->tz_utc = 0; *debug = 0; if (!options) - return 0; + goto out; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1036,6 +1046,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, case Opt_flush: opts->flush = 1; break; + case Opt_tz_utc: + opts->tz_utc = 1; + break; /* msdos specific */ case Opt_dots: @@ -1104,10 +1117,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, return -EINVAL; } } + +out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" - " for FAT filesystems, filesystem will be case sensitive!\n"); + " for FAT filesystems, filesystem will be " + "case sensitive!\n"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 61f2351..79fb98a 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -142,7 +142,7 @@ static int day_n[] = { }; /* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ -int date_dos2unix(unsigned short time, unsigned short date) +int date_dos2unix(unsigned short time, unsigned short date, int tz_utc) { int month, year, secs; @@ -156,16 +156,18 @@ int date_dos2unix(unsigned short time, unsigned short date) ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 && month < 2 ? 1 : 0)+3653); /* days since 1.1.70 plus 80's leap day */ - secs += sys_tz.tz_minuteswest*60; + if (!tz_utc) + secs += sys_tz.tz_minuteswest*60; return secs; } /* Convert linear UNIX date to a MS-DOS time/date pair. */ -void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) +void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc) { int day, year, nl_day, month; - unix_date -= sys_tz.tz_minuteswest*60; + if (!tz_utc) + unix_date -= sys_tz.tz_minuteswest*60; /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */ if (unix_date < 315532800) @@ -125,13 +125,16 @@ static int dupfd(struct file *file, unsigned int start, int cloexec) return fd; } -asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) +asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file * file, *tofree; struct files_struct * files = current->files; struct fdtable *fdt; + if ((flags & ~O_CLOEXEC) != 0) + return -EINVAL; + spin_lock(&files->file_lock); if (!(file = fcheck(oldfd))) goto out_unlock; @@ -163,7 +166,10 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) rcu_assign_pointer(fdt->fd[newfd], file); FD_SET(newfd, fdt->open_fds); - FD_CLR(newfd, fdt->close_on_exec); + if (flags & O_CLOEXEC) + FD_SET(newfd, fdt->close_on_exec); + else + FD_CLR(newfd, fdt->close_on_exec); spin_unlock(&files->file_lock); if (tofree) @@ -181,6 +187,11 @@ out_fput: goto out; } +asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) +{ + return sys_dup3(oldfd, newfd, 0); +} + asmlinkage long sys_dup(unsigned int fildes) { int ret = -EBADF; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2060bf0..51d0035 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -97,7 +97,7 @@ void fuse_invalidate_attr(struct inode *inode) * timeout is unknown (unlink, rmdir, rename and in some cases * lookup) */ -static void fuse_invalidate_entry_cache(struct dentry *entry) +void fuse_invalidate_entry_cache(struct dentry *entry) { fuse_dentry_settime(entry, 0); } @@ -112,18 +112,16 @@ static void fuse_invalidate_entry(struct dentry *entry) fuse_invalidate_entry_cache(entry); } -static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, - struct dentry *entry, +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, + u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg) { - struct fuse_conn *fc = get_fuse_conn(dir); - memset(outarg, 0, sizeof(struct fuse_entry_out)); req->in.h.opcode = FUSE_LOOKUP; - req->in.h.nodeid = get_node_id(dir); + req->in.h.nodeid = nodeid; req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; + req->in.args[0].size = name->len + 1; + req->in.args[0].value = name->name; req->out.numargs = 1; if (fc->minor < 9) req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; @@ -189,7 +187,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(req, parent->d_inode, entry, &outarg); + fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + &entry->d_name, &outarg); request_send(fc, req); dput(parent); err = req->out.h.error; @@ -225,7 +224,7 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } -static struct dentry_operations fuse_dentry_operations = { +struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; @@ -239,85 +238,127 @@ int fuse_valid_type(int m) * Add a directory inode to a dentry, ensuring that no other dentry * refers to this inode. Called with fc->inst_mutex. */ -static int fuse_d_add_directory(struct dentry *entry, struct inode *inode) +static struct dentry *fuse_d_add_directory(struct dentry *entry, + struct inode *inode) { struct dentry *alias = d_find_alias(inode); - if (alias) { + if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { /* This tries to shrink the subtree below alias */ fuse_invalidate_entry(alias); dput(alias); if (!list_empty(&inode->i_dentry)) - return -EBUSY; + return ERR_PTR(-EBUSY); + } else { + dput(alias); } - d_add(entry, inode); - return 0; + return d_splice_alias(inode, entry); } -static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode) { - int err; - struct fuse_entry_out outarg; - struct inode *inode = NULL; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_req *req; struct fuse_req *forget_req; u64 attr_version; + int err; - if (entry->d_name.len > FUSE_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + *inode = NULL; + err = -ENAMETOOLONG; + if (name->len > FUSE_NAME_MAX) + goto out; req = fuse_get_req(fc); + err = PTR_ERR(req); if (IS_ERR(req)) - return ERR_CAST(req); + goto out; forget_req = fuse_get_req(fc); + err = PTR_ERR(forget_req); if (IS_ERR(forget_req)) { fuse_put_request(fc, req); - return ERR_CAST(forget_req); + goto out; } attr_version = fuse_get_attr_version(fc); - fuse_lookup_init(req, dir, entry, &outarg); + fuse_lookup_init(fc, req, nodeid, name, outarg); request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT, but with valid timeout */ - if (!err && outarg.nodeid && - (invalid_nodeid(outarg.nodeid) || - !fuse_valid_type(outarg.attr.mode))) - err = -EIO; - if (!err && outarg.nodeid) { - inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, entry_attr_timeout(&outarg), - attr_version); - if (!inode) { - fuse_send_forget(fc, forget_req, outarg.nodeid, 1); - return ERR_PTR(-ENOMEM); - } + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (!fuse_valid_type(outarg->attr.mode)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + attr_version); + err = -ENOMEM; + if (!*inode) { + fuse_send_forget(fc, forget_req, outarg->nodeid, 1); + goto out; } + err = 0; + + out_put_forget: fuse_put_request(fc, forget_req); - if (err && err != -ENOENT) - return ERR_PTR(err); + out: + return err; +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) +{ + int err; + struct fuse_entry_out outarg; + struct inode *inode; + struct dentry *newent; + struct fuse_conn *fc = get_fuse_conn(dir); + bool outarg_valid = true; + + err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, + &outarg, &inode); + if (err == -ENOENT) { + outarg_valid = false; + err = 0; + } + if (err) + goto out_err; + + err = -EIO; + if (inode && get_node_id(inode) == FUSE_ROOT_ID) + goto out_iput; if (inode && S_ISDIR(inode->i_mode)) { mutex_lock(&fc->inst_mutex); - err = fuse_d_add_directory(entry, inode); + newent = fuse_d_add_directory(entry, inode); mutex_unlock(&fc->inst_mutex); - if (err) { - iput(inode); - return ERR_PTR(err); - } - } else - d_add(entry, inode); + err = PTR_ERR(newent); + if (IS_ERR(newent)) + goto out_iput; + } else { + newent = d_splice_alias(inode, entry); + } + entry = newent ? newent : entry; entry->d_op = &fuse_dentry_operations; - if (!err) + if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); - return NULL; + + return newent; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); } /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8092f0d..67ff2c6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1341,6 +1341,11 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) { + /* NLM needs asynchronous locks, which we don't support yet */ + return -ENOLCK; + } + /* Unlock on close is handled by the flush method */ if (fl->fl_flags & FL_CLOSE) return 0; @@ -1365,7 +1370,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (cmd == F_GETLK) { + if (cmd == F_CANCELLK) { + err = 0; + } else if (cmd == F_GETLK) { if (fc->no_lock) { posix_test_lock(file, fl); err = 0; @@ -1373,7 +1380,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) err = fuse_getlk(file, fl); } else { if (fc->no_lock) - err = posix_lock_file_wait(file, fl); + err = posix_lock_file(file, fl, NULL); else err = fuse_setlk(file, fl, 0); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bae9486..3a87607 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -363,6 +363,9 @@ struct fuse_conn { /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc : 1; + /** Filesystem supports NFS exporting. Only set in INIT */ + unsigned export_support : 1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -464,6 +467,8 @@ static inline u64 get_node_id(struct inode *inode) /** Device operations */ extern const struct file_operations fuse_dev_operations; +extern struct dentry_operations fuse_dentry_operations; + /** * Get a filled in inode */ @@ -471,6 +476,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); +int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, + struct fuse_entry_out *outarg, struct inode **inode); + /** * Send FORGET command */ @@ -604,6 +612,8 @@ void fuse_abort_conn(struct fuse_conn *fc); */ void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_entry_cache(struct dentry *entry); + /** * Acquire reference to fuse_conn */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3141690..7d2f7d6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -18,6 +18,7 @@ #include <linux/statfs.h> #include <linux/random.h> #include <linux/sched.h> +#include <linux/exportfs.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -552,6 +553,174 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) return fuse_iget(sb, 1, 0, &attr, 0, 0); } +struct fuse_inode_handle +{ + u64 nodeid; + u32 generation; +}; + +static struct dentry *fuse_get_dentry(struct super_block *sb, + struct fuse_inode_handle *handle) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct inode *inode; + struct dentry *entry; + int err = -ESTALE; + + if (handle->nodeid == 0) + goto out_err; + + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + if (!inode) { + struct fuse_entry_out outarg; + struct qstr name; + + if (!fc->export_support) + goto out_err; + + name.len = 1; + name.name = "."; + err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, + &inode); + if (err && err != -ENOENT) + goto out_err; + if (err || !inode) { + err = -ESTALE; + goto out_err; + } + err = -EIO; + if (get_node_id(inode) != handle->nodeid) + goto out_iput; + } + err = -ESTALE; + if (inode->i_generation != handle->generation) + goto out_iput; + + entry = d_alloc_anon(inode); + err = -ENOMEM; + if (!entry) + goto out_iput; + + if (get_node_id(inode) != FUSE_ROOT_ID) { + entry->d_op = &fuse_dentry_operations; + fuse_invalidate_entry_cache(entry); + } + + return entry; + + out_iput: + iput(inode); + out_err: + return ERR_PTR(err); +} + +static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + bool encode_parent = connectable && !S_ISDIR(inode->i_mode); + int len = encode_parent ? 6 : 3; + u64 nodeid; + u32 generation; + + if (*max_len < len) + return 255; + + nodeid = get_fuse_inode(inode)->nodeid; + generation = inode->i_generation; + + fh[0] = (u32)(nodeid >> 32); + fh[1] = (u32)(nodeid & 0xffffffff); + fh[2] = generation; + + if (encode_parent) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent->d_inode; + nodeid = get_fuse_inode(parent)->nodeid; + generation = parent->i_generation; + spin_unlock(&dentry->d_lock); + + fh[3] = (u32)(nodeid >> 32); + fh[4] = (u32)(nodeid & 0xffffffff); + fh[5] = generation; + } + + *max_len = len; + return encode_parent ? 0x82 : 0x81; +} + +static struct dentry *fuse_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle handle; + + if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + return NULL; + + handle.nodeid = (u64) fid->raw[0] << 32; + handle.nodeid |= (u64) fid->raw[1]; + handle.generation = fid->raw[2]; + return fuse_get_dentry(sb, &handle); +} + +static struct dentry *fuse_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct fuse_inode_handle parent; + + if (fh_type != 0x82 || fh_len < 6) + return NULL; + + parent.nodeid = (u64) fid->raw[3] << 32; + parent.nodeid |= (u64) fid->raw[4]; + parent.generation = fid->raw[5]; + return fuse_get_dentry(sb, &parent); +} + +static struct dentry *fuse_get_parent(struct dentry *child) +{ + struct inode *child_inode = child->d_inode; + struct fuse_conn *fc = get_fuse_conn(child_inode); + struct inode *inode; + struct dentry *parent; + struct fuse_entry_out outarg; + struct qstr name; + int err; + + if (!fc->export_support) + return ERR_PTR(-ESTALE); + + name.len = 2; + name.name = ".."; + err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), + &name, &outarg, &inode); + if (err && err != -ENOENT) + return ERR_PTR(err); + if (err || !inode) + return ERR_PTR(-ESTALE); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + if (get_node_id(inode) != FUSE_ROOT_ID) { + parent->d_op = &fuse_dentry_operations; + fuse_invalidate_entry_cache(parent); + } + + return parent; +} + +static const struct export_operations fuse_export_operations = { + .fh_to_dentry = fuse_fh_to_dentry, + .fh_to_parent = fuse_fh_to_parent, + .encode_fh = fuse_encode_fh, + .get_parent = fuse_get_parent, +}; + static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, @@ -581,6 +750,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_lock = 1; if (arg->flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; + if (arg->minor >= 9) { + /* LOOKUP has dependency on proto version */ + if (arg->flags & FUSE_EXPORT_SUPPORT) + fc->export_support = 1; + } if (arg->flags & FUSE_BIG_WRITES) fc->big_writes = 1; } else { @@ -607,7 +781,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | - FUSE_BIG_WRITES; + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -652,6 +826,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_export_op = &fuse_export_operations; file = fget(d.fd); if (!file) diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 24e7579..c6e9736 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -145,7 +145,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) if (!*num_bits) return 0; - down(&HFS_SB(sb)->bitmap_lock); + mutex_lock(&HFS_SB(sb)->bitmap_lock); bitmap = HFS_SB(sb)->bitmap; pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); @@ -162,7 +162,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: - up(&HFS_SB(sb)->bitmap_lock); + mutex_unlock(&HFS_SB(sb)->bitmap_lock); return pos; } @@ -205,7 +205,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; - down(&HFS_SB(sb)->bitmap_lock); + mutex_lock(&HFS_SB(sb)->bitmap_lock); /* bitmap is always on a 32-bit boundary */ curr = HFS_SB(sb)->bitmap + (start / 32); len = count; @@ -236,7 +236,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) } out: HFS_SB(sb)->free_ablocks += len; - up(&HFS_SB(sb)->bitmap_lock); + mutex_unlock(&HFS_SB(sb)->bitmap_lock); hfs_bitmap_dirty(sb); return 0; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index f6621a7..9b9d639 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -40,7 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; - init_MUTEX(&HFS_I(tree->inode)->extents_lock); + mutex_init(&HFS_I(tree->inode)->extents_lock); switch (id) { case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index c176f67..2c16316 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -343,16 +343,16 @@ int hfs_get_block(struct inode *inode, sector_t block, goto done; } - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); res = hfs_ext_read_extent(inode, ablock); if (!res) dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, ablock - HFS_I(inode)->cached_start); else { - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); return -EIO; } - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); done: map_bh(bh_result, sb, HFS_SB(sb)->fs_start + @@ -375,7 +375,7 @@ int hfs_extend_file(struct inode *inode) u32 start, len, goal; int res; - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); else { @@ -425,7 +425,7 @@ int hfs_extend_file(struct inode *inode) goto insert_extent; } out: - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); if (!res) { HFS_I(inode)->alloc_blocks += len; mark_inode_dirty(inode); @@ -487,7 +487,7 @@ void hfs_file_truncate(struct inode *inode) if (blk_cnt == alloc_cnt) goto out; - down(&HFS_I(inode)->extents_lock); + mutex_lock(&HFS_I(inode)->extents_lock); hfs_find_init(HFS_SB(sb)->ext_tree, &fd); while (1) { if (alloc_cnt == HFS_I(inode)->first_blocks) { @@ -514,7 +514,7 @@ void hfs_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - up(&HFS_I(inode)->extents_lock); + mutex_unlock(&HFS_I(inode)->extents_lock); HFS_I(inode)->alloc_blocks = blk_cnt; out: diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 147374b..9955232 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/types.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> @@ -53,7 +54,7 @@ struct hfs_inode_info { struct list_head open_dir_list; struct inode *rsrc_inode; - struct semaphore extents_lock; + struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; @@ -139,7 +140,7 @@ struct hfs_sb_info { struct nls_table *nls_io, *nls_disk; - struct semaphore bitmap_lock; + struct mutex bitmap_lock; unsigned long flags; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 97f8446..dc4ec64 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -150,7 +150,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) if (!inode) return NULL; - init_MUTEX(&HFS_I(inode)->extents_lock); + mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); inode->i_ino = HFS_SB(sb)->next_id++; @@ -281,7 +281,7 @@ static int hfs_read_inode(struct inode *inode, void *data) HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; - init_MUTEX(&HFS_I(inode)->extents_lock); + mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); /* Initialize the inode */ diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8cf6797..ac2ec5e 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -372,7 +372,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_flags |= MS_NODIRATIME; - init_MUTEX(&sbi->bitmap_lock); + mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); if (res) { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 12e899c..fec8f61 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -199,16 +199,16 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - HFSPLUS_I(inode).cached_start); } else { - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); return -EIO; } - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); done: dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); @@ -355,7 +355,7 @@ int hfsplus_file_extend(struct inode *inode) return -ENOSPC; } - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); else { @@ -408,7 +408,7 @@ int hfsplus_file_extend(struct inode *inode) goto insert_extent; } out: - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); if (!res) { HFSPLUS_I(inode).alloc_blocks += len; mark_inode_dirty(inode); @@ -465,7 +465,7 @@ void hfsplus_file_truncate(struct inode *inode) if (blk_cnt == alloc_cnt) goto out; - down(&HFSPLUS_I(inode).extents_lock); + mutex_lock(&HFSPLUS_I(inode).extents_lock); hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); while (1) { if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { @@ -492,7 +492,7 @@ void hfsplus_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - up(&HFSPLUS_I(inode).extents_lock); + mutex_unlock(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).alloc_blocks = blk_cnt; out: diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e59537..f027a90 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,6 +11,7 @@ #define _LINUX_HFSPLUS_FS_H #include <linux/fs.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> #include "hfsplus_raw.h" @@ -154,7 +155,7 @@ struct hfsplus_sb_info { struct hfsplus_inode_info { - struct semaphore extents_lock; + struct mutex extents_lock; u32 clump_blocks, alloc_blocks; sector_t fs_blocks; /* Allocation extents from catalog record or volume header */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 67e1c8b..cc3b5e2 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -163,7 +163,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent inode->i_ino = dir->i_ino; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); @@ -316,7 +316,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); atomic_set(&HFSPLUS_I(inode).opencnt, 0); HFSPLUS_I(inode).flags = 0; memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ce97a54..3859118 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -34,7 +34,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) return inode; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); - init_MUTEX(&HFSPLUS_I(inode).extents_lock); + mutex_init(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = 0; HFSPLUS_I(inode).rsrc_inode = NULL; atomic_set(&HFSPLUS_I(inode).opencnt, 0); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index aeabf80..dbd01d2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -53,6 +53,7 @@ int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, + Opt_pagesize, Opt_err, }; @@ -62,6 +63,7 @@ static match_table_t tokens = { {Opt_mode, "mode=%o"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, + {Opt_pagesize, "pagesize=%s"}, {Opt_err, NULL}, }; @@ -80,6 +82,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_path.dentry->d_inode; loff_t len, vma_len; int ret; + struct hstate *h = hstate_file(file); /* * vma address alignment (but not the pgoff alignment) has @@ -92,7 +95,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; - if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) + if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); @@ -103,9 +106,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (vma->vm_flags & VM_MAYSHARE && - hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), - len >> HPAGE_SHIFT)) + if (hugetlb_reserve_pages(inode, + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma)) goto out; ret = 0; @@ -130,20 +133,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; + struct hstate *h = hstate_file(file); - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -156,7 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, start_addr = TASK_UNMAPPED_BASE; full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ @@ -174,7 +178,7 @@ full_search: if (!vma || addr + len <= vma->vm_start) return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } #endif @@ -225,10 +229,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { + struct hstate *h = hstate_file(filp); struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> HPAGE_SHIFT; - unsigned long offset = *ppos & ~HPAGE_MASK; + unsigned long index = *ppos >> huge_page_shift(h); + unsigned long offset = *ppos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; @@ -243,17 +248,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, if (!isize) goto out; - end_index = (isize - 1) >> HPAGE_SHIFT; + end_index = (isize - 1) >> huge_page_shift(h); for (;;) { struct page *page; - int nr, ret; + unsigned long nr, ret; /* nr is the maximum number of bytes to copy from this page */ - nr = HPAGE_SIZE; + nr = huge_page_size(h); if (index >= end_index) { if (index > end_index) goto out; - nr = ((isize - 1) & ~HPAGE_MASK) + 1; + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) { goto out; } @@ -287,8 +292,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, offset += ret; retval += ret; len -= ret; - index += offset >> HPAGE_SHIFT; - offset &= ~HPAGE_MASK; + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); if (page) page_cache_release(page); @@ -298,7 +303,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, break; } out: - *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; + *ppos = ((loff_t)index << huge_page_shift(h)) + offset; mutex_unlock(&inode->i_mutex); return retval; } @@ -339,8 +344,9 @@ static void truncate_huge_page(struct page *page) static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> HPAGE_SHIFT; + const pgoff_t start = lstart >> huge_page_shift(h); struct pagevec pvec; pgoff_t next; int i, freed = 0; @@ -441,7 +447,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) v_offset = 0; __unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end); + vma->vm_start + v_offset, vma->vm_end, NULL); } } @@ -449,8 +455,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); - BUG_ON(offset & ~HPAGE_MASK); + BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); @@ -465,6 +472,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -476,7 +484,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) { error = -EINVAL; - if (!(attr->ia_size & ~HPAGE_MASK)) + if (!(attr->ia_size & ~huge_page_mask(h))) error = hugetlb_vmtruncate(inode, attr->ia_size); if (error) goto out; @@ -610,9 +618,10 @@ static int hugetlbfs_set_page_dirty(struct page *page) static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(dentry->d_inode); buf->f_type = HUGETLBFS_MAGIC; - buf->f_bsize = HPAGE_SIZE; + buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 for max/free/used @@ -743,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; + unsigned long long size = 0; + enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; if (!options) return 0; @@ -773,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; case Opt_size: { - unsigned long long size; /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; size = memparse(args[0].from, &rest); - if (*rest == '%') { - size <<= HPAGE_SHIFT; - size *= max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> HPAGE_SHIFT); + setsize = SIZE_STD; + if (*rest == '%') + setsize = SIZE_PERCENT; break; } @@ -794,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) pconfig->nr_inodes = memparse(args[0].from, &rest); break; + case Opt_pagesize: { + unsigned long ps; + ps = memparse(args[0].from, &rest); + pconfig->hstate = size_to_hstate(ps); + if (!pconfig->hstate) { + printk(KERN_ERR + "hugetlbfs: Unsupported page size %lu MB\n", + ps >> 20); + return -EINVAL; + } + break; + } + default: printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", p); @@ -801,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; } } + + /* Do size after hstate is set up */ + if (setsize > NO_SIZE) { + struct hstate *h = pconfig->hstate; + if (setsize == SIZE_PERCENT) { + size <<= huge_page_shift(h); + size *= h->max_huge_pages; + do_div(size, 100); + } + pconfig->nr_blocks = (size >> huge_page_shift(h)); + } + return 0; bad_val: @@ -825,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) config.uid = current->fsuid; config.gid = current->fsgid; config.mode = 0755; + config.hstate = &default_hstate; ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -833,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; + sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); sbinfo->max_blocks = config.nr_blocks; sbinfo->free_blocks = config.nr_blocks; sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = HPAGE_SIZE; - sb->s_blocksize_bits = HPAGE_SHIFT; + sb->s_blocksize = huge_page_size(config.hstate); + sb->s_blocksize_bits = huge_page_shift(config.hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; @@ -942,7 +976,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) goto out_dentry; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL)) goto out_inode; d_instantiate(dentry, inode); diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 6676c06..fe79c25d 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -566,7 +566,7 @@ static const struct inotify_operations inotify_user_ops = { .destroy_watch = free_inotify_user_watch, }; -asmlinkage long sys_inotify_init(void) +asmlinkage long sys_inotify_init1(int flags) { struct inotify_device *dev; struct inotify_handle *ih; @@ -574,7 +574,14 @@ asmlinkage long sys_inotify_init(void) struct file *filp; int fd, ret; - fd = get_unused_fd(); + /* Check the IN_* constants for consistency. */ + BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); + + if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) + return -EINVAL; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; @@ -610,7 +617,7 @@ asmlinkage long sys_inotify_init(void) filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; - filp->f_flags = O_RDONLY; + filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); filp->private_data = dev; INIT_LIST_HEAD(&dev->events); @@ -638,6 +645,11 @@ out_put_fd: return ret; } +asmlinkage long sys_inotify_init(void) +{ + return sys_inotify_init1(0); +} + asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) { struct inode *inode; diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 6bd48f0..c2fb2dd 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -209,6 +209,11 @@ repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); @@ -216,8 +221,12 @@ repeat: goto eio; rs.chr += rr->len; rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ if (rs.len < 0) - goto eio; /* corrupted isofs */ + goto out; /* Something got screwed up here */ switch (sig) { case SIG('R', 'R'): @@ -307,6 +316,11 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; + /* + * Ignore rock ridge info if rr->len is out of range, but + * don't return -EIO because that would make the file + * invisible. + */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); @@ -314,8 +328,12 @@ repeat: goto eio; rs.chr += rr->len; rs.len -= rr->len; + /* + * As above, just ignore the rock ridge info if rr->len + * is bogus. + */ if (rs.len < 0) - goto eio; /* corrupted isofs */ + goto out; /* Something got screwed up here */ switch (sig) { #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 5a8ca61..2eccbfa 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) /* * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * not successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -78,6 +78,19 @@ nope: } /* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. + */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + release_buffer_page(bh); + } else + put_bh(bh); +} + +/* * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is * held. For ranking reasons we must trylock. If we lose, schedule away and * return 0. j_list_lock is dropped in this case. @@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) /* * Submit all the data buffers to disk */ -static void journal_submit_data_buffers(journal_t *journal, +static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction) { struct journal_head *jh; @@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal, int locked; int bufs = 0; struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; /* * Whenever we unlock the journal and sleep, things can get added @@ -231,7 +245,7 @@ write_out_data: if (locked) unlock_buffer(bh); BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); + release_data_buffer(bh); continue; } if (locked && test_clear_buffer_dirty(bh)) { @@ -253,15 +267,17 @@ write_out_data: put_bh(bh); } else { BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); journal_remove_journal_head(bh); - /* Once for our safety reference, once for + /* One for our safety reference, other for * journal_remove_journal_head() */ put_bh(bh); - put_bh(bh); + release_data_buffer(bh); } if (need_resched() || spin_needbreak(&journal->j_list_lock)) { @@ -271,6 +287,8 @@ write_out_data: } spin_unlock(&journal->j_list_lock); journal_do_submit_data(wbuf, bufs); + + return err; } /* @@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction); /* * Wait for all previously submitted IO to complete. @@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal) if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; spin_lock(&journal->j_list_lock); } + if (unlikely(!buffer_uptodate(bh))) { + if (TestSetPageLocked(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); @@ -443,17 +471,21 @@ void journal_commit_transaction(journal_t *journal) } else { jbd_unlock_bh_state(bh); } - put_bh(bh); + release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); - if (err) - journal_abort(journal, err); + if (err) { + char b[BDEVNAME_SIZE]; - journal_write_revoke_records(journal, commit_transaction); + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + err = 0; + } - jbd_debug(3, "JBD: commit phase 2\n"); + journal_write_revoke_records(journal, commit_transaction); /* * If we found any dirty or locked buffers, then we should have diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b99c3b3..aa7143a 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features); EXPORT_SYMBOL(journal_create); EXPORT_SYMBOL(journal_load); EXPORT_SYMBOL(journal_destroy); -EXPORT_SYMBOL(journal_update_superblock); EXPORT_SYMBOL(journal_abort); EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); @@ -1636,9 +1635,10 @@ static int journal_init_journal_head_cache(void) static void journal_destroy_journal_head_cache(void) { - J_ASSERT(journal_head_cache != NULL); - kmem_cache_destroy(journal_head_cache); - journal_head_cache = NULL; + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } } /* diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 1bb43e9..c7bd649 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } +void journal_destroy_revoke_caches(void) +{ + if (revoke_record_cache) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + } + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } +} + int __init journal_init_revoke_caches(void) { + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); + revoke_record_cache = kmem_cache_create("revoke_record", sizeof(struct jbd_revoke_record_s), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); if (!revoke_record_cache) - return -ENOMEM; + goto record_cache_failure; revoke_table_cache = kmem_cache_create("revoke_table", sizeof(struct jbd_revoke_table_s), 0, SLAB_TEMPORARY, NULL); - if (!revoke_table_cache) { - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - return -ENOMEM; - } + if (!revoke_table_cache) + goto table_cache_failure; + return 0; -} -void journal_destroy_revoke_caches(void) -{ - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - kmem_cache_destroy(revoke_table_cache); - revoke_table_cache = NULL; +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; } -/* Initialise the revoke table for a given journal to a given size. */ - -int journal_init_revoke(journal_t *journal, int hash_size) +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift, tmp; + int shift = 0; + int tmp = hash_size; + struct jbd_revoke_table_s *table; - J_ASSERT (journal->j_revoke_table[0] == NULL); + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; - shift = 0; - tmp = hash_size; while((tmp >>= 1UL) != 0UL) shift++; - journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[0]) - return -ENOMEM; - journal->j_revoke = journal->j_revoke_table[0]; - - /* Check that the hash_size is a power of two */ - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; - - journal->j_revoke->hash_table = + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - journal->j_revoke = NULL; - return -ENOMEM; + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; } for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + INIT_LIST_HEAD(&table->hash_table[tmp]); - journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[1]) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - return -ENOMEM; +out: + return table; +} + +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); } - journal->j_revoke = journal->j_revoke_table[1]; + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} - /* Check that the hash_size is a power of two */ +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); - journal->j_revoke->hash_size = hash_size; + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; - journal->j_revoke->hash_shift = shift; + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; - journal->j_revoke->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); - journal->j_revoke = NULL; - return -ENOMEM; - } - - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; -} -/* Destoy a journal's revoke table. The table must already be empty! */ +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} +/* Destroy a journal's revoke table. The table must already be empty! */ void journal_destroy_revoke(journal_t *journal) { - struct jbd_revoke_table_s *table; - struct list_head *hash_list; - int i; - - table = journal->j_revoke_table[0]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); - journal->j_revoke = NULL; - - table = journal->j_revoke_table[1]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 67ff202..8dee320 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1648,12 +1648,42 @@ out: return; } +/* + * journal_try_to_free_buffers() could race with journal_commit_transaction() + * The latter might still hold the a count on buffers when inspecting + * them on t_syncdata_list or t_locked_list. + * + * journal_try_to_free_buffers() will call this function to + * wait for the current transaction to finish syncing data buffers, before + * tryinf to free that buffer. + * + * Called with journal->j_state_lock held. + */ +static void journal_wait_for_transaction_sync_data(journal_t *journal) +{ + transaction_t *transaction = NULL; + tid_t tid; + + spin_lock(&journal->j_state_lock); + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return; + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); +} /** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. * * * For all the buffers on this page, @@ -1682,9 +1712,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + + /* + * There are a number of places where journal_try_to_free_buffers() + * could race with journal_commit_transaction(), the later still + * holds the reference to the buffers to free while processing them. + * try_to_free_buffers() failed to free those buffers. Some of the + * caller of releasepage() request page buffers to be dropped, otherwise + * treat the fail-to-free as errors (such as generic_file_direct_IO()) + * + * So, if the caller of try_to_release_page() wants the synchronous + * behaviour(i.e make sure buffers are dropped upon return), + * let's wait for the current transaction to finish flush of + * dirty data buffers, then try to free those buffers again, + * with the journal locked. + */ + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { + journal_wait_for_transaction_sync_data(journal); + ret = try_to_free_buffers(page); + } + busy: return ret; } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0288e6d..359c091 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -22,6 +22,7 @@ #include <linux/parser.h> #include <linux/completion.h> #include <linux/vfs.h> +#include <linux/quotaops.h> #include <linux/mount.h> #include <linux/moduleparam.h> #include <linux/kthread.h> diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 1f6dc51..31668b6 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -582,7 +582,15 @@ again: } if (status < 0) goto out_unlock; - status = nlm_stat_to_errno(resp->status); + /* + * EAGAIN doesn't make sense for sleeping locks, and in some + * cases NLM_LCK_DENIED is returned for a permanent error. So + * turn it into an ENOLCK. + */ + if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) + status = -ENOLCK; + else + status = nlm_stat_to_errno(resp->status); out_unblock: nlmclnt_finish_block(block); out: diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 821b9ac..cf0d5c2c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -418,8 +418,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EAGAIN: ret = nlm_lck_denied; - break; - case -EINPROGRESS: + goto out; + case FILE_LOCK_DEFERRED: if (wait) break; /* Filesystem lock operation is in progress @@ -434,10 +434,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ @@ -507,7 +503,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, } error = vfs_test_lock(file->f_file, &lock->fl); - if (error == -EINPROGRESS) { + if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); goto out; } @@ -731,8 +727,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) switch (error) { case 0: break; - case -EAGAIN: - case -EINPROGRESS: + case FILE_LOCK_DEFERRED: dprintk("lockd: lock still blocked error %d\n", error); nlmsvc_insert_block(block, NLM_NEVER); nlmsvc_release_block(block); @@ -779,8 +779,10 @@ find_conflict: if (!flock_locks_conflict(request, fl)) continue; error = -EAGAIN; - if (request->fl_flags & FL_SLEEP) - locks_insert_block(fl, request); + if (!(request->fl_flags & FL_SLEEP)) + goto out; + error = FILE_LOCK_DEFERRED; + locks_insert_block(fl, request); goto out; } if (request->fl_flags & FL_ACCESS) @@ -836,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str error = -EDEADLK; if (posix_locks_deadlock(request, fl)) goto out; - error = -EAGAIN; + error = FILE_LOCK_DEFERRED; locks_insert_block(fl, request); goto out; } @@ -1035,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) might_sleep (); for (;;) { error = posix_lock_file(filp, fl, NULL); - if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); if (!error) @@ -1107,9 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, for (;;) { error = __posix_lock_file(inode, &fl, NULL); - if (error != -EAGAIN) - break; - if (!(fl.fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); if (!error) { @@ -1531,7 +1531,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) might_sleep(); for (;;) { error = flock_lock_file(filp, fl); - if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) + if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); if (!error) @@ -1716,17 +1716,17 @@ out: * fl_grant is set. Callers expecting ->lock() to return asynchronously * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return -EINPROGRESS, and call ->fl_grant() when the lock + * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock * request completes. * If the request is for non-blocking lock the file system should return - * -EINPROGRESS then try to get the lock and call the callback routine with - * the result. If the request timed out the callback routine will return a + * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine + * with the result. If the request timed out the callback routine will return a * nonzero return code and the file system should release the lock. The file * system is also responsible to keep a corresponding posix lock when it * grants a lock so the VFS can find out which locks are locally held and do * the correct lock cleanup when required. * The underlying filesystem must not drop the kernel lock or call - * ->fl_grant() before returning to the caller with a -EINPROGRESS + * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED * return code. */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) @@ -1738,6 +1738,30 @@ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, str } EXPORT_SYMBOL_GPL(vfs_lock_file); +static int do_lock_file_wait(struct file *filp, unsigned int cmd, + struct file_lock *fl) +{ + int error; + + error = security_file_lock(filp, fl->fl_type); + if (error) + return error; + + for (;;) { + error = vfs_lock_file(filp, cmd, fl, NULL); + if (error != FILE_LOCK_DEFERRED) + break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); + if (!error) + continue; + + locks_delete_block(fl); + break; + } + + return error; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -1795,26 +1819,7 @@ again: goto out; } - error = security_file_lock(filp, file_lock->fl_type); - if (error) - goto out; - - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, file_lock); - else { - for (;;) { - error = posix_lock_file(filp, file_lock, NULL); - if (error != -EAGAIN || cmd == F_SETLK) - break; - error = wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; - - locks_delete_block(file_lock); - break; - } - } + error = do_lock_file_wait(filp, cmd, file_lock); /* * Attempt to detect a close/fcntl race and recover by @@ -1932,26 +1937,7 @@ again: goto out; } - error = security_file_lock(filp, file_lock->fl_type); - if (error) - goto out; - - if (filp->f_op && filp->f_op->lock != NULL) - error = filp->f_op->lock(filp, cmd, file_lock); - else { - for (;;) { - error = posix_lock_file(filp, file_lock, NULL); - if (error != -EAGAIN || cmd == F_SETLK64) - break; - error = wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; - - locks_delete_block(file_lock); - break; - } - } + error = do_lock_file_wait(filp, cmd, file_lock); /* * Attempt to detect a close/fcntl race and recover by diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 84f6242..523d737 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -256,9 +256,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) if (!s->s_root) goto out_iput; - if (!NO_TRUNCATE) - s->s_root->d_op = &minix_dentry_operations; - if (!(s->s_flags & MS_RDONLY)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ ms->s_state &= ~MINIX_VALID_FS; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 326edfe..e6a0b19 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -2,11 +2,6 @@ #include <linux/pagemap.h> #include <linux/minix_fs.h> -/* - * change the define below to 0 if you want names > info->s_namelen chars to be - * truncated. Else they will be disallowed (ENAMETOOLONG). - */ -#define NO_TRUNCATE 1 #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version #define MINIX_V1 0x0001 /* original minix fs */ #define MINIX_V2 0x0002 /* minix V2 fs */ @@ -83,7 +78,6 @@ extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; extern const struct file_operations minix_file_operations; extern const struct file_operations minix_dir_operations; -extern struct dentry_operations minix_dentry_operations; static inline struct minix_sb_info *minix_sb(struct super_block *sb) { diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 102241b..32b131c 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -18,30 +18,6 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static int minix_hash(struct dentry *dentry, struct qstr *qstr) -{ - unsigned long hash; - int i; - const unsigned char *name; - - i = minix_sb(dentry->d_inode->i_sb)->s_namelen; - if (i >= qstr->len) - return 0; - /* Truncate the name in place, avoids having to define a compare - function. */ - qstr->len = i; - name = qstr->name; - hash = init_name_hash(); - while (i--) - hash = partial_name_hash(*name++, hash); - qstr->hash = end_name_hash(hash); - return 0; -} - -struct dentry_operations minix_dentry_operations = { - .d_hash = minix_hash, -}; - static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode = NULL; diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 1f7f295..e844b98 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -14,12 +14,7 @@ /* Characters that are undesirable in an MS-DOS file name */ static unsigned char bad_chars[] = "*?<>|\""; -static unsigned char bad_if_strict_pc[] = "+=,; "; -/* GEMDOS is less restrictive */ -static unsigned char bad_if_strict_atari[] = " "; - -#define bad_if_strict(opts) \ - ((opts)->atari ? bad_if_strict_atari : bad_if_strict_pc) +static unsigned char bad_if_strict[] = "+=,; "; /***** Formats an MS-DOS file name. Rejects invalid names. */ static int msdos_format_name(const unsigned char *name, int len, @@ -40,21 +35,20 @@ static int msdos_format_name(const unsigned char *name, int len, /* Get rid of dot - test for it elsewhere */ name++; len--; - } else if (!opts->atari) + } else return -EINVAL; } /* - * disallow names that _really_ start with a dot for MS-DOS, - * GEMDOS does not care + * disallow names that _really_ start with a dot */ - space = !opts->atari; + space = 1; c = 0; for (walk = res; len && walk - res < 8; walk++) { c = *name++; len--; if (opts->name_check != 'r' && strchr(bad_chars, c)) return -EINVAL; - if (opts->name_check == 's' && strchr(bad_if_strict(opts), c)) + if (opts->name_check == 's' && strchr(bad_if_strict, c)) return -EINVAL; if (c >= 'A' && c <= 'Z' && opts->name_check == 's') return -EINVAL; @@ -94,7 +88,7 @@ static int msdos_format_name(const unsigned char *name, int len, if (opts->name_check != 'r' && strchr(bad_chars, c)) return -EINVAL; if (opts->name_check == 's' && - strchr(bad_if_strict(opts), c)) + strchr(bad_if_strict, c)) return -EINVAL; if (c < ' ' || c == ':' || c == '\\') return -EINVAL; @@ -243,6 +237,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, int is_dir, int is_hid, int cluster, struct timespec *ts, struct fat_slot_info *sinfo) { + struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); struct msdos_dir_entry de; __le16 time, date; int err; @@ -252,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (is_hid) de.attr |= ATTR_HIDDEN; de.lcase = 0; - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de.cdate = de.adate = 0; de.ctime = 0; de.ctime_cs = 0; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 46763d1c..8478fc2 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -127,7 +127,7 @@ enum { Opt_err }; -static match_table_t __initdata tokens = { +static match_table_t __initconst tokens = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 6b6225a..15c6fae 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -19,6 +19,13 @@ #define NFSDDBG_FACILITY NFSDDBG_LOCKD +#ifdef CONFIG_LOCKD_V4 +#define nlm_stale_fh nlm4_stale_fh +#define nlm_failed nlm4_failed +#else +#define nlm_stale_fh nlm_lck_denied_nolocks +#define nlm_failed nlm_lck_denied_nolocks +#endif /* * Note: we hold the dentry use count while the file is open. */ @@ -47,12 +54,10 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) return 0; case nfserr_dropit: return nlm_drop_reply; -#ifdef CONFIG_LOCKD_V4 case nfserr_stale: - return nlm4_stale_fh; -#endif + return nlm_stale_fh; default: - return nlm_lck_denied; + return nlm_failed; } } @@ -64,7 +64,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) memcpy(buf, &st, sizeof(st)); else { if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail) & + if ((st.f_blocks | st.f_bfree | st.f_bavail | + st.f_bsize | st.f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6149e4b..7d6b34e 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -344,18 +344,18 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); -void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) +int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) { struct hd_struct *p; int err; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return; + return -ENOMEM; if (!init_part_stats(p)) { - kfree(p); - return; + err = -ENOMEM; + goto out0; } p->start_sect = start; p->nr_sects = len; @@ -378,15 +378,31 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, /* delay uevent until 'holders' subdir is created */ p->dev.uevent_suppress = 1; - device_add(&p->dev); + err = device_add(&p->dev); + if (err) + goto out1; partition_sysfs_add_subdir(p); p->dev.uevent_suppress = 0; - if (flags & ADDPART_FLAG_WHOLEDISK) + if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(&p->dev, &dev_attr_whole_disk); + if (err) + goto out2; + } /* suppress uevent if the disk supresses it */ if (!disk->dev.uevent_suppress) kobject_uevent(&p->dev.kobj, KOBJ_ADD); + + return 0; + +out2: + device_del(&p->dev); +out1: + put_device(&p->dev); + free_part_stats(p); +out0: + kfree(p); + return err; } /* Not exported, helper to add_disk(). */ @@ -401,7 +417,7 @@ void register_disk(struct gendisk *disk) disk->dev.parent = disk->driverfs_dev; disk->dev.devt = MKDEV(disk->major, disk->first_minor); - strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN); + strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); /* ewww... some of these buggers have / in the name... */ s = strchr(disk->dev.bus_id, '/'); if (s) @@ -483,10 +499,16 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) if (!size) continue; if (from + size > get_capacity(disk)) { - printk(" %s: p%d exceeds device capacity\n", + printk(KERN_ERR " %s: p%d exceeds device capacity\n", disk->disk_name, p); + continue; + } + res = add_partition(disk, p, from, size, state->parts[p].flags); + if (res) { + printk(KERN_ERR " %s: p%d could not be added: %d\n", + disk->disk_name, p, -res); + continue; } - add_partition(disk, p, from, size, state->parts[p].flags); #ifdef CONFIG_BLK_DEV_MD if (state->parts[p].flags & ADDPART_FLAG_RAID) md_autodetect_dev(bdev->bd_dev+p); diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index e7b0700..038a602 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -95,13 +95,6 @@ #include "check.h" #include "efi.h" -#undef EFI_DEBUG -#ifdef EFI_DEBUG -#define Dprintk(x...) printk(KERN_DEBUG x) -#else -#define Dprintk(x...) -#endif - /* This allows a kernel command line option 'gpt' to override * the test for invalid PMBR. Not __initdata because reloading * the partition tables happens after init too. @@ -305,10 +298,10 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the GUID Partition Table signature */ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { - Dprintk("GUID Partition Table Header signature is wrong:" - "%lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->signature), - (unsigned long long)GPT_HEADER_SIGNATURE); + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); goto fail; } @@ -318,9 +311,8 @@ is_gpt_valid(struct block_device *bdev, u64 lba, crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); if (crc != origcrc) { - Dprintk - ("GUID Partition Table Header CRC is wrong: %x != %x\n", - crc, origcrc); + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); goto fail; } (*gpt)->header_crc32 = cpu_to_le32(origcrc); @@ -328,9 +320,9 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check that the my_lba entry points to the LBA that contains * the GUID Partition Table */ if (le64_to_cpu((*gpt)->my_lba) != lba) { - Dprintk("GPT my_lba incorrect: %lld != %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->my_lba), - (unsigned long long)lba); + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); goto fail; } @@ -339,15 +331,15 @@ is_gpt_valid(struct block_device *bdev, u64 lba, */ lastlba = last_lba(bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { - Dprintk("GPT: first_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), - (unsigned long long)lastlba); + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); goto fail; } if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { - Dprintk("GPT: last_usable_lba incorrect: %lld > %lld\n", - (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), - (unsigned long long)lastlba); + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); goto fail; } @@ -360,7 +352,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, le32_to_cpu((*gpt)->sizeof_partition_entry)); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { - Dprintk("GUID Partitition Entry Array CRC check failed.\n"); + pr_debug("GUID Partitition Entry Array CRC check failed.\n"); goto fail_ptes; } @@ -616,7 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - Dprintk("GUID Partition Table is valid! Yea!\n"); + pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { if (!is_pte_valid(&ptes[i], last_lba(bdev))) diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 0fdda2e..8652fb9 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -133,17 +133,17 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) bool is_vista = false; BUG_ON(!data || !ph); - if (MAGIC_PRIVHEAD != BE64(data)) { + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { ldm_error("Cannot find PRIVHEAD structure. LDM database is" " corrupt. Aborting."); return false; } - ph->ver_major = BE16(data + 0x000C); - ph->ver_minor = BE16(data + 0x000E); - ph->logical_disk_start = BE64(data + 0x011B); - ph->logical_disk_size = BE64(data + 0x0123); - ph->config_start = BE64(data + 0x012B); - ph->config_size = BE64(data + 0x0133); + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ if (ph->ver_major == 2 && ph->ver_minor == 12) is_vista = true; @@ -191,14 +191,14 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) { BUG_ON (!data || !toc); - if (MAGIC_TOCBLOCK != BE64 (data)) { + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; - toc->bitmap1_start = BE64 (data + 0x2E); - toc->bitmap1_size = BE64 (data + 0x36); + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); if (strncmp (toc->bitmap1_name, TOC_BITMAP1, sizeof (toc->bitmap1_name)) != 0) { @@ -208,8 +208,8 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) } strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; - toc->bitmap2_start = BE64 (data + 0x50); - toc->bitmap2_size = BE64 (data + 0x58); + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, sizeof (toc->bitmap2_name)) != 0) { ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", @@ -237,22 +237,22 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) { BUG_ON (!data || !vm); - if (MAGIC_VMDB != BE32 (data)) { + if (MAGIC_VMDB != get_unaligned_be32(data)) { ldm_crit ("Cannot find the VMDB, database may be corrupt."); return false; } - vm->ver_major = BE16 (data + 0x12); - vm->ver_minor = BE16 (data + 0x14); + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { ldm_error ("Expected VMDB version %d.%d, got %d.%d. " "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); return false; } - vm->vblk_size = BE32 (data + 0x08); - vm->vblk_offset = BE32 (data + 0x0C); - vm->last_vblk_seq = BE32 (data + 0x04); + vm->vblk_size = get_unaligned_be32(data + 0x08); + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); ldm_debug ("Parsed VMDB successfully."); return true; @@ -507,7 +507,7 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, goto out; /* Already logged */ /* Are there uncommitted transactions? */ - if (BE16(data + 0x10) != 0x01) { + if (get_unaligned_be16(data + 0x10) != 0x01) { ldm_crit ("Database is not in a consistent state. Aborting."); goto out; } @@ -802,7 +802,7 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_CMP3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; comp = &vb->vblk.comp; @@ -851,7 +851,7 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DGR3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; @@ -895,7 +895,7 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DGR4; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; @@ -931,7 +931,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DSK3; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; @@ -968,7 +968,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) return false; len += VBLK_SIZE_DSK4; - if (len != BE32 (buffer + 0x14)) + if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; @@ -1034,14 +1034,14 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) return false; } len += VBLK_SIZE_PRT3; - if (len > BE32(buffer + 0x14)) { + if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, - BE32(buffer + 0x14)); + get_unaligned_be32(buffer + 0x14)); return false; } part = &vb->vblk.part; - part->start = BE64(buffer + 0x24 + r_name); - part->volume_offset = BE64(buffer + 0x2C + r_name); + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); part->size = ldm_get_vnum(buffer + 0x34 + r_name); part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); @@ -1139,9 +1139,9 @@ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) return false; } len += VBLK_SIZE_VOL5; - if (len > BE32(buffer + 0x14)) { + if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, - BE32(buffer + 0x14)); + get_unaligned_be32(buffer + 0x14)); return false; } volu = &vb->vblk.volu; @@ -1294,9 +1294,9 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) BUG_ON (!data || !frags); - group = BE32 (data + 0x08); - rec = BE16 (data + 0x0C); - num = BE16 (data + 0x0E); + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); if ((num < 1) || (num > 4)) { ldm_error ("A VBLK claims to have %d parts.", num); return false; @@ -1425,12 +1425,12 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, } for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ - if (MAGIC_VBLK != BE32 (data)) { + if (MAGIC_VBLK != get_unaligned_be32(data)) { ldm_error ("Expected to find a VBLK."); goto out; } - recs = BE16 (data + 0x0E); /* Number of records */ + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ if (recs == 1) { if (!ldm_ldmdb_add (data, size, ldb)) goto out; /* Already logged */ diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 80f63b5..30e08e8 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -98,11 +98,6 @@ struct parsed_partitions; #define TOC_BITMAP1 "config" /* Names of the two defined */ #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ -/* Most numbers we deal with are big-endian and won't be aligned. */ -#define BE16(x) ((u16)be16_to_cpu(get_unaligned((__be16*)(x)))) -#define BE32(x) ((u32)be32_to_cpu(get_unaligned((__be32*)(x)))) -#define BE64(x) ((u64)be64_to_cpu(get_unaligned((__be64*)(x)))) - /* Borrowed from msdos.c */ #define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) @@ -950,7 +950,7 @@ fail_inode: return NULL; } -struct file *create_write_pipe(void) +struct file *create_write_pipe(int flags) { int err; struct inode *inode; @@ -983,7 +983,7 @@ struct file *create_write_pipe(void) goto err_dentry; f->f_mapping = inode->i_mapping; - f->f_flags = O_WRONLY; + f->f_flags = O_WRONLY | (flags & O_NONBLOCK); f->f_version = 0; return f; @@ -1007,7 +1007,7 @@ void free_write_pipe(struct file *f) put_filp(f); } -struct file *create_read_pipe(struct file *wrf) +struct file *create_read_pipe(struct file *wrf, int flags) { struct file *f = get_empty_filp(); if (!f) @@ -1019,7 +1019,7 @@ struct file *create_read_pipe(struct file *wrf) f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; f->f_pos = 0; - f->f_flags = O_RDONLY; + f->f_flags = O_RDONLY | (flags & O_NONBLOCK); f->f_op = &read_pipe_fops; f->f_mode = FMODE_READ; f->f_version = 0; @@ -1027,26 +1027,29 @@ struct file *create_read_pipe(struct file *wrf) return f; } -int do_pipe(int *fd) +int do_pipe_flags(int *fd, int flags) { struct file *fw, *fr; int error; int fdw, fdr; - fw = create_write_pipe(); + if (flags & ~(O_CLOEXEC | O_NONBLOCK)) + return -EINVAL; + + fw = create_write_pipe(flags); if (IS_ERR(fw)) return PTR_ERR(fw); - fr = create_read_pipe(fw); + fr = create_read_pipe(fw, flags); error = PTR_ERR(fr); if (IS_ERR(fr)) goto err_write_pipe; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; - error = get_unused_fd(); + error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; @@ -1074,16 +1077,21 @@ int do_pipe(int *fd) return error; } +int do_pipe(int *fd) +{ + return do_pipe_flags(fd, 0); +} + /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -asmlinkage long __weak sys_pipe(int __user *fildes) +asmlinkage long __weak sys_pipe2(int __user *fildes, int flags) { int fd[2]; int error; - error = do_pipe(fd); + error = do_pipe_flags(fd, flags); if (!error) { if (copy_to_user(fildes, fd, sizeof(fd))) { sys_close(fd[0]); @@ -1094,6 +1102,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes) return error; } +asmlinkage long __weak sys_pipe(int __user *fildes) +{ + return sys_pipe2(fildes, 0); +} + /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 0000000..73cd7a4 --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,59 @@ +config PROC_FS + bool "/proc file system support" if EMBEDDED + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + +config PROC_VMCORE + bool "/proc/vmcore support (EXPERIMENTAL)" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EMBEDDED + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in <file:Documentation/sysctl/>. Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. diff --git a/fs/proc/base.c b/fs/proc/base.c index 58c3e6a..a891fe4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2376,29 +2376,82 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, } #ifdef CONFIG_TASK_IO_ACCOUNTING -static int proc_pid_io_accounting(struct task_struct *task, char *buffer) -{ +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + u64 rchar, wchar, syscr, syscw; + struct task_io_accounting ioac; + + if (!whole) { + rchar = task->rchar; + wchar = task->wchar; + syscr = task->syscr; + syscw = task->syscw; + memcpy(&ioac, &task->ioac, sizeof(ioac)); + } else { + unsigned long flags; + struct task_struct *t = task; + rchar = wchar = syscr = syscw = 0; + memset(&ioac, 0, sizeof(ioac)); + + rcu_read_lock(); + do { + rchar += t->rchar; + wchar += t->wchar; + syscr += t->syscr; + syscw += t->syscw; + + ioac.read_bytes += t->ioac.read_bytes; + ioac.write_bytes += t->ioac.write_bytes; + ioac.cancelled_write_bytes += + t->ioac.cancelled_write_bytes; + t = next_thread(t); + } while (t != task); + rcu_read_unlock(); + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + rchar += sig->rchar; + wchar += sig->wchar; + syscr += sig->syscr; + syscw += sig->syscw; + + ioac.read_bytes += sig->ioac.read_bytes; + ioac.write_bytes += sig->ioac.write_bytes; + ioac.cancelled_write_bytes += + sig->ioac.cancelled_write_bytes; + + unlock_task_sighand(task, &flags); + } + } + return sprintf(buffer, -#ifdef CONFIG_TASK_XACCT "rchar: %llu\n" "wchar: %llu\n" "syscr: %llu\n" "syscw: %llu\n" -#endif "read_bytes: %llu\n" "write_bytes: %llu\n" "cancelled_write_bytes: %llu\n", -#ifdef CONFIG_TASK_XACCT - (unsigned long long)task->rchar, - (unsigned long long)task->wchar, - (unsigned long long)task->syscr, - (unsigned long long)task->syscw, -#endif - (unsigned long long)task->ioac.read_bytes, - (unsigned long long)task->ioac.write_bytes, - (unsigned long long)task->ioac.cancelled_write_bytes); + (unsigned long long)rchar, + (unsigned long long)wchar, + (unsigned long long)syscr, + (unsigned long long)syscw, + (unsigned long long)ioac.read_bytes, + (unsigned long long)ioac.write_bytes, + (unsigned long long)ioac.cancelled_write_bytes); +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); } -#endif + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ /* * Thread groups @@ -2470,7 +2523,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, pid_io_accounting), + INF("io", S_IRUGO, tgid_io_accounting), #endif }; @@ -2797,6 +2850,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, tid_io_accounting), +#endif }; static int proc_tid_base_readdir(struct file * filp, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 43e54e8..bc0a0dd 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -597,6 +597,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); out: return ent; } @@ -789,6 +790,19 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) spin_unlock(&de->pde_unload_lock); continue_removing: + spin_lock(&de->pde_unload_lock); + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + list_del(&pdeo->lh); + spin_unlock(&de->pde_unload_lock); + pdeo->release(pdeo->inode, pdeo->file); + kfree(pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); + if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b08d100..02eca2e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -111,27 +111,25 @@ int __init proc_init_inodecache(void) return 0; } -static int proc_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_NODIRATIME; - return 0; -} - static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .delete_inode = proc_delete_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, }; -static void pde_users_dec(struct proc_dir_entry *pde) +static void __pde_users_dec(struct proc_dir_entry *pde) { - spin_lock(&pde->pde_unload_lock); pde->pde_users--; if (pde->pde_unload_completion && pde->pde_users == 0) complete(pde->pde_unload_completion); +} + +static void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + __pde_users_dec(pde); spin_unlock(&pde->pde_unload_lock); } @@ -318,36 +316,97 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct proc_dir_entry *pde = PDE(inode); int rv = 0; int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; spin_lock(&pde->pde_unload_lock); if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); + kfree(pdeo); return rv; } pde->pde_users++; open = pde->proc_fops->open; + release = pde->proc_fops->release; spin_unlock(&pde->pde_unload_lock); if (open) rv = open(inode, file); - pde_users_dec(pde); + spin_lock(&pde->pde_unload_lock); + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->inode = inode; + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->release = release; + list_add(&pdeo->lh, &pde->pde_openers); + } else + kfree(pdeo); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); return rv; } +static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, + struct inode *inode, struct file *file) +{ + struct pde_opener *pdeo; + + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->inode == inode && pdeo->file == file) + return pdeo; + } + return NULL; +} + static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; spin_lock(&pde->pde_unload_lock); + pdeo = find_pde_opener(pde, inode, file); if (!pde->proc_fops) { - spin_unlock(&pde->pde_unload_lock); + /* + * Can't simply exit, __fput() will think that everything is OK, + * and move on to freeing struct file. remove_proc_entry() will + * find slacker in opener's list and will try to do non-trivial + * things with struct file. Therefore, remove opener from list. + * + * But if opener is removed from list, who will ->release it? + */ + if (pdeo) { + list_del(&pdeo->lh); + spin_unlock(&pde->pde_unload_lock); + rv = pdeo->release(inode, file); + kfree(pdeo); + } else + spin_unlock(&pde->pde_unload_lock); return rv; } pde->pde_users++; release = pde->proc_fops->release; + if (pdeo) { + list_del(&pdeo->lh); + kfree(pdeo); + } spin_unlock(&pde->pde_unload_lock); if (release) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 28cbca8..4422023 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -63,6 +63,7 @@ extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; +extern const struct file_operations proc_kmsg_operations; extern const struct inode_operations proc_net_inode_operations; void free_proc_entry(struct proc_dir_entry *de); @@ -88,3 +89,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, struct dentry *dentry); int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, filldir_t filldir); + +struct pde_opener { + struct inode *inode; + struct file *file; + int (*release)(struct inode *, struct file *); + struct list_head lh; +}; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e78c81f..c2370c7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,6 +23,10 @@ #define CORE_STR "CORE" +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + static int open_kcore(struct inode * inode, struct file * filp) { return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; @@ -164,11 +168,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); elf->e_shoff = 0; -#if defined(CONFIG_H8300) - elf->e_flags = ELF_FLAGS; -#else - elf->e_flags = 0; -#endif + elf->e_flags = ELF_CORE_EFLAGS; elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize= sizeof(struct elf_phdr); elf->e_phnum = nphdr; diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ff3b90b..9fd5df3 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -15,6 +15,8 @@ #include <asm/uaccess.h> #include <asm/io.h> +#include "internal.h" + extern wait_queue_head_t log_wait; extern int do_syslog(int type, char __user *bug, int count); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index c652d46..ded9698 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -232,7 +232,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off, #undef K } -extern const struct seq_operations fragmentation_op; static int fragmentation_open(struct inode *inode, struct file *file) { (void)inode; @@ -246,7 +245,6 @@ static const struct file_operations fragmentation_file_operations = { .release = seq_release, }; -extern const struct seq_operations pagetypeinfo_op; static int pagetypeinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &pagetypeinfo_op); @@ -259,7 +257,6 @@ static const struct file_operations pagetypeinfo_file_ops = { .release = seq_release, }; -extern const struct seq_operations zoneinfo_op; static int zoneinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &zoneinfo_op); @@ -356,7 +353,6 @@ static const struct file_operations proc_devinfo_operations = { .release = seq_release, }; -extern const struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { return seq_open(file, &vmstat_op); @@ -468,14 +464,25 @@ static const struct file_operations proc_slabstats_operations = { #ifdef CONFIG_MMU static int vmalloc_open(struct inode *inode, struct file *file) { - return seq_open(file, &vmalloc_op); + unsigned int *ptr = NULL; + int ret; + + if (NUMA_BUILD) + ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + ret = seq_open(file, &vmalloc_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ptr; + } else + kfree(ptr); + return ret; } static const struct file_operations proc_vmalloc_operations = { .open = vmalloc_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #endif diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index b224a28..7bc296f 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -27,6 +27,11 @@ #include "internal.h" +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + int seq_open_net(struct inode *ino, struct file *f, const struct seq_operations *ops, int size) { @@ -185,12 +190,6 @@ void proc_net_remove(struct net *net, const char *name) } EXPORT_SYMBOL_GPL(proc_net_remove); -struct net *get_proc_net(const struct inode *inode) -{ - return maybe_get_net(PDE_NET(PDE(inode))); -} -EXPORT_SYMBOL_GPL(get_proc_net); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 164bd9f..7546a91 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -636,7 +636,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, struct pagemapread pm; int pagecount; int ret = -ESRCH; - struct mm_walk pagemap_walk; + struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; @@ -186,7 +186,7 @@ static void quota_sync_sb(struct super_block *sb, int type) void sync_dquots(struct super_block *sb, int type) { - int cnt, dirty; + int cnt; if (sb) { if (sb->s_qcop->quota_sync) @@ -198,11 +198,17 @@ void sync_dquots(struct super_block *sb, int type) restart: list_for_each_entry(sb, &super_blocks, s_list) { /* This test just improves performance so it needn't be reliable... */ - for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) - if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_any_dirty(&sb_dqopt(sb)->info[cnt])) - dirty = 1; - if (!dirty) + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && type != cnt) + continue; + if (!sb_has_quota_enabled(sb, cnt)) + continue; + if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && + list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) + continue; + break; + } + if (cnt == MAXQUOTAS) continue; sb->s_count++; spin_unlock(&sb_lock); diff --git a/fs/quota_v1.c b/fs/quota_v1.c index a6cf926..5ae15b1 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c @@ -1,6 +1,7 @@ #include <linux/errno.h> #include <linux/fs.h> #include <linux/quota.h> +#include <linux/quotaops.h> #include <linux/dqblk_v1.h> #include <linux/quotaio_v1.h> #include <linux/kernel.h> diff --git a/fs/quota_v2.c b/fs/quota_v2.c index 234ada9..b53827d 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/quotaops.h> #include <asm/byteorder.h> diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e396b2f..c8f60ee 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -34,15 +34,10 @@ ** from within kupdate, it will ignore the immediate flag */ -#include <asm/uaccess.h> -#include <asm/system.h> - #include <linux/time.h> #include <linux/semaphore.h> - #include <linux/vmalloc.h> #include <linux/reiserfs_fs.h> - #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fcntl.h> @@ -54,6 +49,9 @@ #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/uaccess.h> + +#include <asm/system.h> /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ @@ -558,13 +556,13 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, static inline void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC(p_s_sb, journal.lock_journal); - down(&SB_JOURNAL(p_s_sb)->j_lock); + mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex); } /* unlock the current transaction */ static inline void unlock_journal(struct super_block *p_s_sb) { - up(&SB_JOURNAL(p_s_sb)->j_lock); + mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex); } static inline void get_journal_list(struct reiserfs_journal_list *jl) @@ -1045,9 +1043,9 @@ static int flush_commit_list(struct super_block *s, } /* make sure nobody is trying to flush this one at the same time */ - down(&jl->j_commit_lock); + mutex_lock(&jl->j_commit_mutex); if (!journal_list_still_alive(s, trans_id)) { - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); goto put_jl; } BUG_ON(jl->j_trans_id == 0); @@ -1057,7 +1055,7 @@ static int flush_commit_list(struct super_block *s, if (flushall) { atomic_set(&(jl->j_older_commits_done), 1); } - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); goto put_jl; } @@ -1181,7 +1179,7 @@ static int flush_commit_list(struct super_block *s, if (flushall) { atomic_set(&(jl->j_older_commits_done), 1); } - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); put_jl: put_journal_list(s, jl); @@ -1411,8 +1409,8 @@ static int flush_journal_list(struct super_block *s, /* if flushall == 0, the lock is already held */ if (flushall) { - down(&journal->j_flush_sem); - } else if (!down_trylock(&journal->j_flush_sem)) { + mutex_lock(&journal->j_flush_mutex); + } else if (mutex_trylock(&journal->j_flush_mutex)) { BUG(); } @@ -1642,7 +1640,7 @@ static int flush_journal_list(struct super_block *s, jl->j_state = 0; put_journal_list(s, jl); if (flushall) - up(&journal->j_flush_sem); + mutex_unlock(&journal->j_flush_mutex); put_fs_excl(); return err; } @@ -1772,12 +1770,12 @@ static int kupdate_transactions(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); chunk.nr = 0; - down(&journal->j_flush_sem); + mutex_lock(&journal->j_flush_mutex); if (!journal_list_still_alive(s, orig_trans_id)) { goto done; } - /* we've got j_flush_sem held, nobody is going to delete any + /* we've got j_flush_mutex held, nobody is going to delete any * of these lists out from underneath us */ while ((num_trans && transactions_flushed < num_trans) || @@ -1812,7 +1810,7 @@ static int kupdate_transactions(struct super_block *s, } done: - up(&journal->j_flush_sem); + mutex_unlock(&journal->j_flush_mutex); return ret; } @@ -2556,7 +2554,7 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); INIT_LIST_HEAD(&jl->j_bh_list); - sema_init(&jl->j_commit_lock, 1); + mutex_init(&jl->j_commit_mutex); SB_JOURNAL(s)->j_num_lists++; get_journal_list(jl); return jl; @@ -2837,8 +2835,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, journal->j_last = NULL; journal->j_first = NULL; init_waitqueue_head(&(journal->j_join_wait)); - sema_init(&journal->j_lock, 1); - sema_init(&journal->j_flush_sem, 1); + mutex_init(&journal->j_mutex); + mutex_init(&journal->j_flush_mutex); journal->j_trans_id = 10; journal->j_mount_id = 10; @@ -4030,7 +4028,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * the new transaction is fully setup, and we've already flushed the * ordered bh list */ - down(&jl->j_commit_lock); + mutex_lock(&jl->j_commit_mutex); /* save the transaction id in case we need to commit it later */ commit_trans_id = jl->j_trans_id; @@ -4196,7 +4194,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, lock_kernel(); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); - up(&jl->j_commit_lock); + mutex_unlock(&jl->j_commit_mutex); /* honor the flush wishes from the caller, simple commits can ** be done outside the journal lock, they are done below diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d40f2b..2ec748b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> +#include <linux/quotaops.h> #include <linux/vfs.h> #include <linux/mnt_namespace.h> #include <linux/mount.h> @@ -182,7 +183,7 @@ static int finish_unfinished(struct super_block *s) int ret = reiserfs_quota_on_mount(s, i); if (ret < 0) reiserfs_warning(s, - "reiserfs: cannot turn on journalled quota: error %d", + "reiserfs: cannot turn on journaled quota: error %d", ret); } } @@ -876,7 +877,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin mount options were selected. */ unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ char **jdev_name, - unsigned int *commit_max_age) + unsigned int *commit_max_age, + char **qf_names, + unsigned int *qfmt) { int c; char *arg = NULL; @@ -992,9 +995,11 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'u' || c == 'g') { int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - if (sb_any_quota_enabled(s)) { + if ((sb_any_quota_enabled(s) || + sb_any_quota_suspended(s)) && + (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { reiserfs_warning(s, - "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); + "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); return 0; } if (*arg) { /* Some filename specified? */ @@ -1011,46 +1016,54 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "reiserfs_parse_options: quotafile must be on filesystem root."); return 0; } - REISERFS_SB(s)->s_qf_names[qtype] = + qf_names[qtype] = kmalloc(strlen(arg) + 1, GFP_KERNEL); - if (!REISERFS_SB(s)->s_qf_names[qtype]) { + if (!qf_names[qtype]) { reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name."); return 0; } - strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); + strcpy(qf_names[qtype], arg); *mount_options |= 1 << REISERFS_QUOTA; } else { - kfree(REISERFS_SB(s)->s_qf_names[qtype]); - REISERFS_SB(s)->s_qf_names[qtype] = NULL; + if (qf_names[qtype] != + REISERFS_SB(s)->s_qf_names[qtype]) + kfree(qf_names[qtype]); + qf_names[qtype] = NULL; } } if (c == 'f') { if (!strcmp(arg, "vfsold")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; + *qfmt = QFMT_VFS_OLD; else if (!strcmp(arg, "vfsv0")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; + *qfmt = QFMT_VFS_V0; else { reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified."); return 0; } + if ((sb_any_quota_enabled(s) || + sb_any_quota_suspended(s)) && + *qfmt != REISERFS_SB(s)->s_jquota_fmt) { + reiserfs_warning(s, + "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); + return 0; + } } #else if (c == 'u' || c == 'g' || c == 'f') { reiserfs_warning(s, - "reiserfs_parse_options: journalled quota options not supported."); + "reiserfs_parse_options: journaled quota options not supported."); return 0; } #endif } #ifdef CONFIG_QUOTA - if (!REISERFS_SB(s)->s_jquota_fmt - && (REISERFS_SB(s)->s_qf_names[USRQUOTA] - || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { + if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt + && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { reiserfs_warning(s, - "reiserfs_parse_options: journalled quota format not specified."); + "reiserfs_parse_options: journaled quota format not specified."); return 0; } /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ @@ -1130,6 +1143,21 @@ static void handle_attrs(struct super_block *s) } } +#ifdef CONFIG_QUOTA +static void handle_quota_files(struct super_block *s, char **qf_names, + unsigned int *qfmt) +{ + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; + } + REISERFS_SB(s)->s_jquota_fmt = *qfmt; +} +#endif + static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) { struct reiserfs_super_block *rs; @@ -1141,23 +1169,30 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) struct reiserfs_journal *journal = SB_JOURNAL(s); char *new_opts = kstrdup(arg, GFP_KERNEL); int err; + char *qf_names[MAXQUOTAS]; + unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; + + memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options - (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { + (s, arg, &mount_options, &blocks, NULL, &commit_max_age, + qf_names, &qfmt)) { #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) { - kfree(REISERFS_SB(s)->s_qf_names[i]); - REISERFS_SB(s)->s_qf_names[i] = NULL; - } + for (i = 0; i < MAXQUOTAS; i++) + if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) + kfree(qf_names[i]); #endif err = -EINVAL; goto out_err; } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif handle_attrs(s); @@ -1570,6 +1605,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) char *jdev_name; struct reiserfs_sb_info *sbi; int errval = -EINVAL; + char *qf_names[MAXQUOTAS] = {}; + unsigned int qfmt = 0; save_mount_options(s, data); @@ -1597,9 +1634,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, - &commit_max_age) == 0) { + &commit_max_age, qf_names, &qfmt) == 0) { goto error; } +#ifdef CONFIG_QUOTA + handle_quota_files(s, qf_names, &qfmt); +#endif if (blocks) { SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " @@ -1819,7 +1859,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); - error: +error: if (jinit_done) { /* kill the commit thread, free journal ram */ journal_release_error(NULL, s); } @@ -1830,10 +1870,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) #ifdef CONFIG_QUOTA { int j; - for (j = 0; j < MAXQUOTAS; j++) { - kfree(sbi->s_qf_names[j]); - sbi->s_qf_names[j] = NULL; - } + for (j = 0; j < MAXQUOTAS; j++) + kfree(qf_names[j]); } #endif kfree(sbi); @@ -1980,7 +2018,7 @@ static int reiserfs_release_dquot(struct dquot *dquot) static int reiserfs_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); @@ -2026,6 +2064,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, int err; struct nameidata nd; struct inode *inode; + struct reiserfs_transaction_handle th; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; @@ -2053,17 +2092,28 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, } mark_inode_dirty(inode); } - /* Not journalling quota? No more tests needed... */ - if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && - !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { - path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path, 0); - } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - reiserfs_warning(sb, + /* Journaling quota? */ + if (REISERFS_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. " "Journalled quota will not work."); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (reiserfs_file_data_log(inode)) { + /* Just start temporary transaction and finish it */ + err = journal_begin(&th, sb, 1); + if (err) + return err; + err = journal_end_sync(&th, sb, 1); + if (err) + return err; + } path_put(&nd.path); return vfs_quota_on(sb, type, format_id, path, 0); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 5e90a95..056008d 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -6,8 +6,6 @@ #include <linux/reiserfs_xattr.h> #include <asm/uaccess.h> -#define XATTR_SECURITY_PREFIX "security." - static int security_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 024a938..60abe2b 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -7,8 +7,6 @@ #include <linux/reiserfs_xattr.h> #include <asm/uaccess.h> -#define XATTR_TRUSTED_PREFIX "trusted." - static int trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 073f393..1384efc 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -10,8 +10,6 @@ # include <linux/reiserfs_acl.h> #endif -#define XATTR_USER_PREFIX "user." - static int user_get(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/signalfd.c b/fs/signalfd.c index 6197256..9c39bc7 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -205,11 +205,19 @@ static const struct file_operations signalfd_fops = { .read = signalfd_read, }; -asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) +asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, + size_t sizemask, int flags) { sigset_t sigmask; struct signalfd_ctx *ctx; + /* Check the SFD_* constants for consistency. */ + BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK)) + return -EINVAL; + if (sizemask != sizeof(sigset_t) || copy_from_user(&sigmask, user_mask, sizeof(sigmask))) return -EINVAL; @@ -227,7 +235,8 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx); + ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, + flags & (O_CLOEXEC | O_NONBLOCK)); if (ufd < 0) kfree(ctx); } else { @@ -249,3 +258,9 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas return ufd; } + +asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, + size_t sizemask) +{ + return sys_signalfd4(ufd, user_mask, sizemask, 0); +} diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index 8182f05..8c177eb 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -13,7 +13,6 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/dirent.h> #include <linux/smb_fs.h> #include <linux/pagemap.h> #include <linux/net.h> diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index d517a27..ee536e8 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -16,7 +16,6 @@ #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/dcache.h> -#include <linux/dirent.h> #include <linux/nls.h> #include <linux/smp_lock.h> #include <linux/net.h> @@ -70,6 +70,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); + INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -139,7 +139,8 @@ asmlinkage long sys_fdatasync(unsigned int fd) * before performing the write. * * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the - * range which are not presently under writeback. + * range which are not presently under writeback. Note that this may block for + * significant periods due to exhaustion of disk request structures. * * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range * after performing the write. diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 8c0e4b9..c1a7efb 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -398,7 +398,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, } /** - * sysfs_add_one - add sysfs_dirent to parent + * __sysfs_add_one - add sysfs_dirent to parent without warning * @acxt: addrm context to use * @sd: sysfs_dirent to be added * @@ -417,7 +417,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, * 0 on success, -EEXIST if entry with the given name already * exists. */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) { if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) return -EEXIST; @@ -435,6 +435,39 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) } /** + * sysfs_add_one - add sysfs_dirent to parent + * @acxt: addrm context to use + * @sd: sysfs_dirent to be added + * + * Get @acxt->parent_sd and set sd->s_parent to it and increment + * nlink of parent inode if @sd is a directory and link into the + * children list of the parent. + * + * This function should be called between calls to + * sysfs_addrm_start() and sysfs_addrm_finish() and should be + * passed the same @acxt as passed to sysfs_addrm_start(). + * + * LOCKING: + * Determined by sysfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) +{ + int ret; + + ret = __sysfs_add_one(acxt, sd); + if (ret == -EEXIST) { + printk(KERN_WARNING "sysfs: duplicate filename '%s' " + "can not be created\n", sd->s_name); + WARN_ON(1); + } + return ret; +} + +/** * sysfs_remove_one - remove sysfs_dirent from parent * @acxt: addrm context to use * @sd: sysfs_dirent to be removed diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e7735f6..3f07893 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,6 +14,7 @@ #include <linux/kobject.h> #include <linux/kallsyms.h> #include <linux/slab.h> +#include <linux/fsnotify.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/list.h> @@ -585,9 +586,11 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - rc = notify_change(victim, &newattrs); + newattrs.ia_ctime = current_fs_time(inode->i_sb); + rc = sysfs_setattr(victim, &newattrs); if (rc == 0) { + fsnotify_change(victim, newattrs.ia_valid); mutex_lock(&sysfs_mutex); victim_sd->s_mode = newattrs.ia_mode; mutex_unlock(&sysfs_mutex); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 817f596..a3ba217 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -19,13 +19,8 @@ #include "sysfs.h" -/** - * sysfs_create_link - create symlink between two objects. - * @kobj: object whose directory we're creating the link in. - * @target: object we're pointing to. - * @name: name of the symlink. - */ -int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) { struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; @@ -65,7 +60,10 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char target_sd = NULL; /* reference is now owned by the symlink */ sysfs_addrm_start(&acxt, parent_sd); - error = sysfs_add_one(&acxt, sd); + if (warn) + error = sysfs_add_one(&acxt, sd); + else + error = __sysfs_add_one(&acxt, sd); sysfs_addrm_finish(&acxt); if (error) @@ -80,6 +78,33 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char } /** + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 1); +} + +/** + * sysfs_create_link_nowarn - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + * + * This function does the same as sysf_create_link(), but it + * doesn't warn if the link already exists. + */ +int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link(kobj, target, name, 0); +} + +/** * sysfs_remove_link - remove symlink in object's directory. * @kobj: object we're acting for. * @name: name of the symlink to remove. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ce4e15f8..a5db496 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -107,6 +107,7 @@ struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); void sysfs_put_active_two(struct sysfs_dirent *sd); void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *parent_sd); +int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); diff --git a/fs/timerfd.c b/fs/timerfd.c index d87d354..c502c60 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -184,7 +184,11 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) int ufd; struct timerfd_ctx *ctx; - if (flags) + /* Check the TFD_* constants for consistency. */ + BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); + + if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) return -EINVAL; if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) @@ -198,7 +202,8 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx); + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + flags & (O_CLOEXEC | O_NONBLOCK)); if (ufd < 0) kfree(ctx); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 85b22b5..227c9d7 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -76,6 +76,7 @@ #include <linux/errno.h> #include <linux/fs.h> +#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/stat.h> @@ -1232,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - struct match_token *tp = tokens; + const struct match_token *tp = tokens; while (tp->token != Opt_onerror_panic && tp->token != mval) ++tp; diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index b546ba6..155c10b 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -621,7 +621,7 @@ shortname: memcpy(de->name, msdos_name, MSDOS_NAME); de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; de->lcase = lcase; - fat_date_unix2dos(ts->tv_sec, &time, &date); + fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); de->time = de->ctime = time; de->date = de->cdate = de->adate = date; de->ctime_cs = 0; |