From 1b56e98990bcdbb20b9fab163654b9315bf158e8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Feb 2014 15:18:55 -0500 Subject: ocfs2 syncs the wrong range... Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ocfs2/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8450262bc..51632c4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2393,8 +2393,8 @@ out_dio: if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, pos, - pos + count - 1); + ret = filemap_fdatawrite_range(file->f_mapping, *ppos, + *ppos + count - 1); if (ret < 0) written = ret; @@ -2407,8 +2407,8 @@ out_dio: } if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, pos, - pos + count - 1); + ret = filemap_fdatawait_range(file->f_mapping, *ppos, + *ppos + count - 1); } /* -- cgit v1.1 From 9c225f2655e36a470c4f58dbbc99244c5fc7f2d4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Mar 2014 09:36:58 -0800 Subject: vfs: atomic f_pos accesses as per POSIX Our write() system call has always been atomic in the sense that you get the expected thread-safe contiguous write, but we haven't actually guaranteed that concurrent writes are serialized wrt f_pos accesses, so threads (or processes) that share a file descriptor and use "write()" concurrently would quite likely overwrite each others data. This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says: "2.9.7 Thread Interactions with Regular File Operations All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: [...]" and one of the effects is the file position update. This unprotected file position behavior is not new behavior, and nobody has ever cared. Until now. Yongzhi Pan reported unexpected behavior to Michael Kerrisk that was due to this. This resolves the issue with a f_pos-specific lock that is taken by read/write/lseek on file descriptors that may be shared across threads or processes. Reported-by: Yongzhi Pan Reported-by: Michael Kerrisk Cc: Al Viro Signed-off-by: Linus Torvalds Signed-off-by: Al Viro --- fs/file_table.c | 1 + fs/namei.c | 2 +- fs/open.c | 4 ++++ fs/read_write.c | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 46 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 5fff903..5b24008 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -135,6 +135,7 @@ struct file *get_empty_filp(void) atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/namei.c b/fs/namei.c index 385f781..2f730ef 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.need_put) + if (f.flags & FDPUT_FPUT) *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); diff --git a/fs/open.c b/fs/open.c index 4b3e1ed..b9ed8b2 100644 --- a/fs/open.c +++ b/fs/open.c @@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f, return 0; } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); if (unlikely(WARN_ON(!f->f_op))) { error = -ENODEV; diff --git a/fs/read_write.c b/fs/read_write.c index edc5746..932bb34 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ +static inline struct fd fdget_pos(int fd) +{ + struct fd f = fdget(fd); + struct file *file = f.file; + + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (file_count(file) > 1) { + f.flags |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); + } + } + return f; +} + +static inline void fdput_pos(struct fd f) +{ + if (f.flags & FDPUT_POS_UNLOCK) + mutex_unlock(&f.file->f_pos_lock); + fdput(f); +} + SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput(f); + fdput_pos(f); return retval; } @@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ret = vfs_read(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; } @@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, ret = vfs_write(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; @@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev); SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_readv(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_writev(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, ret = compat_readv(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } @@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, ret = compat_writev(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } -- cgit v1.1 From bd2a31d522344b3ac2fb680bd2366e77a9bd8209 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Mar 2014 14:54:22 -0500 Subject: get rid of fget_light() instead of returning the flags by reference, we can just have the low-level primitive return those in lower bits of unsigned long, with struct file * derived from the rest. Signed-off-by: Al Viro --- fs/file.c | 56 +++++++++++++++++++++++++++++++++++++++++++------------- fs/read_write.c | 16 +--------------- 2 files changed, 44 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index db25c2b..60a45e9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -683,35 +683,65 @@ EXPORT_SYMBOL(fget_raw); * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ -struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed) +static unsigned long __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; - *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = __fcheck_files(files, fd); - if (file && (file->f_mode & mask)) - file = NULL; + if (!file || unlikely(file->f_mode & mask)) + return 0; + return (unsigned long)file; } else { file = __fget(fd, mask); - if (file) - *fput_needed = 1; + if (!file) + return 0; + return FDPUT_FPUT | (unsigned long)file; } - - return file; } -struct file *fget_light(unsigned int fd, int *fput_needed) +unsigned long __fdget(unsigned int fd) { - return __fget_light(fd, FMODE_PATH, fput_needed); + return __fget_light(fd, FMODE_PATH); } -EXPORT_SYMBOL(fget_light); +EXPORT_SYMBOL(__fdget); -struct file *fget_raw_light(unsigned int fd, int *fput_needed) +unsigned long __fdget_raw(unsigned int fd) { - return __fget_light(fd, 0, fput_needed); + return __fget_light(fd, 0); +} + +unsigned long __fdget_pos(unsigned int fd) +{ + struct files_struct *files = current->files; + struct file *file; + unsigned long v; + + if (atomic_read(&files->count) == 1) { + file = __fcheck_files(files, fd); + v = 0; + } else { + file = __fget(fd, 0); + v = FDPUT_FPUT; + } + if (!file) + return 0; + + if (file->f_mode & FMODE_ATOMIC_POS) { + if (file_count(file) > 1) { + v |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); + } + } + return v | (unsigned long)file; } +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ + void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; diff --git a/fs/read_write.c b/fs/read_write.c index 932bb34..54e19b9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -264,23 +264,9 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); -/* - * We only lock f_pos if we have threads or if the file might be - * shared with another process. In both cases we'll have an elevated - * file count (done either by fdget() or by fork()). - */ static inline struct fd fdget_pos(int fd) { - struct fd f = fdget(fd); - struct file *file = f.file; - - if (file && (file->f_mode & FMODE_ATOMIC_POS)) { - if (file_count(file) > 1) { - f.flags |= FDPUT_POS_UNLOCK; - mutex_lock(&file->f_pos_lock); - } - } - return f; + return __to_fd(__fdget_pos(fd)); } static inline void fdput_pos(struct fd f) -- cgit v1.1