summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/affs/file.c5
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c55
-rw-r--r--fs/btrfs/backref.c10
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/extent_io.c45
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/inode.c16
-rw-r--r--fs/btrfs/ioctl.c119
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/ceph/addr.c4
-rw-r--r--fs/ceph/caps.c27
-rw-r--r--fs/ceph/file.c6
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/mds_client.c16
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsencrypt.c2
-rw-r--r--fs/cifs/cifsfs.c1
-rw-r--r--fs/cifs/cifsfs.h12
-rw-r--r--fs/cifs/cifssmb.c21
-rw-r--r--fs/cifs/connect.c3
-rw-r--r--fs/cifs/smb2pdu.c24
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/dax.c53
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/devpts/inode.c20
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/efivarfs/file.c70
-rw-r--r--fs/efivarfs/inode.c30
-rw-r--r--fs/efivarfs/internal.h3
-rw-r--r--fs/efivarfs/super.c16
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/ext2/file.c19
-rw-r--r--fs/ext2/inode.c16
-rw-r--r--fs/ext4/balloc.c7
-rw-r--r--fs/ext4/crypto.c56
-rw-r--r--fs/ext4/dir.c13
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/extents.c4
-rw-r--r--fs/ext4/file.c28
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/inode.c78
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/move_extent.c16
-rw-r--r--fs/ext4/namei.c26
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/fs-writeback.c39
-rw-r--r--fs/hpfs/namei.c31
-rw-r--r--fs/inode.c6
-rw-r--r--fs/jffs2/README.Locking5
-rw-r--r--fs/jffs2/build.c75
-rw-r--r--fs/jffs2/dir.c11
-rw-r--r--fs/jffs2/file.c39
-rw-r--r--fs/jffs2/gc.c17
-rw-r--r--fs/jffs2/nodelist.h6
-rw-r--r--fs/namei.c22
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/nfs/blocklayout/extent_tree.c10
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c2
-rw-r--r--fs/nfs/nfs42proc.c119
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/pnfs.c175
-rw-r--r--fs/nfs/pnfs.h4
-rw-r--r--fs/notify/mark.c53
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c14
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/overlayfs/dir.c10
-rw-r--r--fs/overlayfs/inode.c2
-rw-r--r--fs/overlayfs/super.c13
-rw-r--r--fs/pnode.c9
-rw-r--r--fs/proc/task_mmu.c73
-rw-r--r--fs/proc/task_nommu.c49
-rw-r--r--fs/read_write.c9
-rw-r--r--fs/super.c1
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/xfs_aops.c6
-rw-r--r--fs/xfs/xfs_aops.h1
-rw-r--r--fs/xfs/xfs_bmap_util.c3
-rw-r--r--fs/xfs/xfs_log_recover.c4
88 files changed, 1166 insertions, 603 deletions
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0548c53..22fc7c8 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
page->index, to);
BUG_ON(to > PAGE_CACHE_SIZE);
- kmap(page);
- data = page_address(page);
bsize = AFFS_SB(sb)->s_data_blksize;
tmp = page->index << PAGE_CACHE_SHIFT;
bidx = tmp / bsize;
@@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
+ data = kmap_atomic(page);
memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+ kunmap_atomic(data);
affs_brelse(bh);
bidx++;
pos += tmp;
boff = 0;
}
flush_dcache_page(page);
- kunmap(page);
return 0;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 051ea48..7d914c6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -653,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = (unsigned long) get_random_int();
+ random_variable = get_random_long();
random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7b9cd49..826b164 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1201,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+ if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+ bdev->bd_inode->i_flags = S_DAX;
+ else
+ bdev->bd_inode->i_flags = 0;
+
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
@@ -1693,13 +1697,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ if (dax_mapping(mapping)) {
+ struct block_device *bdev = I_BDEV(mapping->host);
+
+ return dax_writeback_mapping_range(mapping, bdev, wbc);
+ }
+ return generic_writepages(mapping, wbc);
+}
+
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
- .writepages = generic_writepages,
+ .writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
.is_dirty_writeback = buffer_check_dirty_writeback,
@@ -1730,43 +1745,25 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static void blkdev_vm_open(struct vm_area_struct *vma)
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
- struct inode *bd_inode = bdev_file_inode(vma->vm_file);
- struct block_device *bdev = I_BDEV(bd_inode);
-
- inode_lock(bd_inode);
- bdev->bd_map_count++;
- inode_unlock(bd_inode);
+ return dax_pfn_mkwrite(vma, vmf);
}
-static void blkdev_vm_close(struct vm_area_struct *vma)
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
{
- struct inode *bd_inode = bdev_file_inode(vma->vm_file);
- struct block_device *bdev = I_BDEV(bd_inode);
-
- inode_lock(bd_inode);
- bdev->bd_map_count--;
- inode_unlock(bd_inode);
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
}
static const struct vm_operations_struct blkdev_dax_vm_ops = {
- .open = blkdev_vm_open,
- .close = blkdev_vm_close,
.fault = blkdev_dax_fault,
.pmd_fault = blkdev_dax_pmd_fault,
- .pfn_mkwrite = blkdev_dax_fault,
+ .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
};
static const struct vm_operations_struct blkdev_default_vm_ops = {
- .open = blkdev_vm_open,
- .close = blkdev_vm_close,
.fault = filemap_fault,
.map_pages = filemap_map_pages,
};
@@ -1774,18 +1771,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = {
static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(file);
- struct block_device *bdev = I_BDEV(bd_inode);
file_accessed(file);
- inode_lock(bd_inode);
- bdev->bd_map_count++;
if (IS_DAX(bd_inode)) {
vma->vm_ops = &blkdev_dax_vm_ops;
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &blkdev_default_vm_ops;
}
- inode_unlock(bd_inode);
return 0;
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b90cd37..f6dac40 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
read_extent_buffer(eb, dest + bytes_left,
name_off, name_len);
if (eb != eb_in) {
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- atomic_inc(&eb->refs);
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ if (!path->skip_locking)
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->nodes[0] = NULL;
+ path->locks[0] = 0;
}
btrfs_release_path(path);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c473c42..3346cd8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
faili = nr_pages - 1;
cb->nr_pages = nr_pages;
- /* In the parent-locked case, we only locked the range we are
- * interested in. In all other cases, we can opportunistically
- * cache decompressed data that goes beyond the requested range. */
- if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0be47e4..b57daa8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
*
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ struct list_head *ins_list, bool *emitted)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
if (over)
return 1;
+ *emitted = true;
}
return 0;
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f70119f..0167853 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ struct list_head *ins_list, bool *emitted);
/* for init */
int __init btrfs_delayed_inode_init(void);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e7c97a..392592d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree,
struct block_device *bdev;
int ret;
int nr = 0;
- int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
size_t pg_offset = 0;
size_t iosize;
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+ unsigned long this_bio_flag = 0;
set_page_extent_mapped(page);
@@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree,
kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- if (!parent_locked)
- unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, get_extent, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- if (parent_locked)
- free_extent_state(cached);
- else
- unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree,
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
*bio_flags = this_bio_flag;
} else {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
}
cur = cur + iosize;
pg_offset += iosize;
@@ -3213,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret;
}
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
- int ret;
-
- ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
- &bio_flags, READ, NULL);
- if (bio)
- ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
- return ret;
-}
-
static noinline void update_nr_written(struct page *page,
struct writeback_control *wbc,
unsigned long nr_written)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0377413..880d529 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,7 +29,6 @@
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_TREE_LOG 2
-#define EXTENT_BIO_PARENT_LOCKED 4
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
@@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
void extent_io_exit(void);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f06eb1..d96f5cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5717,6 +5717,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
char *name_ptr;
int name_len;
int is_curr = 0; /* ctx->pos points to the current index? */
+ bool emitted;
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
@@ -5745,6 +5746,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (ret < 0)
goto err;
+ emitted = false;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
@@ -5824,6 +5826,7 @@ skip:
if (over)
goto nopos;
+ emitted = true;
di_len = btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di) + sizeof(*di);
di_cur += di_len;
@@ -5836,11 +5839,20 @@ next:
if (key_type == BTRFS_DIR_INDEX_KEY) {
if (is_curr)
ctx->pos++;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
if (ret)
goto nopos;
}
+ /*
+ * If we haven't emitted any dir entry, we must not touch ctx->pos as
+ * it was was set to the termination value in previous call. We assume
+ * that "." and ".." were emitted if we reach this point and set the
+ * termination value as well for an empty directory.
+ */
+ if (ctx->pos > 2 && !emitted)
+ goto nopos;
+
/* Reached end of directory/root. Bump pos past the last item. */
ctx->pos++;
@@ -7974,6 +7986,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
if (io_bio->end_io)
@@ -8028,6 +8041,7 @@ static void btrfs_endio_direct_write(struct bio *bio)
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
bio_put(bio);
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 952172c..48aee98 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2794,24 +2794,29 @@ out:
static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
page = grab_cache_page(inode->i_mapping, index);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
if (!PageUptodate(page)) {
- if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
- 0))
- return NULL;
+ int ret;
+
+ ret = btrfs_readpage(NULL, page);
+ if (ret)
+ return ERR_PTR(ret);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
- return NULL;
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ return ERR_PTR(-EAGAIN);
}
}
- unlock_page(page);
return page;
}
@@ -2823,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
pgoff_t index = off >> PAGE_CACHE_SHIFT;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = extent_same_get_page(inode, index + i);
- if (!pages[i])
- return -ENOMEM;
+ if (IS_ERR(pages[i])) {
+ int err = PTR_ERR(pages[i]);
+
+ if (err == -EAGAIN)
+ goto again;
+ pages[i] = NULL;
+ return err;
+ }
}
return 0;
}
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+ bool retry_range_locking)
{
- /* do any pending delalloc/csum calc on src, one way or
- another, and lock file content */
+ /*
+ * Do any pending delalloc/csum calculations on inode, one way or
+ * another, and lock file content.
+ * The locking order is:
+ *
+ * 1) pages
+ * 2) range in the inode's io tree
+ */
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2851,8 +2870,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
+ if (!retry_range_locking)
+ return -EAGAIN;
btrfs_wait_ordered_range(inode, off, len);
}
+ return 0;
}
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2877,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len,
+ bool retry_range_locking)
{
+ int ret;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
- lock_extent_range(inode1, loff1, len);
- lock_extent_range(inode2, loff2, len);
+ ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+ if (ret)
+ return ret;
+ ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+ if (ret)
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+ loff1 + len - 1);
+ return ret;
}
struct cmp_pages {
@@ -2901,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
for (i = 0; i < cmp->num_pages; i++) {
pg = cmp->src_pages[i];
- if (pg)
+ if (pg) {
+ unlock_page(pg);
page_cache_release(pg);
+ }
pg = cmp->dst_pages[i];
- if (pg)
+ if (pg) {
+ unlock_page(pg);
page_cache_release(pg);
+ }
}
kfree(cmp->src_pages);
kfree(cmp->dst_pages);
@@ -2966,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
src_page = cmp->src_pages[i];
dst_page = cmp->dst_pages[i];
+ ASSERT(PageLocked(src_page));
+ ASSERT(PageLocked(dst_page));
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -3078,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
goto out_unlock;
}
+again:
ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
if (ret)
goto out_unlock;
if (same_inode)
- lock_extent_range(src, same_lock_start, same_lock_len);
+ ret = lock_extent_range(src, same_lock_start, same_lock_len,
+ false);
else
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+ false);
+ /*
+ * If one of the inodes has dirty pages in the respective range or
+ * ordered extents, we need to flush dellaloc and wait for all ordered
+ * extents in the range. We must unlock the pages and the ranges in the
+ * io trees to avoid deadlocks when flushing delalloc (requires locking
+ * pages) and when waiting for ordered extents to complete (they require
+ * range locking).
+ */
+ if (ret == -EAGAIN) {
+ /*
+ * Ranges in the io trees already unlocked. Now unlock all
+ * pages before waiting for all IO to complete.
+ */
+ btrfs_cmp_data_free(&cmp);
+ if (same_inode) {
+ btrfs_wait_ordered_range(src, same_lock_start,
+ same_lock_len);
+ } else {
+ btrfs_wait_ordered_range(src, loff, len);
+ btrfs_wait_ordered_range(dst, dst_loff, len);
+ }
+ goto again;
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ btrfs_cmp_data_free(&cmp);
+ return ret;
+ }
/* pass original length for comparison so we stay within i_size */
ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3795,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 lock_start = min_t(u64, off, destoff);
u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, lock_start, lock_len);
+ ret = lock_extent_range(src, lock_start, lock_len, true);
} else {
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+ true);
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ goto out_unlock;
}
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7cf8509..2c849b0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ if (err == -EEXIST)
+ err = 0;
if (err) {
- BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c2221378..19adeb0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1756,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
u32 pool;
int ret, flags;
+ /* does not support pool namespace yet */
+ if (ci->i_pool_ns_len)
+ return -EIO;
+
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cdbf8cf..6fe0ad2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued)
+ struct ceph_cap *cap, int issued,
+ u32 pool_ns_len)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
ci->i_layout = grant->layout;
+ ci->i_pool_ns_len = pool_ns_len;
+
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(grant->truncate_seq),
@@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
+ u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 8) {
+ u64 flush_tid;
+ u32 caller_uid, caller_gid;
+ u32 osd_epoch_barrier;
+ /* version >= 5 */
+ ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+ /* version >= 6 */
+ ceph_decode_64_safe(&p, end, flush_tid, bad);
+ /* version >= 7 */
+ ceph_decode_32_safe(&p, end, caller_uid, bad);
+ ceph_decode_32_safe(&p, end, caller_gid, bad);
+ /* version >= 8 */
+ ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ }
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
&cap, &issued);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
issued |= __ceph_caps_dirty(ci);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 86a9c38..eb9028e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
false, GFP_NOFS);
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
+ if (!req) {
+ ret = -ENOMEM;
req = orig_req;
goto out;
}
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
snapc, CEPH_NOSNAP, &aio_req->mtime);
- ceph_put_snap_context(snapc);
ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req;
@@ -731,6 +730,7 @@ out:
ceph_aio_complete_req(req, NULL);
}
+ ceph_put_snap_context(snapc);
kfree(aio_work);
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fb4ba2e..5849b88 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ ci->i_pool_ns_len = 0;
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
+ ci->i_pool_ns_len = iinfo->pool_ns_len;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e7b130a..911d64d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ *p += info->pool_ns_len;
+ } else {
+ info->pool_ns_len = 0;
+ }
+
return 0;
bad:
return err;
@@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ /* deny access to directories with pool_ns layouts */
+ if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
+ ceph_inode(req->r_inode)->i_pool_ns_len)
+ return -EIO;
+ if (req->r_locked_dir &&
+ ceph_inode(req->r_locked_dir)->i_pool_ns_len)
+ return -EIO;
+
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ccf11ef..37712cc 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in {
u64 inline_version;
u32 inline_len;
char *inline_data;
+ u32 pool_ns_len;
};
/*
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 75b7d12..9c458eb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -287,6 +287,7 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7dc886c..e956cba 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
* string to the length of the original string to allow for worst case.
*/
md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
- mountdata = kzalloc(md_len + 1, GFP_KERNEL);
+ mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
if (mountdata == NULL) {
rc = -ENOMEM;
goto compose_mount_options_err;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index afa09fc..e682b36 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -714,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
- rc = ENOMEM;
+ rc = -ENOMEM;
ses->auth_key.len = 0;
goto setup_ntlmv2_rsp_ret;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c48ca13..2eea403 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1013,7 +1013,6 @@ const struct file_operations cifs_file_strict_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.clone_file_range = cifs_clone_file_range,
- .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 68c4547..83aac8b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -31,19 +31,15 @@
* so that it will fit. We use hash_64 to convert the value to 31 bits, and
* then add 1, to ensure that we don't end up with a 0 as the value.
*/
-#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
+ if ((sizeof(ino_t)) < (sizeof(u64)))
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+
return (ino_t)fileid;
+
}
-#else
-static inline ino_t
-cifs_uniqueid_to_ino_t(u64 fileid)
-{
- return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
-}
-#endif
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 90b4f9f..76fcb50 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1396,11 +1396,10 @@ openRetry:
* current bigbuf.
*/
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+discard_remaining_data(struct TCP_Server_Info *server)
{
unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
- struct cifs_readdata *rdata = mid->callback_data;
while (remaining > 0) {
int length;
@@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
remaining -= length;
}
- dequeue_mid(mid, rdata->result);
return 0;
}
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ length = discard_remaining_data(server);
+ dequeue_mid(mid, rdata->result);
+ return length;
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server, 0)) {
+ discard_remaining_data(server);
+ return -1;
+ }
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4fbd92d..a763cd3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2999,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
if (ses_init_buf) {
ses_init_buf->trailer.session_req.called_len = 32;
- if (server->server_RFC1001_name &&
- server->server_RFC1001_name[0] != 0)
+ if (server->server_RFC1001_name[0] != 0)
rfc1002mangle(ses_init_buf->trailer.
session_req.called_name,
server->server_RFC1001_name,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 10f8d5c..42e1f44 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1106,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
{
char *data_offset;
struct create_context *cc;
- unsigned int next = 0;
+ unsigned int next;
+ unsigned int remaining;
char *name;
data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
- do {
- cc = (struct create_context *)((char *)cc + next);
+ while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) != 4 ||
- strncmp(name, "RqLs", 4)) {
- next = le32_to_cpu(cc->Next);
- continue;
- }
- return server->ops->parse_lease_buf(cc, epoch);
- } while (next != 0);
+ if (le16_to_cpu(cc->NameLength) == 4 &&
+ strncmp(name, "RqLs", 4) == 0)
+ return server->ops->parse_lease_buf(cc, epoch);
+
+ next = le32_to_cpu(cc->Next);
+ if (!next)
+ break;
+ remaining -= next;
+ cc = (struct create_context *)((char *)cc + next);
+ }
return 0;
}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a5b8eb6..6402eaf 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1261,6 +1261,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
COMPATIBLE_IOCTL(HCIINQUIRY)
COMPATIBLE_IOCTL(HCIUARTSETPROTO)
COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
+COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
+COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c..bbb2ad7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -58,16 +58,35 @@ static void dax_unmap_atomic(struct block_device *bdev,
blk_queue_exit(bdev->bd_queue);
}
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+ struct page *page = alloc_pages(GFP_KERNEL, 0);
+ struct blk_dax_ctl dax = {
+ .size = PAGE_SIZE,
+ .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+ };
+ long rc;
+
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ rc = dax_map_atomic(bdev, &dax);
+ if (rc < 0)
+ return ERR_PTR(rc);
+ memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+ dax_unmap_atomic(bdev, &dax);
+ return page;
+}
+
/*
- * dax_clear_blocks() is called from within transaction context from XFS,
+ * dax_clear_sectors() is called from within transaction context from XFS,
* and hence this means the stack from this point must follow GFP_NOFS
* semantics for all operations.
*/
-int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
+int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
{
- struct block_device *bdev = inode->i_sb->s_bdev;
struct blk_dax_ctl dax = {
- .sector = block << (inode->i_blkbits - 9),
+ .sector = _sector,
.size = _size,
};
@@ -89,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
wmb_pmem();
return 0;
}
-EXPORT_SYMBOL_GPL(dax_clear_blocks);
+EXPORT_SYMBOL_GPL(dax_clear_sectors);
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
@@ -338,7 +357,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
void *entry;
WARN_ON_ONCE(pmd_entry && !dirty);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ if (dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
spin_lock_irq(&mapping->tree_lock);
@@ -464,11 +484,10 @@ static int dax_writeback_one(struct block_device *bdev,
* end]. This is required by data integrity operations to ensure file data is
* on persistent storage prior to completion of the operation.
*/
-int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
- loff_t end)
+int dax_writeback_mapping_range(struct address_space *mapping,
+ struct block_device *bdev, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct block_device *bdev = inode->i_sb->s_bdev;
pgoff_t start_index, end_index, pmd_index;
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
@@ -479,8 +498,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
- start_index = start >> PAGE_CACHE_SHIFT;
- end_index = end >> PAGE_CACHE_SHIFT;
+ if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+ return 0;
+
+ start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
pmd_index = DAX_PMD_INDEX(start_index);
rcu_read_lock();
@@ -1034,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
+ int error;
/*
* We pass NO_SECTOR to dax_radix_entry() because we expect that a
@@ -1043,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* saves us from having to make a call to get_block() here to look
* up the sector.
*/
- dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
+ error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+ true);
+
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (error)
+ return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/fs/dcache.c b/fs/dcache.c
index 92d5140..2398f9f9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
unsigned flags;
dentry->d_inode = inode;
- smp_wmb();
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
WRITE_ONCE(dentry->d_flags, flags);
}
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
- smp_wmb();
dentry->d_inode = NULL;
}
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -1758,8 +1747,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
spin_lock(&dentry->d_lock);
if (inode)
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1f107fd..655f21f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
mutex_unlock(&allocated_ptys_lock);
}
+/*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+
+void devpts_add_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ atomic_inc(&sb->s_active);
+ ihold(ptmx_inode);
+}
+
+void devpts_del_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ iput(ptmx_inode);
+ deactivate_super(sb);
+}
+
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1b2f7ff..d6a9012 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -472,8 +472,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
dio->io_error = -EIO;
if (dio->is_async && dio->rw == READ && dio->should_dirty) {
- bio_check_pages_dirty(bio); /* transfers ownership */
err = bio->bi_error;
+ bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index c424e48..d48e0d2 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -103,9 +104,78 @@ out_free:
return size;
}
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int i_flags;
+ unsigned int flags = 0;
+
+ i_flags = inode->i_flags;
+ if (i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+efivarfs_ioc_setxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int flags;
+ unsigned int i_flags = 0;
+ int error;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~FS_IMMUTABLE_FL)
+ return -EOPNOTSUPP;
+
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ if (flags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+
+
+ error = mnt_want_write_file(file);
+ if (error)
+ return error;
+
+ inode_lock(inode);
+ inode_set_flags(inode, i_flags, S_IMMUTABLE);
+ inode_unlock(inode);
+
+ mnt_drop_write_file(file);
+
+ return 0;
+}
+
+long
+efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ void __user *arg = (void __user *)p;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return efivarfs_ioc_getxflags(file, arg);
+ case FS_IOC_SETFLAGS:
+ return efivarfs_ioc_setxflags(file, arg);
+ }
+
+ return -ENOTTY;
+}
+
const struct file_operations efivarfs_file_operations = {
.open = simple_open,
.read = efivarfs_file_read,
.write = efivarfs_file_write,
.llseek = no_llseek,
+ .unlocked_ioctl = efivarfs_file_ioctl,
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 3381b9d..e2ab6d0 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -15,7 +15,8 @@
#include "internal.h"
struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev)
+ const struct inode *dir, int mode,
+ dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
@@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_fop = &efivarfs_file_operations;
@@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- struct inode *inode;
+ struct inode *inode = NULL;
struct efivar_entry *var;
int namelen, i = 0, err = 0;
+ bool is_removable = false;
if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
return -EINVAL;
- inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
- if (!inode)
- return -ENOMEM;
-
var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
- if (!var) {
- err = -ENOMEM;
- goto out;
- }
+ if (!var)
+ return -ENOMEM;
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
@@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
&var->var.VendorGuid);
+ if (efivar_variable_is_removable(var->var.VendorGuid,
+ dentry->d_name.name, namelen))
+ is_removable = true;
+
+ inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
for (i = 0; i < namelen; i++)
var->var.VariableName[i] = dentry->d_name.name[i];
@@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
out:
if (err) {
kfree(var);
- iput(inode);
+ if (inode)
+ iput(inode);
}
return err;
}
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index b5ff16a..b450518 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations;
extern const struct inode_operations efivarfs_dir_inode_operations;
extern bool efivarfs_valid_name(const char *str, int len);
extern struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev);
+ const struct inode *dir, int mode, dev_t dev,
+ bool is_removable);
extern struct list_head efivarfs_list;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index b8a564f..dd029d1 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
struct dentry *dentry, *root = sb->s_root;
unsigned long size = 0;
char *name;
- int len, i;
+ int len;
int err = -ENOMEM;
+ bool is_removable = false;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
@@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
memcpy(entry->var.VariableName, name16, name_size);
memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
- len = ucs2_strlen(entry->var.VariableName);
+ len = ucs2_utf8size(entry->var.VariableName);
/* name, plus '-', plus GUID, plus NUL*/
name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
if (!name)
goto fail;
- for (i = 0; i < len; i++)
- name[i] = entry->var.VariableName[i] & 0xFF;
+ ucs2_as_utf8(name, entry->var.VariableName, len);
+
+ if (efivar_variable_is_removable(entry->var.VendorGuid, name, len))
+ is_removable = true;
name[len] = '-';
@@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
- inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
+ inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
+ is_removable);
if (!inode)
goto fail_name;
@@ -200,7 +204,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
- inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+ inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
inode->i_op = &efivarfs_dir_inode_operations;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ae1dbcf..cde6074 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -94,6 +94,11 @@
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
+
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
- ewake = 1;
+ if ((epi->event.events & EPOLLEXCLUSIVE) &&
+ !((unsigned long)key & POLLFREE)) {
+ switch ((unsigned long)key & EPOLLINOUT_BITS) {
+ case POLLIN:
+ if (epi->event.events & POLLIN)
+ ewake = 1;
+ break;
+ case POLLOUT:
+ if (epi->event.events & POLLOUT)
+ ewake = 1;
+ break;
+ case 0:
+ ewake = 1;
+ break;
+ }
+ }
wake_up_locked(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
- if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
- (op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
- goto error_tgt_fput;
+ if (epds.events & EPOLLEXCLUSIVE) {
+ if (op == EPOLL_CTL_MOD)
+ goto error_tgt_fput;
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+ goto error_tgt_fput;
+ }
/*
* At this point it is safe to assume that the "private_data" contains
@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
+ if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+ epds.events |= POLLERR | POLLHUP;
+ error = ep_modify(ep, epi, &epds);
+ }
} else
error = -ENOENT;
break;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2c88d68..c1400b1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -80,23 +80,6 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
- int ret;
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
- down_read(&ei->dax_sem);
-
- ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
-
- up_read(&ei->dax_sem);
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
@@ -124,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
.pmd_fault = ext2_dax_pmd_fault,
- .page_mkwrite = ext2_dax_mkwrite,
+ .page_mkwrite = ext2_dax_fault,
.pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 338eefd..6bd58e6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
- err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
- 1 << inode->i_blkbits);
+ err = dax_clear_sectors(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key) <<
+ (inode->i_blkbits - 9),
+ 1 << inode->i_blkbits);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
@@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+#ifdef CONFIG_FS_DAX
+ if (dax_mapping(mapping)) {
+ return dax_writeback_mapping_range(mapping,
+ mapping->host->i_sb->s_bdev,
+ wbc);
+ }
+#endif
+
return mpage_writepages(mapping, wbc, ext2_get_block);
}
@@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
inode->i_flags |= S_DAX;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ec0668a..fe1f50f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init block bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
goto verify;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index c802120..38f7562 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -467,3 +467,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
return size;
return 0;
}
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info;
+ int dir_has_key, cached_with_key;
+
+ if (!ext4_encrypted_inode(dir))
+ return 0;
+
+ if (ci && ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD))))
+ ci = NULL;
+
+ /* this should eventually be an flag in d_flags */
+ cached_with_key = dentry->d_fsdata != NULL;
+ dir_has_key = (ci != NULL);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key)) {
+#if 0 /* Revalidation debug */
+ char buf[80];
+ char *cp = simple_dname(dentry, buf, sizeof(buf));
+
+ if (IS_ERR(cp))
+ cp = (char *) "???";
+ pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
+ cached_with_key, d_is_negative(dentry),
+ dir_has_key);
+#endif
+ return 0;
+ }
+ return 1;
+}
+
+const struct dentry_operations ext4_encrypted_d_ops = {
+ .d_revalidate = ext4_d_revalidate,
+};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d1bca7..33f5e2a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int dir_has_error = 0;
struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+ if (ext4_encrypted_inode(inode)) {
+ err = ext4_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
@@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
+ goto errout;
+ }
}
if (!bh) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0662b28..157b458 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2302,6 +2302,7 @@ struct page *ext4_encrypt(struct inode *inode,
int ext4_decrypt(struct page *page);
int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+extern const struct dentry_operations ext4_encrypted_d_ops;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
int ext4_init_crypto(void);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0ffabaf..3753ceb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3928,7 +3928,7 @@ static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path **ppath, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+ unsigned int allocated)
{
struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
@@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
allocated = convert_initialized_extent(
handle, inode, map, &path,
- flags, allocated, newblock);
+ flags, allocated);
goto out2;
} else if (!ext4_ext_is_unwritten(ex))
goto out;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1126436..4cd318f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -262,23 +262,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
return result;
}
-static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- int err;
- struct inode *inode = file_inode(vma->vm_file);
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- sb_end_pagefault(inode->i_sb);
-
- return err;
-}
-
/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
* handler we check for races agaist truncate. Note that since we cycle through
* i_mmap_sem, we are sure that also any hole punching that began before we
* were called is finished by now and so if it included part of the file we
@@ -311,7 +296,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.pmd_fault = ext4_dax_pmd_fault,
- .page_mkwrite = ext4_dax_mkwrite,
+ .page_mkwrite = ext4_dax_fault,
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
#else
@@ -350,6 +335,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct vfsmount *mnt = filp->f_path.mnt;
+ struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
struct path path;
char buf[64], *cp;
int ret;
@@ -393,6 +379,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ext4_encryption_info(inode) == NULL)
return -ENOKEY;
}
+ if (ext4_encrypted_inode(dir) &&
+ !ext4_is_child_context_consistent_with_parent(dir, inode)) {
+ ext4_warning(inode->i_sb,
+ "Inconsistent encryption contexts: %lu/%lu\n",
+ (unsigned long) dir->i_ino,
+ (unsigned long) inode->i_ino);
+ return -EPERM;
+ }
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fcfd50..acc0ad5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init inode bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
return bh;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bf..aee960b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -686,6 +686,34 @@ out_sem:
return retval;
}
+/*
+ * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
+ * we have to be careful as someone else may be manipulating b_state as well.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
+{
+ unsigned long old_state;
+ unsigned long new_state;
+
+ flags &= EXT4_MAP_FLAGS;
+
+ /* Dummy buffer_head? Set non-atomically. */
+ if (!bh->b_page) {
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
+ return;
+ }
+ /*
+ * Someone else may be modifying b_state. Be careful! This is ugly but
+ * once we get rid of using bh as a container for mapping information
+ * to pass to / from get_block functions, this can go away.
+ */
+ do {
+ old_state = READ_ONCE(bh->b_state);
+ new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+ } while (unlikely(
+ cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+}
+
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -722,7 +750,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_io_end_t *io_end = ext4_inode_aio(inode);
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -1685,7 +1713,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (buffer_unwritten(bh)) {
/* A delayed write to unwritten bh should be marked
@@ -2450,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping,
trace_ext4_writepages(inode, wbc);
+ if (dax_mapping(mapping))
+ return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+ wbc);
+
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
@@ -3253,29 +3285,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* case, we allocate an io_end structure to hook to the iocb.
*/
iocb->private = NULL;
- ext4_inode_aio_set(inode, NULL);
- if (!is_sync_kiocb(iocb)) {
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end) {
- ret = -ENOMEM;
- goto retake_lock;
- }
- /*
- * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
- */
- iocb->private = ext4_get_io_end(io_end);
- /*
- * we save the io structure for current async direct
- * IO, so that later ext4_map_blocks() could flag the
- * io structure whether there is a unwritten extents
- * needs to be converted when IO is completed.
- */
- ext4_inode_aio_set(inode, io_end);
- }
-
if (overwrite) {
get_block_func = ext4_get_block_overwrite;
} else {
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
+ }
+ /*
+ * Grab reference for DIO. Will be dropped in
+ * ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
+ /*
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
+ */
+ ext4_inode_aio_set(inode, io_end);
+ }
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
@@ -4127,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0f6c369..eae5917 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -208,7 +208,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
{
struct ext4_inode_info *ei = EXT4_I(inode);
handle_t *handle = NULL;
- int err = EPERM, migrate = 0;
+ int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
unsigned int jflag;
@@ -583,6 +583,11 @@ group_extend_out:
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
goto mext_out;
+ } else if (IS_DAX(inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with DAX");
+ err = -EOPNOTSUPP;
+ goto mext_out;
}
err = mnt_want_write_file(filp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 61eaf74..4424b7b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2285,7 +2285,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
if (group == 0)
seq_puts(seq, "#group: free frags first ["
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
- " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]");
+ " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index fb6f117..4098acc 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int tmp_data_size, data_size, replaced_size;
- int err2, jblocks, retries = 0;
+ int i, err2, jblocks, retries = 0;
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
+ struct buffer_head *bh = NULL;
/*
* It needs twice the amount of ordinary journal buffers because
@@ -380,8 +381,17 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- *err = __block_write_begin(pagep[0], from, replaced_size,
- ext4_get_block);
+ if (!page_has_buffers(pagep[0]))
+ create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
+ bh = page_buffers(pagep[0]);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+ for (i = 0; i < block_len_in_page; i++) {
+ *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
+ if (*err < 0)
+ break;
+ bh = bh->b_this_page;
+ }
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 06574dd..48e4b89 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
+ if (ext4_encrypted_inode(dir)) {
+ int res = ext4_get_encryption_info(dir);
+
+ /*
+ * This should be a properly defined flag for
+ * dentry->d_flags when we uplift this to the VFS.
+ * d_fsdata is set to (void *) 1 if if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ dentry->d_fsdata = NULL;
+ if (ext4_encryption_info(dir))
+ dentry->d_fsdata = (void *) 1;
+ d_set_d_op(dentry, &ext4_encrypted_d_ops);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
+
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-EFSCORRUPTED);
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
- (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!ext4_is_child_context_consistent_with_parent(dir,
inode)) {
+ int nokey = ext4_encrypted_inode(inode) &&
+ !ext4_encryption_info(inode);
+
iput(inode);
+ if (nokey)
+ return ERR_PTR(-ENOKEY);
ext4_warning(inode->i_sb,
"Inconsistent encryption contexts: %lu/%lu\n",
(unsigned long) dir->i_ino,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ad62d7a..34038e3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
goto out2;
flex_gd->count = flexbg_size;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6915c95..5c46ed9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -424,6 +427,8 @@ skip_switch:
iput(inode);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -433,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -469,7 +474,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
inode_to_wb(inode) == isw->new_wb) {
spin_unlock(&inode->i_lock);
goto out_free;
@@ -480,6 +486,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -840,6 +848,33 @@ restart:
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 506765a..bb8d67e 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -376,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
dnode_secno dno;
int r;
- int rep = 0;
int err;
hpfs_lock(dir->i_sb);
hpfs_adjust_length(name, &len);
-again:
+
err = -ENOENT;
de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
if (!de)
@@ -401,33 +400,9 @@ again:
hpfs_error(dir->i_sb, "there was error when removing dirent");
err = -EFSERROR;
break;
- case 2: /* no space for deleting, try to truncate file */
-
+ case 2: /* no space for deleting */
err = -ENOSPC;
- if (rep++)
- break;
-
- dentry_unhash(dentry);
- if (!d_unhashed(dentry)) {
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
- }
- if (generic_permission(inode, MAY_WRITE) ||
- !S_ISREG(inode->i_mode) ||
- get_write_access(inode)) {
- d_rehash(dentry);
- } else {
- struct iattr newattrs;
- /*pr_info("truncating file before delete.\n");*/
- newattrs.ia_size = 0;
- newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs, NULL);
- put_write_access(inode);
- if (!err)
- goto again;
- }
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
+ break;
default:
drop_nlink(inode);
err = 0;
diff --git a/fs/inode.c b/fs/inode.c
index 9f62db3..69b8b52 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_rdev = 0;
inode->dirtied_when = 0;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ inode->i_wb_frn_winner = 0;
+ inode->i_wb_frn_avg_time = 0;
+ inode->i_wb_frn_history = 0;
+#endif
+
if (security_inode_alloc(inode))
goto out;
spin_lock_init(&inode->i_lock);
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index 3ea3655..8918ac9 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -2,10 +2,6 @@
JFFS2 LOCKING DOCUMENTATION
---------------------------
-At least theoretically, JFFS2 does not require the Big Kernel Lock
-(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
-code. It has its own locking, as described below.
-
This document attempts to describe the existing locking rules for
JFFS2. It is not expected to remain perfectly up to date, but ought to
be fairly close.
@@ -69,6 +65,7 @@ Ordering constraints:
any f->sem held.
2. Never attempt to lock two file mutexes in one thread.
No ordering rules have been made for doing so.
+ 3. Never lock a page cache page with f->sem held.
erase_completion_lock spinlock
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 0ae91ad..b288c8a 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -50,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
- struct jffs2_inode_cache *ic)
+ struct jffs2_inode_cache *ic,
+ int *dir_hardlinks)
{
struct jffs2_full_dirent *fd;
@@ -69,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
fd->name, fd->ino, ic->ino);
jffs2_mark_node_obsolete(c, fd->raw);
+ /* Clear the ic/raw union so it doesn't cause problems later. */
+ fd->ic = NULL;
continue;
}
+ /* From this point, fd->raw is no longer used so we can set fd->ic */
+ fd->ic = child_ic;
+ child_ic->pino_nlink++;
+ /* If we appear (at this stage) to have hard-linked directories,
+ * set a flag to trigger a scan later */
if (fd->type == DT_DIR) {
- if (child_ic->pino_nlink) {
- JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
- fd->name, fd->ino, ic->ino);
- /* TODO: What do we do about it? */
- } else {
- child_ic->pino_nlink = ic->ino;
- }
- } else
- child_ic->pino_nlink++;
+ child_ic->flags |= INO_FLAGS_IS_DIR;
+ if (child_ic->pino_nlink > 1)
+ *dir_hardlinks = 1;
+ }
dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
/* Can't free scan_dents so far. We might need them in pass 2 */
@@ -95,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
*/
static int jffs2_build_filesystem(struct jffs2_sb_info *c)
{
- int ret;
- int i;
+ int ret, i, dir_hardlinks = 0;
struct jffs2_inode_cache *ic;
struct jffs2_full_dirent *fd;
struct jffs2_full_dirent *dead_fds = NULL;
@@ -120,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
/* Now scan the directory tree, increasing nlink according to every dirent found. */
for_each_inode(i, c, ic) {
if (ic->scan_dents) {
- jffs2_build_inode_pass1(c, ic);
+ jffs2_build_inode_pass1(c, ic, &dir_hardlinks);
cond_resched();
}
}
@@ -156,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
}
dbg_fsbuild("pass 2a complete\n");
+
+ if (dir_hardlinks) {
+ /* If we detected directory hardlinks earlier, *hopefully*
+ * they are gone now because some of the links were from
+ * dead directories which still had some old dirents lying
+ * around and not yet garbage-collected, but which have
+ * been discarded above. So clear the pino_nlink field
+ * in each directory, so that the final scan below can
+ * print appropriate warnings. */
+ for_each_inode(i, c, ic) {
+ if (ic->flags & INO_FLAGS_IS_DIR)
+ ic->pino_nlink = 0;
+ }
+ }
dbg_fsbuild("freeing temporary data structures\n");
/* Finally, we can scan again and free the dirent structs */
@@ -163,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
while(ic->scan_dents) {
fd = ic->scan_dents;
ic->scan_dents = fd->next;
+ /* We do use the pino_nlink field to count nlink of
+ * directories during fs build, so set it to the
+ * parent ino# now. Now that there's hopefully only
+ * one. */
+ if (fd->type == DT_DIR) {
+ if (!fd->ic) {
+ /* We'll have complained about it and marked the coresponding
+ raw node obsolete already. Just skip it. */
+ continue;
+ }
+
+ /* We *have* to have set this in jffs2_build_inode_pass1() */
+ BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR));
+
+ /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks
+ * is set. Otherwise, we know this should never trigger anyway, so
+ * we don't do the check. And ic->pino_nlink still contains the nlink
+ * value (which is 1). */
+ if (dir_hardlinks && fd->ic->pino_nlink) {
+ JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n",
+ fd->name, fd->ino, ic->ino, fd->ic->pino_nlink);
+ /* Should we unlink it from its previous parent? */
+ }
+
+ /* For directories, ic->pino_nlink holds that parent inode # */
+ fd->ic->pino_nlink = ic->ino;
+ }
jffs2_free_full_dirent(fd);
}
ic->scan_dents = NULL;
@@ -241,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
/* Reduce nlink of the child. If it's now zero, stick it on the
dead_fds list to be cleaned up later. Else just free the fd */
-
- if (fd->type == DT_DIR)
- child_ic->pino_nlink = 0;
- else
- child_ic->pino_nlink--;
+ child_ic->pino_nlink--;
if (!child_ic->pino_nlink) {
dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e..30c4c9e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index c5ac594..cad86ba 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- struct jffs2_raw_inode ri;
- uint32_t alloc_len = 0;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
- jffs2_dbg(1, "%s()\n", __func__);
-
- if (pageofs > inode->i_size) {
- ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
- ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
- if (ret)
- return ret;
- }
-
- mutex_lock(&f->sem);
pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg) {
- if (alloc_len)
- jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
+ if (!pg)
return -ENOMEM;
- }
*pagep = pg;
- if (alloc_len) {
+ jffs2_dbg(1, "%s()\n", __func__);
+
+ if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+ struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
+ uint32_t alloc_len;
jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
(unsigned int)inode->i_size, pageofs);
+ ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+ ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+ if (ret)
+ goto out_page;
+
+ mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (IS_ERR(fn)) {
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
+ mutex_unlock(&f->sem);
}
/*
@@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* case of a short-copy.
*/
if (!PageUptodate(pg)) {
+ mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
+ mutex_unlock(&f->sem);
if (ret)
goto out_page;
}
- mutex_unlock(&f->sem);
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
unlock_page(pg);
page_cache_release(pg);
- mutex_unlock(&f->sem);
return ret;
}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 5a2dec2..95d5880 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
BUG_ON(start > orig_start);
}
- /* First, use readpage() to read the appropriate page into the page cache */
- /* Q: What happens if we actually try to GC the _same_ page for which commit_write()
- * triggered garbage collection in the first place?
- * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
- * page OK. We'll actually write it out again in commit_write, which is a little
- * suboptimal, but at least we're correct.
- */
+ /* The rules state that we must obtain the page lock *before* f->sem, so
+ * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
+ * actually going to *change* so we're safe; we only allow reading.
+ *
+ * It is important to note that jffs2_write_begin() will ensure that its
+ * page is marked Uptodate before allocating space. That means that if we
+ * end up here trying to GC the *same* page that jffs2_write_begin() is
+ * trying to write out, read_cache_page() will not deadlock. */
+ mutex_unlock(&f->sem);
pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+ mutex_lock(&f->sem);
if (IS_ERR(pg_ptr)) {
pr_warn("read_cache_page() returned error: %ld\n",
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index fa35ff7..0637271 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -194,6 +194,7 @@ struct jffs2_inode_cache {
#define INO_STATE_CLEARING 6 /* In clear_inode() */
#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */
+#define INO_FLAGS_IS_DIR 0x02 /* is a directory */
#define RAWNODE_CLASS_INODE_CACHE 0
#define RAWNODE_CLASS_XATTR_DATUM 1
@@ -249,7 +250,10 @@ struct jffs2_readinode_info
struct jffs2_full_dirent
{
- struct jffs2_raw_node_ref *raw;
+ union {
+ struct jffs2_raw_node_ref *raw;
+ struct jffs2_inode_cache *ic; /* Just during part of build */
+ };
struct jffs2_full_dirent *next;
uint32_t version;
uint32_t ino; /* == zero for unlink */
diff --git a/fs/namei.c b/fs/namei.c
index f624d13..9c590e0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1712,6 +1712,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
return 0;
if (!follow)
return 0;
+ /* make sure that d_is_symlink above matches inode */
+ if (nd->flags & LOOKUP_RCU) {
+ if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ return -ECHILD;
+ }
return pick_link(nd, link, inode, seq);
}
@@ -1743,11 +1748,11 @@ static int walk_component(struct nameidata *nd, int flags)
if (err < 0)
return err;
- inode = d_backing_inode(path.dentry);
seq = 0; /* we are already out of RCU mode */
err = -ENOENT;
if (d_is_negative(path.dentry))
goto out_path_put;
+ inode = d_backing_inode(path.dentry);
}
if (flags & WALK_PUT)
@@ -3192,12 +3197,12 @@ retry_lookup:
return error;
BUG_ON(nd->flags & LOOKUP_RCU);
- inode = d_backing_inode(path.dentry);
seq = 0; /* out of RCU mode, so the value doesn't matter */
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
+ inode = d_backing_inode(path.dentry);
finish_lookup:
if (nd->depth)
put_link(nd);
@@ -3206,11 +3211,6 @@ finish_lookup:
if (unlikely(error))
return error;
- if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
- path_to_nameidata(&path, nd);
- return -ELOOP;
- }
-
if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
path_to_nameidata(&path, nd);
} else {
@@ -3229,6 +3229,10 @@ finish_open:
return error;
}
audit_inode(nd->name, nd->path.dentry, 0);
+ if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+ error = -ELOOP;
+ goto out;
+ }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3273,6 +3277,10 @@ opened:
goto exit_fput;
}
out:
+ if (unlikely(error > 0)) {
+ WARN_ON(1);
+ error = -EINVAL;
+ }
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 26c2de2..b7f8eae 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c59a59c..35ab51c 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
for (i = 0; i < nr_pages; i++)
put_page(arg->layoutupdate_pages[i]);
+ vfree(arg->start_p);
kfree(arg->layoutupdate_pages);
} else {
put_page(arg->layoutupdate_page);
@@ -559,10 +560,15 @@ retry:
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
void *p = start_p, *end = p + arg->layoutupdate_len;
+ struct page *page = NULL;
int i = 0;
- for ( ; p < end; p += PAGE_SIZE)
- arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+ arg->start_p = start_p;
+ for ( ; p < end; p += PAGE_SIZE) {
+ page = vmalloc_to_page(p);
+ arg->layoutupdate_pages[i++] = page;
+ get_page(page);
+ }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 5bcd92d..0cb1abd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1215,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
goto out_eagain;
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED,
&hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 29898a9..eb37046 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -412,7 +412,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
OP_ILLEGAL, GFP_NOIO);
if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED,
&lseg->pls_layout->plh_flags);
else
pnfs_error_mark_layout_for_return(ino, lseg);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index bd25dc7..dff8346 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -16,29 +16,8 @@
#define NFSDBG_FACILITY NFSDBG_PROC
-static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
- fmode_t fmode)
-{
- struct nfs_open_context *open;
- struct nfs_lock_context *lock;
- int ret;
-
- open = get_nfs_open_context(nfs_file_open_context(file));
- lock = nfs_get_lock_context(open);
- if (IS_ERR(lock)) {
- put_nfs_open_context(open);
- return PTR_ERR(lock);
- }
-
- ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
-
- nfs_put_lock_context(lock);
- put_nfs_open_context(open);
- return ret;
-}
-
static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
- loff_t offset, loff_t len)
+ struct nfs_lock_context *lock, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(filep);
struct nfs_server *server = NFS_SERVER(inode);
@@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
msg->rpc_argp = &args;
msg->rpc_resp = &res;
- status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+ status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+ lock, FMODE_WRITE);
if (status)
return status;
@@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
int err;
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
do {
- err = _nfs42_proc_fallocate(msg, filep, offset, len);
- if (err == -ENOTSUPP)
- return -EOPNOTSUPP;
+ err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
+ nfs_put_lock_context(lock);
return err;
}
@@ -135,7 +126,8 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return err;
}
-static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep,
+ struct nfs_lock_context *lock, loff_t offset, int whence)
{
struct inode *inode = file_inode(filep);
struct nfs42_seek_args args = {
@@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
if (!nfs_server_capable(inode, NFS_CAP_SEEK))
return -ENOTSUPP;
- status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+ status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+ lock, FMODE_READ);
if (status)
return status;
@@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
loff_t err;
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
do {
- err = _nfs42_proc_llseek(filep, offset, whence);
+ err = _nfs42_proc_llseek(filep, lock, offset, whence);
if (err >= 0)
break;
- if (err == -ENOTSUPP)
- return -EOPNOTSUPP;
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
+ nfs_put_lock_context(lock);
return err;
}
@@ -298,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
}
static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
- struct file *dst_f, loff_t src_offset,
- loff_t dst_offset, loff_t count)
+ struct file *dst_f, struct nfs_lock_context *src_lock,
+ struct nfs_lock_context *dst_lock, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
{
struct inode *src_inode = file_inode(src_f);
struct inode *dst_inode = file_inode(dst_f);
@@ -320,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
msg->rpc_argp = &args;
msg->rpc_resp = &res;
- status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
if (status)
return status;
- status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
if (status)
return status;
@@ -349,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
};
struct inode *inode = file_inode(src_f);
struct nfs_server *server = NFS_SERVER(file_inode(src_f));
- struct nfs4_exception exception = { };
- int err;
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ int err, err2;
if (!nfs_server_capable(inode, NFS_CAP_CLONE))
return -EOPNOTSUPP;
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src_f);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst_f);
+ dst_exception.state = dst_lock->open_context->state;
+
do {
- err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
- dst_offset, count);
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+ src_offset, dst_offset, count);
if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ break;
}
- err = nfs4_handle_exception(server, err, &exception);
- } while (exception.retry);
- return err;
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bfc33a..1488159 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2466,9 +2466,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dentry = d_add_unique(dentry, igrab(state->inode));
if (dentry == NULL) {
dentry = opendata->dentry;
- } else if (dentry != ctx->dentry) {
+ } else {
dput(ctx->dentry);
- ctx->dentry = dget(dentry);
+ ctx->dentry = dentry;
}
nfs_set_verifier(dentry,
nfs_save_change_attribute(d_inode(opendata->dir)));
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3592cc..2fa483e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
*/
static LIST_HEAD(pnfs_modules_tbl);
-static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
- enum pnfs_iomode iomode, bool sync);
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
@@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct inode *inode = lo->plh_inode;
+ pnfs_layoutreturn_before_put_layout_hdr(lo);
+
if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
if (!list_empty(&lo->plh_segs))
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
@@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ */
+static int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list)
+{
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+}
+
static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
@@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}
-/* Return true if layoutreturn is needed */
-static bool
-pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_segment *lseg)
-{
- struct pnfs_layout_segment *s;
-
- if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
- return false;
-
- list_for_each_entry(s, &lo->plh_segs, pls_list)
- if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
- return false;
-
- return true;
-}
-
-static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
-{
- if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- return false;
- lo->plh_return_iomode = 0;
- pnfs_get_layout_hdr(lo);
- clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
- return true;
-}
-
-static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
- struct pnfs_layout_hdr *lo, struct inode *inode)
-{
- lo = lseg->pls_layout;
- inode = lo->plh_inode;
-
- spin_lock(&inode->i_lock);
- if (pnfs_layout_need_return(lo, lseg)) {
- nfs4_stateid stateid;
- enum pnfs_iomode iomode;
- bool send;
-
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- iomode = lo->plh_return_iomode;
- send = pnfs_prepare_layoutreturn(lo);
- spin_unlock(&inode->i_lock);
- if (send) {
- /* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
- }
- } else
- spin_unlock(&inode->i_lock);
-}
-
void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
@@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- /* Handle the case where refcount != 1 */
- if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
- return;
-
lo = lseg->pls_layout;
inode = lo->plh_inode;
- /* Do we need a layoutreturn? */
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
- pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
@@ -613,9 +575,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
pnfs_get_layout_hdr(lo);
+ pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
spin_unlock(&nfsi->vfs_inode.i_lock);
@@ -676,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
{
struct pnfs_layout_hdr *lo;
struct inode *inode;
- struct pnfs_layout_range range = {
- .iomode = IOMODE_ANY,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
LIST_HEAD(lseg_list);
int ret = 0;
@@ -695,11 +651,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
- lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- if (is_bulk_recall)
- set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+ if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+ if (is_bulk_recall)
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
ret = -EAGAIN;
+ }
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&lseg_list);
/* Free all lsegs that are attached to commit buckets */
@@ -937,6 +893,17 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+ if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ return false;
+ lo->plh_return_iomode = 0;
+ pnfs_get_layout_hdr(lo);
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ return true;
+}
+
static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
@@ -971,6 +938,48 @@ out:
return status;
}
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *s;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return false;
+
+ /* Defer layoutreturn until all lsegs are done */
+ list_for_each_entry(s, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+ return false;
+ }
+
+ return true;
+}
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode= lo->plh_inode;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return;
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_need_return(lo)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+ bool send;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ iomode = lo->plh_return_iomode;
+ send = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (send) {
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ }
+ } else
+ spin_unlock(&inode->i_lock);
+}
+
/*
* Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
* when the layout segment list is empty.
@@ -1091,7 +1100,7 @@ bool pnfs_roc(struct inode *ino)
nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
&lo->plh_flags))
layoutreturn = pnfs_prepare_layoutreturn(lo);
@@ -1744,8 +1753,19 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
if (lo->plh_return_iomode != 0)
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
}
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describe layout segment ranges to be returned
+ *
+ * This function is mainly intended for use by layoutrecall. It attempts
+ * to free the layout segment immediately, or else to mark it for return
+ * as soon as its reference count drops to zero.
+ */
int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
@@ -1768,12 +1788,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
pnfs_set_plh_return_iomode(lo, return_range->iomode);
- if (!mark_lseg_invalid(lseg, tmp_list))
- remaining++;
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags);
}
return remaining;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9f4e2a4..1ac1db5 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,8 +94,8 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
- NFS_LAYOUT_RETURN, /* Return this layout ASAP */
- NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
+ NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
};
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index cfcbf11..7115c5d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,14 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
+#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
+
struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+
+static void fsnotify_mark_destroy(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -165,19 +172,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
atomic_dec(&group->num_marks);
}
-static void
-fsnotify_mark_free_rcu(struct rcu_head *rcu)
-{
- struct fsnotify_mark *mark;
-
- mark = container_of(rcu, struct fsnotify_mark, g_rcu);
- fsnotify_put_mark(mark);
-}
-
/*
- * Free fsnotify mark. The freeing is actually happening from a call_srcu
- * callback. Caller must have a reference to the mark or be protected by
- * fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
@@ -192,7 +190,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
/*
* Some groups like to know that marks are being freed. This is a
@@ -388,7 +390,12 @@ err:
spin_unlock(&mark->lock);
- call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+
return ret;
}
@@ -491,3 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
atomic_set(&mark->refcnt, 1);
mark->free_mark = free_mark;
}
+
+static void fsnotify_mark_destroy(struct work_struct *work)
+{
+ struct fsnotify_mark *mark, *next;
+ struct list_head private_destroy_list;
+
+ spin_lock(&destroy_lock);
+ /* exchange the list head */
+ list_replace_init(&destroy_list, &private_destroy_list);
+ spin_unlock(&destroy_lock);
+
+ synchronize_srcu(&fsnotify_mark_srcu);
+
+ list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+ list_del_init(&mark->g_list);
+ fsnotify_put_mark(mark);
+ }
+}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 794fd15..cda0361 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -956,6 +956,7 @@ clean_orphan:
tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
update_isize, end);
if (tmp_ret < 0) {
+ ocfs2_inode_unlock(inode, 1);
ret = tmp_ret;
mlog_errno(ret);
brelse(di_bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a3cc6d2..a76b9ea 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- kfree(o2hb_db_livenodes);
- kfree(o2hb_db_liveregions);
- kfree(o2hb_db_quorumregions);
- kfree(o2hb_db_failedregions);
debugfs_remove(o2hb_debug_failedregions);
debugfs_remove(o2hb_debug_quorumregions);
debugfs_remove(o2hb_debug_liveregions);
debugfs_remove(o2hb_debug_livenodes);
debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
}
static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1438,15 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slots);
- kfree(reg->hr_db_regnum);
- kfree(reg->hr_db_livenodes);
debugfs_remove(reg->hr_debug_livenodes);
debugfs_remove(reg->hr_debug_regnum);
debugfs_remove(reg->hr_debug_elapsed_time);
debugfs_remove(reg->hr_debug_pinned);
debugfs_remove(reg->hr_debug_dir);
+ kfree(reg->hr_db_livenodes);
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_debug_elapsed_time);
+ kfree(reg->hr_debug_pinned);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c5bdf02..b94a425 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ dead_node);
spin_unlock(&res->spinlock);
continue;
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190..77ebc2b 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index ed95272..52f6de5 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
+ if (!err)
+ d_drop(dentry);
inode_unlock(dir);
return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * Old dentry now lives in different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This could be only pure upper
+ * or opaque directory - numlower is zero. Or upper non-dir
+ * entry - its pureness is tracked by flag opaque.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 49e2045..a4ff5d0 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
}
ovl_drop_write(dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 8d826bd..619ad4b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location. Its purity depends only on opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = {
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
diff --git a/fs/pnode.c b/fs/pnode.c
index 6367e1e..c524fdd 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+ return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
static int propagate_one(struct mount *m)
{
struct mount *child;
@@ -212,7 +217,7 @@ static int propagate_one(struct mount *m)
/* skip if mountpoint isn't covered by it */
if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
return 0;
- if (m->mnt_group_id == last_dest->mnt_group_id) {
+ if (peers(m, last_dest)) {
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
@@ -223,7 +228,7 @@ static int propagate_one(struct mount *m)
last_source = last_source->mnt_master;
last_dest = last_source->mnt_parent;
}
- if (n->mnt_group_id != last_dest->mnt_group_id) {
+ if (!peers(n, last_dest)) {
last_source = last_source->mnt_master;
last_dest = last_source->mnt_parent;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85d16c6..fa95ab2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-static pid_t pid_of_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, bool is_pid)
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, int is_pid)
{
- struct inode *inode = priv->inode;
- struct task_struct *task;
- pid_t ret = 0;
+ int stack = 0;
+
+ if (is_pid) {
+ stack = vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+ } else {
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task) {
- task = task_of_stack(task, vma, is_pid);
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task)
- ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ stack = vma_is_stack_for_task(vma, task);
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- return ret;
+ return stack;
}
static void
@@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
name = arch_vma_name(vma);
if (!name) {
- pid_t tid;
-
if (!mm) {
name = "[vdso]";
goto done;
@@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- tid = pid_of_stack(priv, vma, is_pid);
- if (tid != 0) {
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack)) {
- name = "[stack]";
- } else {
- /* Thread stack in /proc/PID/maps */
- seq_pad(m, ' ');
- seq_printf(m, "[stack:%d]", tid);
- }
- }
+ if (is_stack(priv, vma, is_pid))
+ name = "[stack]";
}
done:
@@ -1552,18 +1543,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
+ pte_t huge_pte = huge_ptep_get(pte);
struct numa_maps *md;
struct page *page;
- if (!pte_present(*pte))
+ if (!pte_present(huge_pte))
return 0;
- page = pte_page(*pte);
+ page = pte_page(huge_pte);
if (!page)
return 0;
md = walk->private;
- gather_stats(page, md, pte_dirty(*pte), 1);
+ gather_stats(page, md, pte_dirty(huge_pte), 1);
return 0;
}
@@ -1617,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
- } else {
- pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
- if (tid != 0) {
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack))
- seq_puts(m, " stack");
- else
- seq_printf(m, " stack:%d", tid);
- }
+ } else if (is_stack(proc_priv, vma, is_pid)) {
+ seq_puts(m, " stack");
}
if (is_vm_hugetlb_page(vma))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index e0d64c9..faacb0c 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static pid_t pid_of_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, bool is_pid)
+static int is_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, int is_pid)
{
- struct inode *inode = priv->inode;
- struct task_struct *task;
- pid_t ret = 0;
-
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task) {
- task = task_of_stack(task, vma, is_pid);
+ struct mm_struct *mm = vma->vm_mm;
+ int stack = 0;
+
+ if (is_pid) {
+ stack = vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack;
+ } else {
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task)
- ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ stack = vma_is_stack_for_task(vma, task);
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- return ret;
+ return stack;
}
/*
@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm) {
- pid_t tid = pid_of_stack(priv, vma, is_pid);
-
- if (tid != 0) {
- seq_pad(m, ' ');
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack))
- seq_printf(m, "[stack]");
- else
- seq_printf(m, "[stack:%d]", tid);
- }
+ } else if (mm && is_stack(priv, vma, is_pid)) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/read_write.c b/fs/read_write.c
index 324ec27..dadf24e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -17,6 +17,7 @@
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
+#include <linux/fs.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -183,7 +184,7 @@ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_SET: case SEEK_CUR:
return generic_file_llseek_size(file, offset, whence,
- ~0ULL, 0);
+ OFFSET_MAX, 0);
default:
return -EINVAL;
}
@@ -1532,10 +1533,12 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
- (file_out->f_flags & O_APPEND) ||
- !file_in->f_op->clone_file_range)
+ (file_out->f_flags & O_APPEND))
return -EBADF;
+ if (!file_in->f_op->clone_file_range)
+ return -EOPNOTSUPP;
+
ret = clone_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
diff --git a/fs/super.c b/fs/super.c
index 1182af8..74914b1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5031170..66cdb44 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
goto out;
/*
+ * We don't do userfault handling for the final child pid update.
+ */
+ if (current->flags & PF_EXITING)
+ goto out;
+
+ /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
diff --git a/fs/xattr.c b/fs/xattr.c
index 07d0e47..4861322 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -940,7 +940,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
ssize_t remaining_size = size;
- int err;
+ int err = 0;
#ifdef CONFIG_FS_POSIX_ACL
if (inode->i_acl) {
@@ -965,11 +965,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
err = xattr_list_one(&buffer, &remaining_size, xattr->name);
if (err)
- return err;
+ break;
}
spin_unlock(&xattrs->lock);
- return size - remaining_size;
+ return err ? err : size - remaining_size;
}
/*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 379c089..a9ebabfe 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -55,7 +55,7 @@ xfs_count_page_state(
} while ((bh = bh->b_this_page) != head);
}
-STATIC struct block_device *
+struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
{
@@ -1208,6 +1208,10 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ if (dax_mapping(mapping))
+ return dax_writeback_mapping_range(mapping,
+ xfs_find_bdev_for_inode(mapping->host), wbc);
+
return generic_writepages(mapping, wbc);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f6ffc9a..a4343c6 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 45ec9e4..6c87601 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,7 +75,8 @@ xfs_zero_extent(
ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
if (IS_DAX(VFS_I(ip)))
- return dax_clear_blocks(VFS_I(ip), block, size);
+ return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
+ sector, size);
/*
* let the block layer decide on the fastest method of
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e7aa82f..be55688 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4556,7 +4556,7 @@ xlog_recover_process(
* know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc))
+ if (rhead->h_crc && crc != rhead->h_crc)
return -EFSBADCRC;
return 0;
}
@@ -4567,7 +4567,7 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != le32_to_cpu(rhead->h_crc)) {
+ if (crc != rhead->h_crc) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
OpenPOWER on IntegriCloud