From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 1 + fs/btrfs/file.c | 2 +- fs/ceph/addr.c | 2 +- fs/cifs/file.c | 1 + fs/ext4/file.c | 2 +- fs/fuse/file.c | 1 + fs/gfs2/file.c | 2 +- fs/nfs/file.c | 1 + fs/nilfs2/file.c | 2 +- fs/ocfs2/mmap.c | 2 +- fs/ubifs/file.c | 1 + fs/xfs/xfs_file.c | 2 +- 12 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index dd6f7ee..c2483e9 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285..f6b40e8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1599,6 +1599,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e45..6690269 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1224,6 +1224,7 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = filemap_fault, .page_mkwrite = ceph_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1234,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7d7bbdc..edb25b4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3003,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = cifs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca6f07a..bf3966b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aba15f1..78d2837 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, .page_mkwrite = fuse_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 30e2199..0def050 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -492,6 +492,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .page_mkwrite = gfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /** @@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) return error; } vma->vm_ops = &gfs2_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6a7fcab..f692be9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -578,6 +578,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 5b387a4..16f35f7 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -135,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nilfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &nilfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index d150372..47a87dd 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,6 +173,7 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) @@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ff48c5a..5bc7781 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,6 +1536,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ubifs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1eaeb8b..aa473fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -940,7 +940,6 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; -- cgit v1.1 From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:59 -0700 Subject: mm: prepare VM_DONTDUMP for using in drivers Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags: VM_DONTEXPAND, VM_DONTCOPY. Currently this flag used only for sys_madvise. The next patch will use it for replacing the outdated flag VM_RESERVED. Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 28a64e7..2b72d26 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1123,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, if (always_dump_vma(vma)) goto whole; - if (vma->vm_flags & VM_NODUMP) + if (vma->vm_flags & VM_DONTDUMP) return 0; /* Hugetlb memory check */ -- cgit v1.1 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/proc/task_mmu.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2b72d26..e800dec 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1135,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (vma->vm_flags & VM_IO) return 0; /* By default, dump shared memory if mapped from an anonymous file. */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d812b..262db11 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) int dump_ok; /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + if (vma->vm_flags & VM_IO) { kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9460120..0a0ab8e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f..79827ce 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), -- cgit v1.1 From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 117 +-------------------------------------------------------- 1 file changed, 1 insertion(+), 116 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d295af99..ef5c84b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; -static ssize_t oom_adjust_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char buffer[PROC_NUMBUF]; - size_t len; - int oom_adjust = OOM_DISABLE; - unsigned long flags; - - if (!task) - return -ESRCH; - - if (lock_task_sighand(task, &flags)) { - oom_adjust = task->signal->oom_adj; - unlock_task_sighand(task, &flags); - } - - put_task_struct(task); - - len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); - - return simple_read_from_buffer(buf, count, ppos, buffer, len); -} - -static ssize_t oom_adjust_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF]; - int oom_adjust; - unsigned long flags; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) { - err = -EFAULT; - goto out; - } - - err = kstrtoint(strstrip(buffer), 0, &oom_adjust); - if (err) - goto out; - if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) { - err = -EINVAL; - goto out; - } - - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * Warn that /proc/pid/oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - task->signal->oom_adj = oom_adjust; - /* - * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum - * value is always attainable. - */ - if (task->signal->oom_adj == OOM_ADJUST_MAX) - task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; - else - task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / - -OOM_DISABLE; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); -out: - return err < 0 ? err : count; -} - -static const struct file_operations proc_oom_adjust_operations = { - .read = oom_adjust_read, - .write = oom_adjust_write, - .llseek = generic_file_llseek, -}; - static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; trace_oom_score_adj_update(task); - /* - * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is - * always attainable. - */ - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - task->signal->oom_adj = OOM_DISABLE; - else - task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / - OOM_SCORE_ADJ_MAX; + err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), -- cgit v1.1 From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84..fddc507 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } -- cgit v1.1 From ea5272f5c94fb2ee62f4f15a5b88eef6184cd506 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:35 -0700 Subject: rbtree: fix incorrect rbtree node insertion in fs/proc/proc_sysctl.c The recently added code to use rbtrees in sysctl did not follow the proper rbtree interface on insertion - it was calling rb_link_node() which inserts a new node into the binary tree, but missed the call to rb_insert_color() which properly balances the rbtree and establishes all expected rbtree invariants. I found out about this only because faulty commit also used rb_init_node(), which I am removing within this patchset. But I think it's an easy mistake to make, and it makes me wonder if we should change the rbtree API so that insertions would be done with a single rb_insert() call (even if its implementation could still inline the rb_link_node() part and call a private __rb_insert_color function to do the rebalancing). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fddc507..a781bdf 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) } rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); return 0; } -- cgit v1.1 From bf7ad8eeab995710c766df49c9c69a8592ca0216 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:37 -0700 Subject: rbtree: move some implementation details from rbtree.h to rbtree.c rbtree users must use the documented APIs to manipulate the tree structure. Low-level helpers to manipulate node colors and parenthood are not part of that API, so move them to lib/rbtree.c [dwmw2@infradead.org: fix jffs2 build issue due to renamed __rb_parent_color field] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/readinode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 1ea349f..ae81b01 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } /* Trivial function to remove the last node in the tree. Which by definition - has no right-hand -- so can be removed just by making its only child (if - any) take its place under its parent. */ + has no right-hand child — so can be removed just by making its left-hand + child (if any) take its place under its parent. Since this is only done + when we're consuming the whole tree, there's no need to use rb_erase() + and let it worry about adjusting colours and balancing the tree. That + would just be a waste of time. */ static void eat_last(struct rb_root *root, struct rb_node *node) { struct rb_node *parent = rb_parent(node); @@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node) link = &parent->rb_right; *link = node->rb_left; - /* Colour doesn't matter now. Only the parent pointer. */ if (node->rb_left) - node->rb_left->rb_parent_color = node->rb_parent_color; + node->rb_left->__rb_parent_color = node->__rb_parent_color; } -/* We put this in reverse order, so we can just use eat_last */ +/* We put the version tree in reverse order, so we can use the same eat_last() + function that we use to consume the tmpnode tree (tn_root). */ static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) { struct rb_node **link = &ver_root->rb_node; -- cgit v1.1 From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 9 ++++----- fs/inode.c | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0a0ab8e..c5bc355 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ @@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); diff --git a/fs/inode.c b/fs/inode.c index ac8d904..b03c719 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); -- cgit v1.1 From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:50 -0700 Subject: mm: avoid taking rmap locks in move_ptes() During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one. When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes(). Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 19f4fb8..4f2bebc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length)) + vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); -- cgit v1.1 From cd8ed2a45a401cb692d769e92de7d73aa42fabce Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Mon, 8 Oct 2012 16:33:45 -0700 Subject: fs/fs-writeback.c: remove unneccesary parameter of __writeback_single_inode() The parameter 'wb' is never used in this function. Signed-off-by: Yan Hong Acked-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e1d7b9..401b6c6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -439,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * setting I_SYNC flag and calling inode_sync_complete() to clear it. */ static int -__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +__writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; @@ -527,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); - ret = __writeback_single_inode(inode, wb, wbc); + ret = __writeback_single_inode(inode, wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); @@ -670,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb, * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ - __writeback_single_inode(inode, wb, &wbc); + __writeback_single_inode(inode, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; -- cgit v1.1 From 7a71932d5676b7410ab64d149bad8bde6b0d8632 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 8 Oct 2012 16:33:47 -0700 Subject: kpageflags: fix wrong KPF_THP on non-huge compound pages KPF_THP can be set on non-huge compound pages (like slab pages or pages allocated by drivers with __GFP_COMP) because PageTransCompound only checks PG_head and PG_tail. Obviously this is a bug and breaks user space applications which look for thp via /proc/kpageflags. This patch rules out setting KPF_THP wrongly by additionally checking PageLRU on the head pages. Signed-off-by: Naoya Horiguchi Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Reviewed-by: Fengguang Wu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 7fcd0d6..b8730d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; - else if (PageTransCompound(page)) + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU to make + * sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) u |= 1 << KPF_THP; /* -- cgit v1.1