diff options
author | Jeff Garzik <jeff@garzik.org> | 2006-03-24 12:29:39 -0500 |
---|---|---|
committer | Jeff Garzik <jeff@garzik.org> | 2006-03-24 12:29:39 -0500 |
commit | 4bbf7bc4c7bf1c80ec3c942fa5f1b6e6fa67dd99 (patch) | |
tree | c4ff89dc09abe69d58db1e14da22ecda9fdd3ce8 /mm | |
parent | 84ac69e8bf9f36eb0166817373336d14fa58f5cc (diff) | |
parent | aec5c3c1a929d7d79a420e943285cf3ba26a7c0d (diff) | |
download | op-kernel-dev-4bbf7bc4c7bf1c80ec3c942fa5f1b6e6fa67dd99.zip op-kernel-dev-4bbf7bc4c7bf1c80ec3c942fa5f1b6e6fa67dd99.tar.gz |
Merge branch 'upstream'
Conflicts:
drivers/scsi/libata-core.c
Diffstat (limited to 'mm')
-rw-r--r-- | mm/fadvise.c | 46 | ||||
-rw-r--r-- | mm/filemap.c | 41 | ||||
-rw-r--r-- | mm/highmem.c | 3 | ||||
-rw-r--r-- | mm/mempolicy.c | 32 | ||||
-rw-r--r-- | mm/msync.c | 137 | ||||
-rw-r--r-- | mm/page-writeback.c | 64 | ||||
-rw-r--r-- | mm/page_alloc.c | 3 | ||||
-rw-r--r-- | mm/readahead.c | 1 | ||||
-rw-r--r-- | mm/slab.c | 37 | ||||
-rw-r--r-- | mm/swapfile.c | 57 | ||||
-rw-r--r-- | mm/util.c | 37 |
11 files changed, 351 insertions, 107 deletions
diff --git a/mm/fadvise.c b/mm/fadvise.c index d257c89..907c392 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -15,6 +15,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/fadvise.h> +#include <linux/writeback.h> #include <linux/syscalls.h> #include <asm/unistd.h> @@ -22,13 +23,36 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. + * + * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file + * offsets `offset' and `offset+len' inclusive. Any pages which are currently + * under writeout are skipped, whether or not they are dirty. + * + * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file + * offsets `offset' and `offset+len'. + * + * By combining these two operations the application may do several things: + * + * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently + * dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push + * all of the currently dirty pages at the disk, wait until they have been + * written. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. 
*/ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; - loff_t endbyte; + loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; @@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) endbyte = offset + len; if (!len || endbyte < len) endbyte = -1; + else + endbyte--; /* inclusive */ bdi = mapping->backing_dev_info; @@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* First and last PARTIAL page! */ start_index = offset >> PAGE_CACHE_SHIFT; - end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; + end_index = endbyte >> PAGE_CACHE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; @@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) filemap_flush(mapping); /* First and last FULL page! 
*/ - start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index > start_index) - invalidate_mapping_pages(mapping, start_index, end_index-1); + if (end_index >= start_index) + invalidate_mapping_pages(mapping, start_index, + end_index); + break; + case LINUX_FADV_ASYNC_WRITE: + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); + break; + case LINUX_FADV_WRITE_WAIT: + ret = wait_on_page_writeback_range(mapping, + offset >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); break; default: ret = -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index e8f58f7..3ef2073 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,6 +29,7 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/cpuset.h> #include "filemap.h" #include "internal.h" @@ -174,7 +175,7 @@ static int sync_page(void *word) * dirty pages that lie within the byte offsets <start, end> * @mapping: address space structure to write * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends + * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as @@ -182,8 +183,8 @@ static int sync_page(void *word) * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. 
*/ -static int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode) +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) { int ret; struct writeback_control wbc = { @@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end) +static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } @@ -232,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush); * Wait for writeback to complete against pages indexed by start->end * inclusive */ -static int wait_on_page_writeback_range(struct address_space *mapping, +int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; @@ -367,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); +/* + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). 
+ */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { @@ -427,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, return ret; } +#ifdef CONFIG_NUMA +struct page *page_cache_alloc(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x), 0); + } + return alloc_pages(mapping_gfp_mask(x), 0); +} +EXPORT_SYMBOL(page_cache_alloc); + +struct page *page_cache_alloc_cold(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); + } + return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); +} +EXPORT_SYMBOL(page_cache_alloc_cold); +#endif + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of diff --git a/mm/highmem.c b/mm/highmem.c index ce2e7e8..d0ea1ee 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/blktrace_api.h> #include <asm/tlbflush.h> static mempool_t *page_pool, *isa_page_pool; @@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) pool = isa_page_pool; } + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + /* * slow path */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e93cc74..4f71cfd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } + +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. 
+ * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ + +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} + +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); +} + /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; @@ -9,20 +9,24 @@ */ #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/fs.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/hugetlb.h> +#include <linux/writeback.h> +#include <linux/file.h> #include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> -static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; spinlock_t *ptl; int progress = 0; + unsigned long ret = 0; again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -43,58 +47,64 @@ again: if (!page) continue; if (ptep_clear_flush_dirty(vma, addr, pte) || - page_test_and_clear_dirty(page)) - set_page_dirty(page); + page_test_and_clear_dirty(page)) + ret += set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); 
pte_unmap_unlock(pte - 1, ptl); cond_resched(); if (addr != end) goto again; + return ret; } -static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; + unsigned long ret = 0; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - msync_pte_range(vma, pmd, addr, next); + ret += msync_pte_range(vma, pmd, addr, next); } while (pmd++, addr = next, addr != end); + return ret; } -static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; + unsigned long ret = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - msync_pmd_range(vma, pud, addr, next); + ret += msync_pmd_range(vma, pud, addr, next); } while (pud++, addr = next, addr != end); + return ret; } -static void msync_page_range(struct vm_area_struct *vma, +static unsigned long msync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; + unsigned long ret = 0; /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). 
*/ if (vma->vm_flags & VM_HUGETLB) - return; + return 0; BUG_ON(addr >= end); pgd = pgd_offset(vma->vm_mm, addr); @@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - msync_pud_range(vma, pgd, addr, next); + ret += msync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); + return ret; } /* @@ -118,50 +129,28 @@ static void msync_page_range(struct vm_area_struct *vma, * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -static int msync_interval(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, int flags) +static int msync_interval(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, int flags, + unsigned long *nr_pages_dirtied) { - int ret = 0; struct file *file = vma->vm_file; if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) return -EBUSY; - if (file && (vma->vm_flags & VM_SHARED)) { - msync_page_range(vma, addr, end); - - if (flags & MS_SYNC) { - struct address_space *mapping = file->f_mapping; - int err; - - ret = filemap_fdatawrite(mapping); - if (file->f_op && file->f_op->fsync) { - /* - * We don't take i_mutex here because mmap_sem - * is already held. 
- */ - err = file->f_op->fsync(file,file->f_dentry,1); - if (err && !ret) - ret = err; - } - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; - } - } - return ret; + if (file && (vma->vm_flags & VM_SHARED)) + *nr_pages_dirtied = msync_page_range(vma, addr, end); + return 0; } asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; struct vm_area_struct *vma; - int unmapped_error, error = -EINVAL; - - if (flags & MS_SYNC) - current->flags |= PF_SYNCWRITE; + int unmapped_error = 0; + int error = -EINVAL; + int done = 0; - down_read(&current->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (start & ~PAGE_MASK) @@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ + down_read(&current->mm->mmap_sem); + if (flags & MS_SYNC) + current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); - unmapped_error = 0; - for (;;) { - /* Still start < end. */ + if (!vma) { error = -ENOMEM; - if (!vma) - goto out; + goto out_unlock; + } + do { + unsigned long nr_pages_dirtied = 0; + struct file *file; + /* Here start < vma->vm_end. */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; @@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) /* Here vma->vm_start <= start < vma->vm_end. */ if (end <= vma->vm_end) { if (start < end) { - error = msync_interval(vma, start, end, flags); + error = msync_interval(vma, start, end, flags, + &nr_pages_dirtied); if (error) - goto out; + goto out_unlock; } error = unmapped_error; - goto out; + done = 1; + } else { + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = msync_interval(vma, start, vma->vm_end, flags, + &nr_pages_dirtied); + if (error) + goto out_unlock; } - /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ - error = msync_interval(vma, start, vma->vm_end, flags); - if (error) - goto out; + file = vma->vm_file; start = vma->vm_end; - vma = vma->vm_next; - } -out: - up_read(&current->mm->mmap_sem); + if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { + get_file(file); + up_read(&current->mm->mmap_sem); + balance_dirty_pages_ratelimited_nr(file->f_mapping, + nr_pages_dirtied); + fput(file); + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, start); + } else if ((flags & MS_SYNC) && file && + (vma->vm_flags & VM_SHARED)) { + get_file(file); + up_read(&current->mm->mmap_sem); + error = do_fsync(file, 0); + fput(file); + down_read(&current->mm->mmap_sem); + if (error) + goto out_unlock; + vma = find_vma(current->mm, start); + } else { + vma = vma->vm_next; + } + } while (vma && !done); +out_unlock: current->flags &= ~PF_SYNCWRITE; + up_read(&current->mm->mmap_sem); +out: return error; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 945559f..893d767 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -75,12 +75,12 @@ int vm_dirty_ratio = 40; * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_interval = 5 * HZ; /* * The longest number of centiseconds for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_interval = 30 * HZ; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100; int block_dump; /* - * Flag that puts the machine in "laptop mode". + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. 
*/ int laptop_mode; @@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping) } /** - * balance_dirty_pages_ratelimited - balance dirty memory state + * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied + * @nr_pages: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(int, ratelimits) = 0; - long ratelimit; + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. 
*/ - if (get_cpu_var(ratelimits)++ >= ratelimit) { - __get_cpu_var(ratelimits) = 0; - put_cpu_var(ratelimits); + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); balance_dirty_pages(mapping); return; } - put_cpu_var(ratelimits); + preempt_enable(); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { @@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per dirty_writeback_centisecs. But if a writeback event - * takes longer than a dirty_writeback_centisecs interval, then leave a + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. 
So we'll only write back @@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); get_writeback_state(&wbs); - oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; - next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; + next_jif = start_jif + dirty_writeback_interval; nr_to_write = wbs.nr_dirty + wbs.nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { @@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg) } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; - if (dirty_writeback_centisecs) + if (dirty_writeback_interval) mod_timer(&wb_timer, next_jif); } @@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_centisecs) { + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) { mod_timer(&wb_timer, - jiffies + (dirty_writeback_centisecs * HZ) / 100); - } else { + jiffies + dirty_writeback_interval); + } else { del_timer(&wb_timer); } return 0; @@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused) */ void laptop_io_completion(void) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -544,7 +551,7 @@ void __init page_writeback_init(void) if (vm_dirty_ratio <= 0) vm_dirty_ratio = 1; } - mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } @@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { - int ret = 0; - if (!TestSetPageDirty(page)) { struct 
address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page) I_DIRTY_PAGES); } } + return 1; } - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page) return (*spd)(page); return __set_page_dirty_buffers(page); } - if (!PageDirty(page)) - SetPageDirty(page); + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } return 0; } EXPORT_SYMBOL(set_page_dirty); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7f14a4..a5c3f8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -943,7 +943,8 @@ restart: goto got_pg; do { - wakeup_kswapd(*z, order); + if (cpuset_zone_allowed(*z, gfp_mask)) + wakeup_kswapd(*z, order); } while (*(++z)); /* diff --git a/mm/readahead.c b/mm/readahead.c index 301b36c..0f142a4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -555,6 +555,7 @@ recheck: out: return ra->prev_page + 1; } +EXPORT_SYMBOL_GPL(page_cache_readahead); /* * handle_ra_miss() is called when it is known that a page which should have @@ -94,6 +94,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compiler.h> +#include <linux/cpuset.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/kallsyms.h> @@ -173,12 +174,12 @@ SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif /* @@ -898,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, #ifdef CONFIG_NUMA static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache 
**alloc_alien_cache(int node, int limit) { @@ -2807,11 +2809,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; #ifdef CONFIG_NUMA - if (unlikely(current->mempolicy && !in_interrupt())) { - int nid = slab_node(current->mempolicy); - - if (nid != numa_node_id()) - return __cache_alloc_node(cachep, flags, nid); + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + objp = alternate_node_alloc(cachep, flags); + if (objp != NULL) + return objp; } #endif @@ -2847,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, #ifdef CONFIG_NUMA /* + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt()) + return NULL; + nid_alloc = nid_here = numa_node_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_mem_spread_node(); + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); + if (nid_alloc != nid_here) + return __cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* * A interface to enable slab creation on nodeid */ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, diff --git a/mm/swapfile.c b/mm/swapfile.c index 365ed6f..39aa9d1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; struct swap_list_t swap_list = {-1, -1}; -struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry) } } +#ifdef CONFIG_SOFTWARE_SUSPEND +/* + * Find the swap type that corresponds to 
given device (if any) + * + * This is needed for software suspend and is done in such a way that inode + * aliasing is allowed. + */ +int swap_type_of(dev_t device) +{ + int i; + + spin_lock(&swap_lock); + for (i = 0; i < nr_swapfiles; i++) { + struct inode *inode; + + if (!(swap_info[i].flags & SWP_WRITEOK)) + continue; + if (!device) { + spin_unlock(&swap_lock); + return i; + } + inode = swap_info->swap_file->f_dentry->d_inode; + if (S_ISBLK(inode->i_mode) && + device == MKDEV(imajor(inode), iminor(inode))) { + spin_unlock(&swap_lock); + return i; + } + } + spin_unlock(&swap_lock); + return -ENODEV; +} + +/* + * Return either the total number of swap pages of given type, or the number + * of free pages of that type (depending on @free) + * + * This is needed for software suspend + */ +unsigned int count_swap_pages(int type, int free) +{ + unsigned int n = 0; + + if (type < nr_swapfiles) { + spin_lock(&swap_lock); + if (swap_info[type].flags & SWP_WRITEOK) { + n = swap_info[type].pages; + if (free) + n -= swap_info[type].inuse_pages; + } + spin_unlock(&swap_lock); + } + return n; +} +#endif + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -1,6 +1,8 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/err.h> +#include <asm/uaccess.h> /** * kzalloc - allocate memory. The memory is set to zero. @@ -37,3 +39,38 @@ char *kstrdup(const char *s, gfp_t gfp) return buf; } EXPORT_SYMBOL(kstrdup); + +/* + * strndup_user - duplicate an existing string from user space + * + * @s: The string to duplicate + * @n: Maximum number of bytes to copy, including the trailing NUL. 
+ */ +char *strndup_user(const char __user *s, long n) +{ + char *p; + long length; + + length = strnlen_user(s, n); + + if (!length) + return ERR_PTR(-EFAULT); + + if (length > n) + return ERR_PTR(-EINVAL); + + p = kmalloc(length, GFP_KERNEL); + + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, s, length)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + p[length - 1] = '\0'; + + return p; +} +EXPORT_SYMBOL(strndup_user); |