diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/dmapool.c | 500 | ||||
-rw-r--r-- | mm/fadvise.c | 16 | ||||
-rw-r--r-- | mm/filemap.c | 44 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/fremap.c | 5 | ||||
-rw-r--r-- | mm/highmem.c | 4 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/memory.c | 196 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/migrate.c | 35 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/nommu.c | 53 | ||||
-rw-r--r-- | mm/oom_kill.c | 5 | ||||
-rw-r--r-- | mm/page-writeback.c | 24 | ||||
-rw-r--r-- | mm/page_alloc.c | 159 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/pagewalk.c | 131 | ||||
-rw-r--r-- | mm/rmap.c | 6 | ||||
-rw-r--r-- | mm/shmem.c | 495 | ||||
-rw-r--r-- | mm/slob.c | 51 | ||||
-rw-r--r-- | mm/slub.c | 182 | ||||
-rw-r--r-- | mm/sparse.c | 12 | ||||
-rw-r--r-- | mm/swap.c | 10 | ||||
-rw-r--r-- | mm/swap_state.c | 153 | ||||
-rw-r--r-- | mm/swapfile.c | 113 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 12 | ||||
-rw-r--r-- | mm/truncate.c | 10 | ||||
-rw-r--r-- | mm/vmalloc.c | 74 | ||||
-rw-r--r-- | mm/vmstat.c | 61 |
31 files changed, 1537 insertions, 840 deletions
diff --git a/mm/Makefile b/mm/Makefile index 5c0b0ea..4af5dff 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -13,8 +13,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o $(mmu-y) +obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff --git a/mm/dmapool.c b/mm/dmapool.c new file mode 100644 index 0000000..34aaac4 --- /dev/null +++ b/mm/dmapool.c @@ -0,0 +1,500 @@ +/* + * DMA Pool allocator + * + * Copyright 2001 David Brownell + * Copyright 2007 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 as published by the + * Free Software Foundation. + * + * This allocator returns small blocks of a given size which are DMA-able by + * the given device. It uses the dma_alloc_coherent page allocator to get + * new pages, then splits them up into blocks of the required size. + * Many older drivers still have their own code to do this. + * + * The current design of this allocator is fairly simple. The pool is + * represented by the 'struct dma_pool' which keeps a doubly-linked list of + * allocated pages. Each page in the page_list is split into blocks of at + * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked + * list of free blocks within the page. Used blocks aren't tracked, but we + * keep a count of how many are currently allocated from each page. + */ + +#include <linux/device.h> +#include <linux/dma-mapping.h> +#include <linux/dmapool.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/poison.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/wait.h> + +struct dma_pool { /* the pool */ + struct list_head page_list; + spinlock_t lock; + size_t size; + struct device *dev; + size_t allocation; + size_t boundary; + char name[32]; + wait_queue_head_t waitq; + struct list_head pools; +}; + +struct dma_page { /* cacheable header for 'allocation' bytes */ + struct list_head page_list; + void *vaddr; + dma_addr_t dma; + unsigned int in_use; + unsigned int offset; +}; + +#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) + +static DEFINE_MUTEX(pools_lock); + +static ssize_t +show_pools(struct device *dev, struct device_attribute *attr, char *buf) +{ + unsigned temp; + unsigned size; + char *next; + struct dma_page *page; + struct dma_pool *pool; + + next = buf; + size = PAGE_SIZE; + + temp = scnprintf(next, size, "poolinfo - 0.1\n"); + size -= temp; + next += temp; + + mutex_lock(&pools_lock); + list_for_each_entry(pool, &dev->dma_pools, pools) { + unsigned pages = 0; + unsigned blocks = 0; + + list_for_each_entry(page, &pool->page_list, page_list) { + pages++; + blocks += page->in_use; + } + + /* per-pool info, no real statistics yet */ + temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", + pool->name, blocks, + pages * (pool->allocation / pool->size), + pool->size, pages); + size -= temp; + next += temp; + } + mutex_unlock(&pools_lock); + + return PAGE_SIZE - size; +} + +static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL); + +/** + * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * @name: name of pool, for diagnostics + * @dev: device that will be doing the DMA + * @size: size of the blocks in this pool. + * @align: alignment requirement for blocks; must be a power of two + * @boundary: returned blocks won't cross this power of two boundary + * Context: !in_interrupt() + * + * Returns a dma allocation pool with the requested characteristics, or + * null if one can't be created. Given one of these pools, dma_pool_alloc() + * may be used to allocate memory. Such memory will all have "consistent" + * DMA mappings, accessible by the device and its driver without using + * cache flushing primitives. The actual size of blocks allocated may be + * larger than requested because of alignment. + * + * If @boundary is nonzero, objects returned from dma_pool_alloc() won't + * cross that size boundary. This is useful for devices which have + * addressing restrictions on individual DMA transfers, such as not crossing + * boundaries of 4KBytes. + */ +struct dma_pool *dma_pool_create(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary) +{ + struct dma_pool *retval; + size_t allocation; + + if (align == 0) { + align = 1; + } else if (align & (align - 1)) { + return NULL; + } + + if (size == 0) { + return NULL; + } else if (size < 4) { + size = 4; + } + + if ((size % align) != 0) + size = ALIGN(size, align); + + allocation = max_t(size_t, size, PAGE_SIZE); + + if (!boundary) { + boundary = allocation; + } else if ((boundary < size) || (boundary & (boundary - 1))) { + return NULL; + } + + retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + if (!retval) + return retval; + + strlcpy(retval->name, name, sizeof(retval->name)); + + retval->dev = dev; + + INIT_LIST_HEAD(&retval->page_list); + spin_lock_init(&retval->lock); + retval->size = size; + retval->boundary = boundary; + retval->allocation = allocation; + init_waitqueue_head(&retval->waitq); + + if (dev) { + int ret; + + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools)) + ret = device_create_file(dev, &dev_attr_pools); + else + ret = 0; + /* note: not currently insisting "name" be unique */ + if (!ret) + list_add(&retval->pools, &dev->dma_pools); + else { + kfree(retval); + retval = NULL; + } + mutex_unlock(&pools_lock); + } else + INIT_LIST_HEAD(&retval->pools); + + return retval; +} +EXPORT_SYMBOL(dma_pool_create); + +static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) +{ + unsigned int offset = 0; + unsigned int next_boundary = pool->boundary; + + do { + unsigned int next = offset + pool->size; + if (unlikely((next + pool->size) >= next_boundary)) { + next = next_boundary; + next_boundary += pool->boundary; + } + *(int *)(page->vaddr + offset) = next; + offset = next; + } while (offset < pool->allocation); +} + +static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) +{ + struct dma_page *page; + + page = kmalloc(sizeof(*page), mem_flags); + if (!page) + return NULL; + page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, + &page->dma, mem_flags); + if (page->vaddr) { +#ifdef CONFIG_DEBUG_SLAB + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +#endif + pool_initialise_page(pool, page); + list_add(&page->page_list, &pool->page_list); + page->in_use = 0; + page->offset = 0; + } else { + kfree(page); + page = NULL; + } + return page; +} + +static inline int is_page_busy(struct dma_page *page) +{ + return page->in_use != 0; +} + +static void pool_free_page(struct dma_pool *pool, struct dma_page *page) +{ + dma_addr_t dma = page->dma; + +#ifdef CONFIG_DEBUG_SLAB + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +#endif + dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); + list_del(&page->page_list); + kfree(page); +} + +/** + * dma_pool_destroy - destroys a pool of dma memory blocks. + * @pool: dma pool that will be destroyed + * Context: !in_interrupt() + * + * Caller guarantees that no more memory from the pool is in use, + * and that nothing will try to use the pool after this call. + */ +void dma_pool_destroy(struct dma_pool *pool) +{ + mutex_lock(&pools_lock); + list_del(&pool->pools); + if (pool->dev && list_empty(&pool->dev->dma_pools)) + device_remove_file(pool->dev, &dev_attr_pools); + mutex_unlock(&pools_lock); + + while (!list_empty(&pool->page_list)) { + struct dma_page *page; + page = list_entry(pool->page_list.next, + struct dma_page, page_list); + if (is_page_busy(page)) { + if (pool->dev) + dev_err(pool->dev, + "dma_pool_destroy %s, %p busy\n", + pool->name, page->vaddr); + else + printk(KERN_ERR + "dma_pool_destroy %s, %p busy\n", + pool->name, page->vaddr); + /* leak the still-in-use consistent memory */ + list_del(&page->page_list); + kfree(page); + } else + pool_free_page(pool, page); + } + + kfree(pool); +} +EXPORT_SYMBOL(dma_pool_destroy); + +/** + * dma_pool_alloc - get a block of consistent memory + * @pool: dma pool that will produce the block + * @mem_flags: GFP_* bitmask + * @handle: pointer to dma address of block + * + * This returns the kernel virtual address of a currently unused block, + * and reports its dma address through the handle. + * If such a memory block can't be allocated, %NULL is returned. + */ +void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, + dma_addr_t *handle) +{ + unsigned long flags; + struct dma_page *page; + size_t offset; + void *retval; + + spin_lock_irqsave(&pool->lock, flags); + restart: + list_for_each_entry(page, &pool->page_list, page_list) { + if (page->offset < pool->allocation) + goto ready; + } + page = pool_alloc_page(pool, GFP_ATOMIC); + if (!page) { + if (mem_flags & __GFP_WAIT) { + DECLARE_WAITQUEUE(wait, current); + + __set_current_state(TASK_INTERRUPTIBLE); + __add_wait_queue(&pool->waitq, &wait); + spin_unlock_irqrestore(&pool->lock, flags); + + schedule_timeout(POOL_TIMEOUT_JIFFIES); + + spin_lock_irqsave(&pool->lock, flags); + __remove_wait_queue(&pool->waitq, &wait); + goto restart; + } + retval = NULL; + goto done; + } + + ready: + page->in_use++; + offset = page->offset; + page->offset = *(int *)(page->vaddr + offset); + retval = offset + page->vaddr; + *handle = offset + page->dma; +#ifdef CONFIG_DEBUG_SLAB + memset(retval, POOL_POISON_ALLOCATED, pool->size); +#endif + done: + spin_unlock_irqrestore(&pool->lock, flags); + return retval; +} +EXPORT_SYMBOL(dma_pool_alloc); + +static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) +{ + unsigned long flags; + struct dma_page *page; + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry(page, &pool->page_list, page_list) { + if (dma < page->dma) + continue; + if (dma < (page->dma + pool->allocation)) + goto done; + } + page = NULL; + done: + spin_unlock_irqrestore(&pool->lock, flags); + return page; +} + +/** + * dma_pool_free - put block back into dma pool + * @pool: the dma pool holding the block + * @vaddr: virtual address of block + * @dma: dma address of block + * + * Caller promises neither device nor driver will again touch this block + * unless it is first re-allocated. + */ +void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) +{ + struct dma_page *page; + unsigned long flags; + unsigned int offset; + + page = pool_find_page(pool, dma); + if (!page) { + if (pool->dev) + dev_err(pool->dev, + "dma_pool_free %s, %p/%lx (bad dma)\n", + pool->name, vaddr, (unsigned long)dma); + else + printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n", + pool->name, vaddr, (unsigned long)dma); + return; + } + + offset = vaddr - page->vaddr; +#ifdef CONFIG_DEBUG_SLAB + if ((dma - page->dma) != offset) { + if (pool->dev) + dev_err(pool->dev, + "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pool->name, vaddr, (unsigned long long)dma); + else + printk(KERN_ERR + "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pool->name, vaddr, (unsigned long long)dma); + return; + } + { + unsigned int chain = page->offset; + while (chain < pool->allocation) { + if (chain != offset) { + chain = *(int *)(page->vaddr + chain); + continue; + } + if (pool->dev) + dev_err(pool->dev, "dma_pool_free %s, dma %Lx " + "already free\n", pool->name, + (unsigned long long)dma); + else + printk(KERN_ERR "dma_pool_free %s, dma %Lx " + "already free\n", pool->name, + (unsigned long long)dma); + return; + } + } + memset(vaddr, POOL_POISON_FREED, pool->size); +#endif + + spin_lock_irqsave(&pool->lock, flags); + page->in_use--; + *(int *)vaddr = page->offset; + page->offset = offset; + if (waitqueue_active(&pool->waitq)) + wake_up_locked(&pool->waitq); + /* + * Resist a temptation to do + * if (!is_page_busy(page)) pool_free_page(pool, page); + * Better have a few empty pages hang around. + */ + spin_unlock_irqrestore(&pool->lock, flags); +} +EXPORT_SYMBOL(dma_pool_free); + +/* + * Managed DMA pool + */ +static void dmam_pool_release(struct device *dev, void *res) +{ + struct dma_pool *pool = *(struct dma_pool **)res; + + dma_pool_destroy(pool); +} + +static int dmam_pool_match(struct device *dev, void *res, void *match_data) +{ + return *(struct dma_pool **)res == match_data; +} + +/** + * dmam_pool_create - Managed dma_pool_create() + * @name: name of pool, for diagnostics + * @dev: device that will be doing the DMA + * @size: size of the blocks in this pool. + * @align: alignment requirement for blocks; must be a power of two + * @allocation: returned blocks won't cross this boundary (or zero) + * + * Managed dma_pool_create(). DMA pool created with this function is + * automatically destroyed on driver detach. + */ +struct dma_pool *dmam_pool_create(const char *name, struct device *dev, + size_t size, size_t align, size_t allocation) +{ + struct dma_pool **ptr, *pool; + + ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + pool = *ptr = dma_pool_create(name, dev, size, align, allocation); + if (pool) + devres_add(dev, ptr); + else + devres_free(ptr); + + return pool; +} +EXPORT_SYMBOL(dmam_pool_create); + +/** + * dmam_pool_destroy - Managed dma_pool_destroy() + * @pool: dma pool that will be destroyed + * + * Managed dma_pool_destroy(). + */ +void dmam_pool_destroy(struct dma_pool *pool) +{ + struct device *dev = pool->dev; + + dma_pool_destroy(pool); + WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); +} +EXPORT_SYMBOL(dmam_pool_destroy); diff --git a/mm/fadvise.c b/mm/fadvise.c index 0df4c89..3c0f1e9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - if (mapping->a_ops->get_xip_page) - /* no bad return value, but ignore advice */ + if (mapping->a_ops->get_xip_page) { + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_NOREUSE: + case POSIX_FADV_DONTNEED: + /* no bad return value, but ignore advice */ + break; + default: + ret = -EINVAL; + } goto out; + } /* Careful about overflows. Len == 0 means "as much as possible" */ endbyte = offset + len; diff --git a/mm/filemap.c b/mm/filemap.c index f4d0cde..81fb9bf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -65,7 +65,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock - * ->zone.lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -185,6 +184,12 @@ static int sync_page(void *word) return 0; } +static int sync_page_killable(void *word) +{ + sync_page(word); + return fatal_signal_pending(current) ? -EINTR : 0; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -522,7 +527,7 @@ static inline void wake_up_page(struct page *page, int bit) __wake_up_bit(page_waitqueue(page), &page->flags, bit); } -void fastcall wait_on_page_bit(struct page *page, int bit_nr) +void wait_on_page_bit(struct page *page, int bit_nr) { DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); @@ -546,7 +551,7 @@ EXPORT_SYMBOL(wait_on_page_bit); * the clear_bit and the read of the waitqueue (to avoid SMP races with a * parallel wait_on_page_locked()). */ -void fastcall unlock_page(struct page *page) +void unlock_page(struct page *page) { smp_mb__before_clear_bit(); if (!TestClearPageLocked(page)) @@ -580,7 +585,7 @@ EXPORT_SYMBOL(end_page_writeback); * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ -void fastcall __lock_page(struct page *page) +void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); @@ -589,11 +594,19 @@ void fastcall __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); +int fastcall __lock_page_killable(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + return __wait_on_bit_lock(page_waitqueue(page), &wait, + sync_page_killable, TASK_KILLABLE); +} + /* * Variant of lock_page that does not require the caller to hold a reference * on the page's mapping. */ -void fastcall __lock_page_nosync(struct page *page) +void __lock_page_nosync(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, @@ -980,7 +993,8 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - lock_page(page); + if (lock_page_killable(page)) + goto readpage_eio; /* Did it get truncated before we got the lock? */ if (!page->mapping) { @@ -1008,7 +1022,8 @@ readpage: } if (!PageUptodate(page)) { - lock_page(page); + if (lock_page_killable(page)) + goto readpage_eio; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1019,15 +1034,16 @@ readpage: goto find_page; } unlock_page(page); - error = -EIO; shrink_readahead_size_eio(filp, ra); - goto readpage_error; + goto readpage_eio; } unlock_page(page); } goto page_ok; +readpage_eio: + error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; @@ -1260,7 +1276,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int fastcall page_cache_read(struct file * file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset) { struct address_space *mapping = file->f_mapping; struct page *page; @@ -1733,7 +1749,11 @@ static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) const struct iovec *iov = i->iov; size_t base = i->iov_offset; - while (bytes) { + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments. + */ + while (bytes || !iov->iov_len) { int copy = min(bytes, iov->iov_len - base); bytes -= copy; @@ -2251,6 +2271,7 @@ again: cond_resched(); + iov_iter_advance(i, copied); if (unlikely(copied == 0)) { /* * If we were unable to copy any data at all, we must @@ -2264,7 +2285,6 @@ again: iov_iter_single_seg_count(i)); goto again; } - iov_iter_advance(i, copied); pos += copied; written += copied; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index f874ae8..0420a02 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from) else return PTR_ERR(page); } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); return 0; } EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/fremap.c b/mm/fremap.c index 14bd3bf..69a37c2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, */ if (mapping_cap_account_dirty(mapping)) { unsigned long addr; + struct file *file = vma->vm_file; flags &= MAP_NONBLOCK; - addr = mmap_region(vma->vm_file, start, size, + get_file(file); + addr = mmap_region(file, start, size, flags, vma->vm_flags, pgoff, 1); + fput(file); if (IS_ERR_VALUE(addr)) { err = addr; } else { diff --git a/mm/highmem.c b/mm/highmem.c index 7a967bc..35d4773 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -163,7 +163,7 @@ start: return vaddr; } -void fastcall *kmap_high(struct page *page) +void *kmap_high(struct page *page) { unsigned long vaddr; @@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page) EXPORT_SYMBOL(kmap_high); -void fastcall kunmap_high(struct page *page) +void kunmap_high(struct page *page) { unsigned long vaddr; unsigned long nr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index db861d8..1a56420 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); copy_huge_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -858,6 +859,7 @@ retry: goto out; } clear_huge_page(page, address); + __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { int err; diff --git a/mm/internal.h b/mm/internal.h index 953f941..5a9a620 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v) */ static inline void set_page_refcounted(struct page *page) { - VM_BUG_ON(PageCompound(page) && PageTail(page)); + VM_BUG_ON(PageTail(page)); VM_BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } @@ -34,7 +34,7 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } -extern void fastcall __init __free_pages_bootmem(struct page *page, +extern void __init __free_pages_bootmem(struct page *page, unsigned int order); /* diff --git a/mm/memory.c b/mm/memory.c index d902d0e..7bb7072 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -305,7 +305,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) spin_lock(&mm->page_table_lock); if (pmd_present(*pmd)) { /* Another has populated it */ pte_lock_deinit(new); - pte_free(new); + pte_free(mm, new); } else { mm->nr_ptes++; inc_zone_page_state(new, NR_PAGETABLE); @@ -323,7 +323,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) spin_lock(&init_mm.page_table_lock); if (pmd_present(*pmd)) /* Another has populated it */ - pte_free_kernel(new); + pte_free_kernel(&init_mm, new); else pmd_populate_kernel(&init_mm, pmd, new); spin_unlock(&init_mm.page_table_lock); @@ -1109,7 +1109,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); -pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); @@ -1517,10 +1518,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo memset(kaddr, 0, PAGE_SIZE); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); - return; - - } - copy_user_highpage(dst, src, va, vma); + } else + copy_user_highpage(dst, src, va, vma); } /* @@ -1629,6 +1628,7 @@ gotten: if (!new_page) goto oom; cow_user_page(new_page, old_page, address, vma); + __SetPageUptodate(new_page); /* * Re-check the pte - we dropped the lock @@ -1909,50 +1909,49 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int vmtruncate(struct inode * inode, loff_t offset) { - struct address_space *mapping = inode->i_mapping; - unsigned long limit; + if (inode->i_size < offset) { + unsigned long limit; - if (inode->i_size < offset) - goto do_expand; - /* - * truncation of in-use swapfiles is disallowed - it would cause - * subsequent swapout to scribble on the now-freed blocks. - */ - if (IS_SWAPFILE(inode)) - goto out_busy; - i_size_write(inode, offset); + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + } else { + struct address_space *mapping = inode->i_mapping; + + /* + * truncation of in-use swapfiles is disallowed - it would + * cause subsequent swapout to scribble on the now-freed + * blocks. + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + i_size_write(inode, offset); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + } - /* - * unmap_mapping_range is called twice, first simply for efficiency - * so that truncate_inode_pages does fewer single-page unmaps. However - * after this first call, and before truncate_inode_pages finishes, - * it is possible for private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second unmap_mapping_range - * call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - -out_truncate: if (inode->i_op && inode->i_op->truncate) inode->i_op->truncate(inode); return 0; + out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; -out_busy: - return -ETXTBSY; } EXPORT_SYMBOL(vmtruncate); @@ -1980,67 +1979,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -/** - * swapin_readahead - swap in pages in hope we need them soon - * @entry: swap entry of this memory - * @addr: address to start - * @vma: user vma this addresses belong to - * - * Primitive swap readahead code. We simply read an aligned block of - * (1 << page_cluster) entries in the swap area. This method is chosen - * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... - * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. - */ -void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) -{ -#ifdef CONFIG_NUMA - struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; -#endif - int i, num; - struct page *new_page; - unsigned long offset; - - /* - * Get the number of handles we should do readahead io to. - */ - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - /* Ok, do the async read-ahead now */ - new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset), vma, addr); - if (!new_page) - break; - page_cache_release(new_page); -#ifdef CONFIG_NUMA - /* - * Find the next applicable VMA for the NUMA policy. - */ - addr += PAGE_SIZE; - if (addr == 0) - vma = NULL; - if (vma) { - if (addr >= vma->vm_end) { - vma = next_vma; - next_vma = vma ? vma->vm_next : NULL; - } - if (vma && addr < vma->vm_start) - vma = NULL; - } else { - if (next_vma && addr >= next_vma->vm_start) { - vma = next_vma; - next_vma = vma->vm_next; - } - } -#endif - } - lru_add_drain(); /* Push any new pages onto the LRU now */ -} - /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -2068,8 +2006,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page = lookup_swap_cache(entry); if (!page) { grab_swap_token(); /* Contend for token _before_ read-in */ - swapin_readahead(entry, address, vma); - page = read_swap_cache_async(entry, vma, address); + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { /* * Back out if somebody else faulted in this pte @@ -2163,6 +2101,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2263,6 +2202,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } copy_user_highpage(page, vmf.page, address, vma); + __SetPageUptodate(page); } else { /* * If the page will be shareable, see if the backing @@ -2563,7 +2503,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ - pud_free(new); + pud_free(mm, new); else pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); @@ -2585,12 +2525,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) /* Another has populated it */ - pmd_free(new); + pmd_free(mm, new); else pud_populate(mm, pud, new); #else if (pgd_present(*pud)) /* Another has populated it */ - pmd_free(new); + pmd_free(mm, new); else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ @@ -2618,46 +2558,6 @@ int make_pages_present(unsigned long addr, unsigned long end) return ret == len ? 0 : -1; } -/* - * Map a vmalloc()-space virtual address to the physical page. - */ -struct page * vmalloc_to_page(void * vmalloc_addr) -{ - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - - if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - } - } - } - return page; -} - -EXPORT_SYMBOL(vmalloc_to_page); - -/* - * Map a vmalloc()-space virtual address to the physical page frame number. - */ -unsigned long vmalloc_to_pfn(void * vmalloc_addr) -{ - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); -} - -EXPORT_SYMBOL(vmalloc_to_pfn); - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9512a54..7469c50 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -extern void drain_all_local_pages(void); - int offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { @@ -540,7 +538,7 @@ repeat: lru_add_drain_all(); flush_scheduled_work(); cond_resched(); - drain_all_local_pages(); + drain_all_pages(); } pfn = scan_lru_pages(start_pfn, end_pfn); @@ -563,7 +561,7 @@ repeat: flush_scheduled_work(); yield(); /* drain pcp pages , this is synchrouns. */ - drain_all_local_pages(); + drain_all_pages(); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { diff --git a/mm/migrate.c b/mm/migrate.c index 6a207e8..857a987 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *l) return count; } -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); -} - /* * Restore a potential migration pte to a working pte entry */ @@ -645,15 +640,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, rcu_read_lock(); rcu_locked = 1; } + /* - * This is a corner case handling. - * When a new swap-cache is read into, it is linked to LRU - * and treated as swapcache but has no rmap yet. - * Calling try_to_unmap() against a page->mapping==NULL page is - * BUG. So handle it here. + * Corner case handling: + * 1. When a new swap-cache page is read into, it is added to the LRU + * and treated as swapcache but it has no rmap yet. + * Calling try_to_unmap() against a page->mapping==NULL page will + * trigger a BUG. So handle it here. + * 2. An orphaned page (see truncate_complete_page) might have + * fs-private metadata. The page can be picked up due to memory + * offlining. Everywhere else except page reclaim, the page is + * invisible to the vm, so the page can not be migrated. So try to + * free the metadata, so the page can be freed. */ - if (!page->mapping) + if (!page->mapping) { + if (!PageAnon(page) && PagePrivate(page)) { + /* + * Go direct to try_to_free_buffers() here because + * a) that's what try_to_release_page() would do anyway + * b) we may be under rcu_read_lock() here, so we can't + * use GFP_KERNEL which is what try_to_release_page() + * needs to be effective. + */ + try_to_free_buffers(page); + } goto rcu_unlock; + } + /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); @@ -36,6 +36,10 @@ #define arch_mmap_check(addr, len, flags) (0) #endif +#ifndef arch_rebalance_pgtables +#define arch_rebalance_pgtables(addr, len) (addr) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr & ~PAGE_MASK) return -EINVAL; - return addr; + return arch_rebalance_pgtables(addr, len); } EXPORT_SYMBOL(get_unmapped_area); @@ -2216,7 +2220,7 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags; + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; @@ -10,6 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> + * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -void vfree(void *addr) +void vfree(const void *addr) { kfree(addr); } @@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -struct page * vmalloc_to_page(void *addr) +void *vmalloc_user(unsigned long size) +{ + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (ret) { + struct vm_area_struct *vma; + + down_write(¤t->mm->mmap_sem); + vma = find_vma(current->mm, (unsigned long)ret); + if (vma) + vma->vm_flags |= VM_USERMAP; + up_write(¤t->mm->mmap_sem); + } + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); } EXPORT_SYMBOL(vmalloc_to_page); -unsigned long vmalloc_to_pfn(void *addr) +unsigned long vmalloc_to_pfn(const void *addr) { return page_to_pfn(virt_to_page(addr)); } @@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32); * * The resulting memory area is 32bit addressable and zeroed so it can be * mapped to userspace without leaking data. + * + * VM_USERMAP is set on the corresponding VMA so that subsequent calls to + * remap_vmalloc_range() are permissible. */ void *vmalloc_32_user(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + /* + * We'll have to sort out the ZONE_DMA bits for 64-bit, + * but for now this can simply use vmalloc_user() directly. + */ + return vmalloc_user(size); } EXPORT_SYMBOL(vmalloc_32_user); @@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_ } EXPORT_SYMBOL(vmap); -void vunmap(void *addr) +void vunmap(const void *addr) { BUG(); } @@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, } EXPORT_SYMBOL(remap_pfn_range); +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + unsigned int size = vma->vm_end - vma->vm_start; + + if (!(vma->vm_flags & VM_USERMAP)) + return -EINVAL; + + vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); + vma->vm_end = vma->vm_start + size; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 96473b4..c1850bf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -125,8 +125,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || - p->uid == 0 || p->euid == 0) + if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -135,7 +134,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + if (__capable(p, CAP_SYS_RAWIO)) points /= 4; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3d3848f..5e00f17 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) int dirty_background_ratio = 5; /* + * free highmem will not be subtracted from the total free memory + * for calculating free ratios if vm_highmem_is_dirtyable is true + */ +int vm_highmem_is_dirtyable; + +/* * The generator of dirty data starts writeback at this percentage */ int vm_dirty_ratio = 10; @@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk, * * dirty -= (dirty/8) * p_{t} */ -void task_dirty_limit(struct task_struct *tsk, long *pdirty) +static void task_dirty_limit(struct task_struct *tsk, long *pdirty) { long numerator, denominator; long dirty = *pdirty; @@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void) x = global_page_state(NR_FREE_PAGES) + global_page_state(NR_INACTIVE) + global_page_state(NR_ACTIVE); - x -= highmem_dirtyable_memory(x); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + return x + 1; /* Ensure that we never return 0 */ } @@ -558,6 +567,7 @@ static void background_writeout(unsigned long _min_pages) global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; + wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; @@ -565,8 +575,9 @@ static void background_writeout(unsigned long _min_pages) min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - congestion_wait(WRITE, HZ/10); - if (!wbc.encountered_congestion) + if (wbc.encountered_congestion || wbc.more_io) + congestion_wait(WRITE, HZ/10); + else break; } } @@ -631,11 +642,12 @@ static void wb_kupdate(unsigned long arg) global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { + wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion) + if (wbc.encountered_congestion || wbc.more_io) congestion_wait(WRITE, HZ/10); else break; /* All the old data is written */ @@ -1064,7 +1076,7 @@ static int __set_page_dirty(struct page *page) return 0; } -int fastcall set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { int ret = __set_page_dirty(page); if (ret) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b2838c2..37576b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -537,7 +537,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) /* * permit the bootmem allocator to evade page validation on high-order frees */ -void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { if (order == 0) { __ClearPageReserved(page); @@ -890,31 +890,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } #endif -static void __drain_pages(unsigned int cpu) +/* + * Drain pages of the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages(unsigned int cpu) { unsigned long flags; struct zone *zone; - int i; for_each_zone(zone) { struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; if (!populated_zone(zone)) continue; pset = zone_pcp(zone, cpu); - for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { - struct per_cpu_pages *pcp; - - pcp = &pset->pcp[i]; - local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; - local_irq_restore(flags); - } + + pcp = &pset->pcp; + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); } } +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void *arg) +{ + drain_pages(smp_processor_id()); +} + +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator + */ +void drain_all_pages(void) +{ + on_each_cpu(drain_local_pages, NULL, 0, 1); +} + #ifdef CONFIG_HIBERNATION void mark_free_pages(struct zone *zone) @@ -952,40 +972,9 @@ void mark_free_pages(struct zone *zone) #endif /* CONFIG_PM */ /* - * Spill all of this CPU's per-cpu pages back into the buddy allocator. - */ -void drain_local_pages(void) -{ - unsigned long flags; - - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); -} - -void smp_drain_local_pages(void *arg) -{ - drain_local_pages(); -} - -/* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator - */ -void drain_all_local_pages(void) -{ - unsigned long flags; - - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); - - smp_call_function(smp_drain_local_pages, NULL, 0, 1); -} - -/* * Free a 0-order page */ -static void fastcall free_hot_cold_page(struct page *page, int cold) +static void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1001,10 +990,13 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); kernel_map_pages(page, 1, 0); - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp; local_irq_save(flags); __count_vm_event(PGFREE); - list_add(&page->lru, &pcp->list); + if (cold) + list_add_tail(&page->lru, &pcp->list); + else + list_add(&page->lru, &pcp->list); set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { @@ -1015,12 +1007,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) put_cpu(); } -void fastcall free_hot_page(struct page *page) +void free_hot_page(struct page *page) { free_hot_cold_page(page, 0); } -void fastcall free_cold_page(struct page *page) +void free_cold_page(struct page *page) { free_hot_cold_page(page, 1); } @@ -1062,7 +1054,7 @@ again: if (likely(order == 0)) { struct per_cpu_pages *pcp; - pcp = &zone_pcp(zone, cpu)->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp; local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, @@ -1072,9 +1064,15 @@ again: } /* Find a page of the appropriate migrate type */ - list_for_each_entry(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; + if (cold) { + list_for_each_entry_reverse(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + } else { + list_for_each_entry(page, &pcp->list, lru) + if (page_private(page) == migratetype) + break; + } /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { @@ -1569,7 +1567,7 @@ nofail_alloc: cond_resched(); if (order != 0) - drain_all_local_pages(); + drain_all_pages(); if (likely(did_some_progress)) { page = get_page_from_freelist(gfp_mask, order, @@ -1643,7 +1641,7 @@ EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. */ -fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page * page; page = alloc_pages(gfp_mask, order); @@ -1654,7 +1652,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) EXPORT_SYMBOL(__get_free_pages); -fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) +unsigned long get_zeroed_page(gfp_t gfp_mask) { struct page * page; @@ -1680,7 +1678,7 @@ void __pagevec_free(struct pagevec *pvec) free_hot_cold_page(pvec->pages[i], pvec->cold); } -fastcall void __free_pages(struct page *page, unsigned int order) +void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) @@ -1692,7 +1690,7 @@ fastcall void __free_pages(struct page *page, unsigned int order) EXPORT_SYMBOL(__free_pages); -fastcall void free_pages(unsigned long addr, unsigned int order) +void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); @@ -1801,12 +1799,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); - printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " - "Cold: hi:%5d, btch:%4d usd:%4d\n", - cpu, pageset->pcp[0].high, - pageset->pcp[0].batch, pageset->pcp[0].count, - pageset->pcp[1].high, pageset->pcp[1].batch, - pageset->pcp[1].count); + printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", + cpu, pageset->pcp.high, + pageset->pcp.batch, pageset->pcp.count); } } @@ -1879,6 +1874,8 @@ void show_free_areas(void) printk("= %lukB\n", K(total)); } + printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); + show_swap_cache_info(); } @@ -2551,8 +2548,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } } -static void __meminit zone_init_free_lists(struct pglist_data *pgdat, - struct zone *zone, unsigned long size) +static void __meminit zone_init_free_lists(struct zone *zone) { int order, t; for_each_migratetype_order(order, t) { @@ -2604,17 +2600,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) memset(p, 0, sizeof(*p)); - pcp = &p->pcp[0]; /* hot */ + pcp = &p->pcp; pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); - - pcp = &p->pcp[1]; /* cold*/ - pcp->count = 0; - pcp->high = 2 * batch; - pcp->batch = max(1UL, batch/2); - INIT_LIST_HEAD(&pcp->list); } /* @@ -2627,7 +2617,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, { struct per_cpu_pages *pcp; - pcp = &p->pcp[0]; /* hot list */ + pcp = &p->pcp; pcp->high = high; pcp->batch = max(1UL, high/4); if ((high/4) > (PAGE_SHIFT * 8)) @@ -2831,7 +2821,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); - zone_init_free_lists(pgdat, zone, zone->spanned_pages); + zone_init_free_lists(zone); return 0; } @@ -3978,10 +3968,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self, int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - local_irq_disable(); - __drain_pages(cpu); + drain_pages(cpu); + + /* + * Spill the event counters of the dead processor + * into the current processors event counters. + * This artificially elevates the count of the current + * processor. + */ vm_events_fold_cpu(cpu); - local_irq_enable(); + + /* + * Zero the differential counters of the dead processor + * so that the vm statistics are consistent. + * + * This is only okay since the processor is dead and cannot + * race with what we are doing. + */ refresh_cpu_vm_stats(cpu); } return NOTIFY_OK; @@ -4480,7 +4483,7 @@ int set_migratetype_isolate(struct page *page) out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) - drain_all_local_pages(); + drain_all_pages(); return ret; } diff --git a/mm/page_io.c b/mm/page_io.c index 3b97f68..065c448 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - ClearPageUptodate(page); + BUG_ON(PageUptodate(page)); bio = get_swap_bio(GFP_KERNEL, page_private(page), page, end_swap_bio_read); if (bio == NULL) { diff --git a/mm/pagewalk.c b/mm/pagewalk.c new file mode 100644 index 0000000..b4f27d2 --- /dev/null +++ b/mm/pagewalk.c @@ -0,0 +1,131 @@ +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> + +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pte_t *pte; + int err = 0; + + pte = pte_offset_map(pmd, addr); + do { + err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + pte_unmap(pte); + return err; +} + +static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pmd_t *pmd; + unsigned long next; + int err = 0; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, private); + if (err) + break; + continue; + } + if (walk->pmd_entry) + err = walk->pmd_entry(pmd, addr, next, private); + if (!err && walk->pte_entry) + err = walk_pte_range(pmd, addr, next, walk, private); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, private); + if (err) + break; + continue; + } + if (walk->pud_entry) + err = walk->pud_entry(pud, addr, next, private); + if (!err && (walk->pmd_entry || walk->pte_entry)) + err = walk_pmd_range(pud, addr, next, walk, private); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +/** + * walk_page_range - walk a memory map's page tables with a callback + * @mm - memory map to walk + * @addr - starting address + * @end - ending address + * @walk - set of callbacks to invoke for each level of the tree + * @private - private data passed to the callback function + * + * Recursively walk the page table for the memory area in a VMA, + * calling supplied callbacks. Callbacks are called in-order (first + * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, + * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * + * Each callback receives an entry pointer, the start and end of the + * associated range, and a caller-supplied private data pointer. + * + * No locks are taken, but the bottom level iterator will map PTE + * directories from highmem if necessary. + * + * If any callback returns a non-zero value, the walk is aborted and + * the return value is propagated back to the caller. Otherwise 0 is returned. + */ +int walk_page_range(const struct mm_struct *mm, + unsigned long addr, unsigned long end, + const struct mm_walk *walk, void *private) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + if (addr >= end) + return err; + + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, private); + if (err) + break; + continue; + } + if (walk->pgd_entry) + err = walk->pgd_entry(pgd, addr, next, private); + if (!err && + (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) + err = walk_pud_range(pgd, addr, next, walk, private); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} @@ -36,7 +36,6 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) - * zone->lock (within radix tree node alloc) */ #include <linux/mm.h> @@ -284,7 +283,10 @@ static int page_referenced_one(struct page *page, if (!pte) goto out; - if (ptep_clear_flush_young(vma, address, pte)) + if (vma->vm_flags & VM_LOCKED) { + referenced++; + *mapcount = 1; /* break early from loop */ + } else if (ptep_clear_flush_young(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -78,11 +78,10 @@ /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ enum sgp_type { - SGP_QUICK, /* don't try more than file page cache lookup */ SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ + SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate page */ - SGP_FAULT, /* same as SGP_CACHE, return with page locked */ }; static int shmem_getpage(struct inode *inode, unsigned long idx, @@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { }; static LIST_HEAD(shmem_swaplist); -static DEFINE_SPINLOCK(shmem_swaplist_lock); +static DEFINE_MUTEX(shmem_swaplist_mutex); static void shmem_free_blocks(struct inode *inode, long pages) { @@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages) } } +static int shmem_reserve_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + spin_lock(&sbinfo->stat_lock); + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + return 0; +} + +static void shmem_free_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + /* * shmem_recalc_inode - recalculate the size of an inode * @@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) (void) shmem_getpage(inode, attr->ia_size>>PAGE_CACHE_SHIFT, &page, SGP_READ, NULL); + if (page) + unlock_page(page); } /* * Reset SHMEM_PAGEIN flag so that shmem_truncate can @@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) static void shmem_delete_inode(struct inode *inode) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); if (inode->i_op->truncate == shmem_truncate) { @@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode) inode->i_size = 0; shmem_truncate(inode); if (!list_empty(&info->swaplist)) { - spin_lock(&shmem_swaplist_lock); + mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); - spin_unlock(&shmem_swaplist_lock); + mutex_unlock(&shmem_swaplist_mutex); } } BUG_ON(inode->i_blocks); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } + shmem_free_inode(inode->i_sb); clear_inode(inode); } @@ -807,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s struct page *subdir; swp_entry_t *ptr; int offset; + int error; idx = 0; ptr = info->i_direct; spin_lock(&info->lock); + if (!info->swapped) { + list_del_init(&info->swaplist); + goto lost2; + } limit = info->next_index; size = limit; if (size > SHMEM_NR_DIRECT) size = SHMEM_NR_DIRECT; offset = shmem_find_swp(entry, ptr, ptr+size); - if (offset >= 0) { - shmem_swp_balance_unmap(); + if (offset >= 0) goto found; - } if (!info->i_indirect) goto lost2; @@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { if (unlikely(idx == stage)) { shmem_dir_unmap(dir-1); + if (cond_resched_lock(&info->lock)) { + /* check it has not been truncated */ + if (limit > info->next_index) { + limit = info->next_index; + if (idx >= limit) + goto lost2; + } + } dir = shmem_dir_map(info->i_indirect) + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; while (!*dir) { @@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; offset = shmem_find_swp(entry, ptr, ptr+size); + shmem_swp_unmap(ptr); if (offset >= 0) { shmem_dir_unmap(dir); goto found; } - shmem_swp_unmap(ptr); } } lost1: @@ -863,19 +895,63 @@ lost2: return 0; found: idx += offset; - inode = &info->vfs_inode; - if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) { - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, ptr + offset, 0); - } - shmem_swp_unmap(ptr); + inode = igrab(&info->vfs_inode); spin_unlock(&info->lock); + /* - * Decrement swap count even when the entry is left behind: - * try_to_unuse will skip over mms, then reincrement count. + * Move _head_ to start search for next from here. + * But be careful: shmem_delete_inode checks list_empty without taking + * mutex, and there's an instant in list_move_tail when info->swaplist + * would appear empty, if it were the only one on shmem_swaplist. We + * could avoid doing it if inode NULL; or use this minor optimization. */ - swap_free(entry); - return 1; + if (shmem_swaplist.next != &info->swaplist) + list_move_tail(&shmem_swaplist, &info->swaplist); + mutex_unlock(&shmem_swaplist_mutex); + + error = 1; + if (!inode) + goto out; + error = radix_tree_preload(GFP_KERNEL); + if (error) + goto out; + error = 1; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, idx, NULL); + if (ptr && ptr->val == entry.val) + error = add_to_page_cache(page, inode->i_mapping, + idx, GFP_NOWAIT); + if (error == -EEXIST) { + struct page *filepage = find_get_page(inode->i_mapping, idx); + error = 1; + if (filepage) { + /* + * There might be a more uptodate page coming down + * from a stacked writepage: forget our swappage if so. + */ + if (PageUptodate(filepage)) + error = 0; + page_cache_release(filepage); + } + } + if (!error) { + delete_from_swap_cache(page); + set_page_dirty(page); + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, ptr, 0); + swap_free(entry); + error = 1; /* not an error, but entry was found */ + } + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); + radix_tree_preload_end(); +out: + unlock_page(page); + page_cache_release(page); + iput(inode); /* allows for NULL */ + return error; } /* @@ -887,20 +963,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page) struct shmem_inode_info *info; int found = 0; - spin_lock(&shmem_swaplist_lock); + mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(p, next, &shmem_swaplist) { info = list_entry(p, struct shmem_inode_info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); - else if (shmem_unuse_inode(info, entry, page)) { - /* move head to start search for next from here */ - list_move_tail(&shmem_swaplist, &info->swaplist); - found = 1; - break; - } + found = shmem_unuse_inode(info, entry, page); + cond_resched(); + if (found) + goto out; } - spin_unlock(&shmem_swaplist_lock); - return found; + mutex_unlock(&shmem_swaplist_mutex); +out: return found; /* 0 or 1 or -ENOMEM */ } /* @@ -915,54 +987,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode; BUG_ON(!PageLocked(page)); - /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * may use the ->writepage of its underlying filesystem, in which case - * we want to do nothing when that underlying filesystem is tmpfs - * (writing out to swap is useful as a response to memory pressure, but - * of no use to stabilize the data) - just redirty the page, unlock it - * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the - * page_mapped check below, must be avoided unless we're in reclaim. - */ - if (!wbc->for_reclaim) { - set_page_dirty(page); - unlock_page(page); - return 0; - } - BUG_ON(page_mapped(page)); - mapping = page->mapping; index = page->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) goto redirty; - swap = get_swap_page(); - if (!swap.val) + if (!total_swap_pages) goto redirty; + /* + * shmem_backing_dev_info's capabilities prevent regular writeback or + * sync from ever calling shmem_writepage; but a stacking filesystem + * may use the ->writepage of its underlying filesystem, in which case + * tmpfs should write out to swap only in response to memory pressure, + * and not for pdflush or sync. However, in those cases, we do still + * want to check if there's a redundant swappage to be discarded. + */ + if (wbc->for_reclaim) + swap = get_swap_page(); + else + swap.val = 0; + spin_lock(&info->lock); - shmem_recalc_inode(inode); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); goto unlock; } entry = shmem_swp_entry(info, index, NULL); - BUG_ON(!entry); - BUG_ON(entry->val); + if (entry->val) { + /* + * The more uptodate page coming down from a stacked + * writepage should replace our old swappage. + */ + free_swap_and_cache(*entry); + shmem_swp_set(info, entry, 0); + } + shmem_recalc_inode(inode); - if (move_to_swap_cache(page, swap) == 0) { + if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + remove_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); + if (list_empty(&info->swaplist)) + inode = igrab(inode); + else + inode = NULL; spin_unlock(&info->lock); - if (list_empty(&info->swaplist)) { - spin_lock(&shmem_swaplist_lock); + swap_duplicate(swap); + BUG_ON(page_mapped(page)); + page_cache_release(page); /* pagecache ref */ + set_page_dirty(page); + unlock_page(page); + if (inode) { + mutex_lock(&shmem_swaplist_mutex); /* move instead of add in case we're racing */ list_move_tail(&info->swaplist, &shmem_swaplist); - spin_unlock(&shmem_swaplist_lock); + mutex_unlock(&shmem_swaplist_mutex); + iput(inode); } - unlock_page(page); return 0; } @@ -972,7 +1055,10 @@ unlock: swap_free(swap); redirty: set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ + if (wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ + unlock_page(page); + return 0; } #ifdef CONFIG_NUMA @@ -1025,53 +1111,33 @@ out: return err; } -static struct page *shmem_swapin_async(struct shared_policy *p, - swp_entry_t entry, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { - struct page *page; struct vm_area_struct pvma; + struct page *page; /* Create a pseudo vma that just contains the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); - pvma.vm_end = PAGE_SIZE; + pvma.vm_start = 0; pvma.vm_pgoff = idx; - pvma.vm_policy = mpol_shared_policy_lookup(p, idx); - page = read_swap_cache_async(entry, &pvma, 0); + pvma.vm_ops = NULL; + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + page = swapin_readahead(entry, gfp, &pvma, 0); mpol_free(pvma.vm_policy); return page; } -static struct page *shmem_swapin(struct shmem_inode_info *info, - swp_entry_t entry, unsigned long idx) -{ - struct shared_policy *p = &info->policy; - int i, num; - struct page *page; - unsigned long offset; - - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - page = shmem_swapin_async(p, - swp_entry(swp_type(entry), offset), idx); - if (!page) - break; - page_cache_release(page); - } - lru_add_drain(); /* Push any new pages onto the LRU now */ - return shmem_swapin_async(p, entry, idx); -} - -static struct page * -shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, - unsigned long idx) +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { struct vm_area_struct pvma; struct page *page; - memset(&pvma, 0, sizeof(struct vm_area_struct)); - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + /* Create a pseudo vma that just contains the policy */ + pvma.vm_start = 0; pvma.vm_pgoff = idx; - pvma.vm_end = PAGE_SIZE; + pvma.vm_ops = NULL; + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); page = alloc_page_vma(gfp, &pvma, 0); mpol_free(pvma.vm_policy); return page; @@ -1083,15 +1149,14 @@ static inline int shmem_parse_mpol(char *value, int *policy, return 1; } -static inline struct page * -shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) +static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { - swapin_readahead(entry, 0, NULL); - return read_swap_cache_async(entry, NULL, 0); + return swapin_readahead(entry, gfp, NULL, 0); } -static inline struct page * -shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx) +static inline struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, unsigned long idx) { return alloc_page(gfp); } @@ -1114,6 +1179,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct page *swappage; swp_entry_t *entry; swp_entry_t swap; + gfp_t gfp; int error; if (idx >= SHMEM_MAX_INDEX) @@ -1126,7 +1192,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, * Normally, filepage is NULL on entry, and either found * uptodate immediately, or allocated and zeroed, or read * in under swappage, which is then assigned to filepage. - * But shmem_readpage and shmem_write_begin pass in a locked + * But shmem_readpage (required for splice) passes in a locked * filepage, which may be found not uptodate by other callers * too, and may need to be copied from the swappage read in. */ @@ -1136,8 +1202,17 @@ repeat: if (filepage && PageUptodate(filepage)) goto done; error = 0; - if (sgp == SGP_QUICK) - goto failed; + gfp = mapping_gfp_mask(mapping); + if (!filepage) { + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu. + */ + error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); + if (error) + goto failed; + radix_tree_preload_end(); + } spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1160,7 +1235,7 @@ repeat: *type |= VM_FAULT_MAJOR; } spin_unlock(&info->lock); - swappage = shmem_swapin(info, swap, idx); + swappage = shmem_swapin(swap, gfp, info, idx); if (!swappage) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); @@ -1218,23 +1293,21 @@ repeat: SetPageUptodate(filepage); set_page_dirty(filepage); swap_free(swap); - } else if (!(error = move_from_swap_cache( - swappage, idx, mapping))) { + } else if (!(error = add_to_page_cache( + swappage, mapping, idx, GFP_NOWAIT))) { info->flags |= SHMEM_PAGEIN; shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); + delete_from_swap_cache(swappage); spin_unlock(&info->lock); filepage = swappage; + set_page_dirty(filepage); swap_free(swap); } else { shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); page_cache_release(swappage); - if (error == -ENOMEM) { - /* let kswapd refresh zone for GFP_ATOMICs */ - congestion_wait(WRITE, HZ/50); - } goto repeat; } } else if (sgp == SGP_READ && !filepage) { @@ -1272,9 +1345,7 @@ repeat: if (!filepage) { spin_unlock(&info->lock); - filepage = shmem_alloc_page(mapping_gfp_mask(mapping), - info, - idx); + filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1291,7 +1362,7 @@ repeat: shmem_swp_unmap(entry); } if (error || swap.val || 0 != add_to_page_cache_lru( - filepage, mapping, idx, GFP_ATOMIC)) { + filepage, mapping, idx, GFP_NOWAIT)) { spin_unlock(&info->lock); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); @@ -1309,14 +1380,11 @@ repeat: clear_highpage(filepage); flush_dcache_page(filepage); SetPageUptodate(filepage); + if (sgp == SGP_DIRTY) + set_page_dirty(filepage); } done: - if (*pagep != filepage) { - *pagep = filepage; - if (sgp != SGP_FAULT) - unlock_page(filepage); - - } + *pagep = filepage; return 0; failed: @@ -1336,7 +1404,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1399,15 +1467,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return NULL; - } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); - } + if (shmem_reserve_inode(sb)) + return NULL; inode = new_inode(sb); if (inode) { @@ -1451,11 +1512,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) NULL); break; } - } else if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } + } else + shmem_free_inode(sb); return inode; } @@ -1494,123 +1552,30 @@ shmem_write_end(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + + unlock_page(page); set_page_dirty(page); page_cache_release(page); - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - return copied; } -static ssize_t -shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) -{ - struct inode *inode = file->f_path.dentry->d_inode; - loff_t pos; - unsigned long written; - ssize_t err; - - if ((ssize_t) count < 0) - return -EINVAL; - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - - mutex_lock(&inode->i_mutex); - - pos = *ppos; - written = 0; - - err = generic_write_checks(file, &pos, &count, 0); - if (err || !count) - goto out; - - err = remove_suid(file->f_path.dentry); - if (err) - goto out; - - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - - do { - struct page *page = NULL; - unsigned long bytes, index, offset; - char *kaddr; - int left; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - /* - * We don't hold page lock across copy from user - - * what would it guard against? - so no deadlock here. - * But it still may be a good idea to prefault below. - */ - - err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL); - if (err) - break; - - left = bytes; - if (PageHighMem(page)) { - volatile unsigned char dummy; - __get_user(dummy, buf); - __get_user(dummy, buf + bytes - 1); - - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic(kaddr + offset, - buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - } - if (left) { - kaddr = kmap(page); - left = __copy_from_user(kaddr + offset, buf, bytes); - kunmap(page); - } - - written += bytes; - count -= bytes; - pos += bytes; - buf += bytes; - if (pos > inode->i_size) - i_size_write(inode, pos); - - flush_dcache_page(page); - set_page_dirty(page); - mark_page_accessed(page); - page_cache_release(page); - - if (left) { - pos -= left; - written -= left; - err = -EFAULT; - break; - } - - /* - * Our dirty pages are not counted in nr_dirty, - * and we do not attempt to balance dirty pages. - */ - - cond_resched(); - } while (count); - - *ppos = pos; - if (written) - err = written; -out: - mutex_unlock(&inode->i_mutex); - return err; -} - static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned long index, offset; + enum sgp_type sgp = SGP_READ; + + /* + * Might this read be for a stacking filesystem? Then when reading + * holes of a sparse file, we actually need to allocate those pages, + * and even mark them dirty, so it cannot exceed the max_blocks limit. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + sgp = SGP_DIRTY; index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; @@ -1629,12 +1594,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); + desc->error = shmem_getpage(inode, index, &page, sgp, NULL); if (desc->error) { if (desc->error == -EINVAL) desc->error = 0; break; } + if (page) + unlock_page(page); /* * We must evaluate after, since reads (unlike writes) @@ -1798,22 +1765,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. */ - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; - } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); - } + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -1821,21 +1782,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr atomic_inc(&inode->i_count); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); - return 0; +out: + return ret; } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; - if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } - } + if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) + shmem_free_inode(inode->i_sb); dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -1924,6 +1880,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } + unlock_page(page); inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); @@ -1951,6 +1908,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) struct page *page = NULL; int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + if (page) + unlock_page(page); return page; } @@ -1996,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return security_inode_getsecurity(inode, name, buffer, size, - -EOPNOTSUPP); + return xattr_getsecurity(inode, name, buffer, size); } static int shmem_xattr_security_set(struct inode *inode, const char *name, @@ -2138,7 +2096,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, } if (*rest) goto bad_val; - *blocks = size >> PAGE_CACHE_SHIFT; + *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE); } else if (!strcmp(this_char,"nr_blocks")) { *blocks = memparse(value,&rest); if (*rest) @@ -2375,7 +2333,8 @@ static const struct file_operations shmem_file_operations = { #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, .read = shmem_file_read, - .write = shmem_file_write, + .write = do_sync_write, + .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, @@ -12,10 +12,17 @@ * allocator is as little as 2 bytes, however typically most architectures * will require 4 bytes on 32-bit and 8 bytes on 64-bit. * - * The slob heap is a linked list of pages from alloc_pages(), and - * within each page, there is a singly-linked list of free blocks (slob_t). - * The heap is grown on demand and allocation from the heap is currently - * first-fit. + * The slob heap is a set of linked list of pages from alloc_pages(), + * and within each page, there is a singly-linked list of free blocks + * (slob_t). The heap is grown on demand. To reduce fragmentation, + * heap pages are segregated into three lists, with objects less than + * 256 bytes, objects less than 1024 bytes, and all other objects. + * + * Allocation from heap involves first searching for a page with + * sufficient free blocks (using a next-fit-like approach) followed by + * a first-fit scan of the page. Deallocation inserts objects back + * into the free list in address order, so this is effectively an + * address-ordered first fit. * * Above this is an implementation of kmalloc/kfree. Blocks returned * from kmalloc are prepended with a 4-byte header with the kmalloc size. @@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp) } /* - * All (partially) free slob pages go on this list. + * All partially free slob pages go on these lists. */ -static LIST_HEAD(free_slob_pages); +#define SLOB_BREAK1 256 +#define SLOB_BREAK2 1024 +static LIST_HEAD(free_slob_small); +static LIST_HEAD(free_slob_medium); +static LIST_HEAD(free_slob_large); /* * slob_page: True for all slob pages (false for bigblock pages) @@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp) return test_bit(PG_private, &sp->flags); } -static inline void set_slob_page_free(struct slob_page *sp) +static void set_slob_page_free(struct slob_page *sp, struct list_head *list) { - list_add(&sp->list, &free_slob_pages); + list_add(&sp->list, list); __set_bit(PG_private, &sp->flags); } @@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { struct slob_page *sp; struct list_head *prev; + struct list_head *slob_list; slob_t *b = NULL; unsigned long flags; + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, &free_slob_pages, list) { + list_for_each_entry(sp, slob_list, list) { #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) /* Improve fragment distribution and reduce our average * search time by starting our next search here. (see * Knuth vol 1, sec 2.5, pg 449) */ - if (prev != free_slob_pages.prev && - free_slob_pages.next != prev->next) - list_move_tail(&free_slob_pages, prev->next); + if (prev != slob_list->prev && + slob_list->next != prev->next) + list_move_tail(slob_list, prev->next); break; } spin_unlock_irqrestore(&slob_lock, flags); @@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) sp->free = b; INIT_LIST_HEAD(&sp->list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); - set_slob_page_free(sp); + set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); @@ -387,7 +406,7 @@ static void slob_free(void *block, int size) set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); - set_slob_page_free(sp); + set_slob_page_free(sp, &free_slob_small); goto out; } @@ -398,6 +417,10 @@ static void slob_free(void *block, int size) sp->units += units; if (b < sp->free) { + if (b + units == sp->free) { + units += slob_units(sp->free); + sp->free = slob_next(sp->free); + } set_slob(b, units, sp->free); sp->free = b; } else { @@ -247,7 +247,10 @@ static void sysfs_slab_remove(struct kmem_cache *); static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) {} +static inline void sysfs_slab_remove(struct kmem_cache *s) +{ + kfree(s); +} #endif /******************************************************************** @@ -354,22 +357,22 @@ static void print_section(char *text, u8 *addr, unsigned int length) printk(KERN_ERR "%8s 0x%p: ", text, addr + i); newline = 0; } - printk(" %02x", addr[i]); + printk(KERN_CONT " %02x", addr[i]); offset = i % 16; ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; if (offset == 15) { - printk(" %s\n",ascii); + printk(KERN_CONT " %s\n", ascii); newline = 1; } } if (!newline) { i %= 16; while (i < 16) { - printk(" "); + printk(KERN_CONT " "); ascii[i] = ' '; i++; } - printk(" %s\n", ascii); + printk(KERN_CONT " %s\n", ascii); } } @@ -529,7 +532,7 @@ static void init_object(struct kmem_cache *s, void *object, int active) if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->objsize - 1); - p[s->objsize -1] = POISON_END; + p[s->objsize - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) @@ -558,7 +561,7 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *object, char *what, - u8* start, unsigned int value, unsigned int bytes) + u8 *start, unsigned int value, unsigned int bytes) { u8 *fault; u8 *end; @@ -692,7 +695,7 @@ static int check_object(struct kmem_cache *s, struct page *page, (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->objsize - 1) || !check_bytes_and_report(s, page, p, "Poison", - p + s->objsize -1, POISON_END, 1))) + p + s->objsize - 1, POISON_END, 1))) return 0; /* * check_pad_bytes cleans up on its own. @@ -900,8 +903,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, "SLUB <none>: no slab for object 0x%p.\n", object); dump_stack(); - } - else + } else object_err(s, page, object, "page slab pointer corrupt."); goto fail; @@ -947,7 +949,7 @@ static int __init setup_slub_debug(char *str) /* * Determine which debug features should be switched on */ - for ( ;*str && *str != ','; str++) { + for (; *str && *str != ','; str++) { switch (tolower(*str)) { case 'f': slub_debug |= SLAB_DEBUG_FREE; @@ -966,7 +968,7 @@ static int __init setup_slub_debug(char *str) break; default: printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n",*str); + "unknown. skipped\n", *str); } } @@ -1039,7 +1041,7 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, */ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { - struct page * page; + struct page *page; int pages = 1 << s->order; if (s->order) @@ -1135,7 +1137,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) mod_zone_page_state(page_zone(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - - pages); + -pages); __free_pages(page, s->order); } @@ -1195,19 +1197,15 @@ static __always_inline int slab_trylock(struct page *page) /* * Management of partially allocated slabs */ -static void add_partial_tail(struct kmem_cache_node *n, struct page *page) -{ - spin_lock(&n->list_lock); - n->nr_partial++; - list_add_tail(&page->lru, &n->partial); - spin_unlock(&n->list_lock); -} - -static void add_partial(struct kmem_cache_node *n, struct page *page) +static void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) { spin_lock(&n->list_lock); n->nr_partial++; - list_add(&page->lru, &n->partial); + if (tail) + list_add_tail(&page->lru, &n->partial); + else + list_add(&page->lru, &n->partial); spin_unlock(&n->list_lock); } @@ -1292,7 +1290,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) * expensive if we do it every time we are trying to find a slab * with available objects. */ - if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; zonelist = &NODE_DATA(slab_node(current->mempolicy)) @@ -1335,7 +1334,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) * * On exit the slab lock will have been dropped. */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page) +static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1343,7 +1342,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) if (page->inuse) { if (page->freelist) - add_partial(n, page); + add_partial(n, page, tail); else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) add_full(n, page); slab_unlock(page); @@ -1358,7 +1357,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) * partial list stays small. kmem_cache_shrink can * reclaim empty slabs from the partial list. */ - add_partial_tail(n, page); + add_partial(n, page, 1); slab_unlock(page); } else { slab_unlock(page); @@ -1373,6 +1372,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { struct page *page = c->page; + int tail = 1; /* * Merge cpu freelist into freelist. Typically we get here * because both freelists are empty. So this is unlikely @@ -1381,6 +1381,8 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) while (unlikely(c->freelist)) { void **object; + tail = 0; /* Hot objects. Put the slab first */ + /* Retrieve object from cpu_freelist */ object = c->freelist; c->freelist = c->freelist[c->offset]; @@ -1391,7 +1393,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) page->inuse--; } c->page = NULL; - unfreeze_slab(s, page); + unfreeze_slab(s, page, tail); } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -1539,7 +1541,7 @@ debug: * * Otherwise we can simply pick the next object from the lockless free list. */ -static void __always_inline *slab_alloc(struct kmem_cache *s, +static __always_inline void *slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void *addr) { void **object; @@ -1613,7 +1615,7 @@ checks_ok: * then add it. */ if (unlikely(!prior)) - add_partial_tail(get_node(s, page_to_nid(page)), page); + add_partial(get_node(s, page_to_nid(page)), page, 1); out_unlock: slab_unlock(page); @@ -1647,7 +1649,7 @@ debug: * If fastpath is not possible then fall back to __slab_free where we deal * with all sorts of special processing. */ -static void __always_inline slab_free(struct kmem_cache *s, +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *x, void *addr) { void **object = (void *)x; @@ -1997,6 +1999,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, { struct page *page; struct kmem_cache_node *n; + unsigned long flags; BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); @@ -2021,7 +2024,14 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, #endif init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); - add_partial(n, page); + /* + * lockdep requires consistent irq usage for each lock + * so even though there cannot be a race this early in + * the boot sequence, we still disable irqs. + */ + local_irq_save(flags); + add_partial(n, page, 0); + local_irq_restore(flags); return n; } @@ -2206,7 +2216,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->refcount = 1; #ifdef CONFIG_NUMA - s->defrag_ratio = 100; + s->remote_node_defrag_ratio = 100; #endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2228,7 +2238,7 @@ error: */ int kmem_ptr_validate(struct kmem_cache *s, const void *object) { - struct page * page; + struct page *page; page = get_object_page(object); @@ -2322,7 +2332,6 @@ void kmem_cache_destroy(struct kmem_cache *s) if (kmem_cache_close(s)) WARN_ON(1); sysfs_slab_remove(s); - kfree(s); } else up_write(&slub_lock); } @@ -2341,7 +2350,7 @@ static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; static int __init setup_slub_min_order(char *str) { - get_option (&str, &slub_min_order); + get_option(&str, &slub_min_order); return 1; } @@ -2350,7 +2359,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { - get_option (&str, &slub_max_order); + get_option(&str, &slub_max_order); return 1; } @@ -2359,7 +2368,7 @@ __setup("slub_max_order=", setup_slub_max_order); static int __init setup_slub_min_objects(char *str) { - get_option (&str, &slub_min_objects); + get_option(&str, &slub_min_objects); return 1; } @@ -2605,6 +2614,19 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +static unsigned long count_partial(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += page->inuse; + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + /* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the @@ -2931,7 +2953,7 @@ static struct kmem_cache *find_mergeable(size_t size, * Check if alignment is compatible. * Courtesy of Adrian Drzewiecki */ - if ((s->size & ~(align -1)) != s->size) + if ((s->size & ~(align - 1)) != s->size) continue; if (s->size - size >= sizeof(void *)) @@ -3040,8 +3062,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata slab_notifier = - { &slab_cpuup_callback, NULL, 0 }; +static struct notifier_block __cpuinitdata slab_notifier = { + &slab_cpuup_callback, NULL, 0 +}; #endif @@ -3076,19 +3099,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return slab_alloc(s, gfpflags, node, caller); } -static unsigned long count_partial(struct kmem_cache_node *n) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += page->inuse; - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} - #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) @@ -3390,7 +3400,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, static int list_locations(struct kmem_cache *s, char *buf, enum track_item alloc) { - int n = 0; + int len = 0; unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; @@ -3421,54 +3431,54 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (n > PAGE_SIZE - 100) + if (len > PAGE_SIZE - 100) break; - n += sprintf(buf + n, "%7ld ", l->count); + len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - n += sprint_symbol(buf + n, (unsigned long)l->addr); + len += sprint_symbol(buf + len, (unsigned long)l->addr); else - n += sprintf(buf + n, "<not-available>"); + len += sprintf(buf + len, "<not-available>"); if (l->sum_time != l->min_time) { unsigned long remainder; - n += sprintf(buf + n, " age=%ld/%ld/%ld", + len += sprintf(buf + len, " age=%ld/%ld/%ld", l->min_time, div_long_long_rem(l->sum_time, l->count, &remainder), l->max_time); } else - n += sprintf(buf + n, " age=%ld", + len += sprintf(buf + len, " age=%ld", l->min_time); if (l->min_pid != l->max_pid) - n += sprintf(buf + n, " pid=%ld-%ld", + len += sprintf(buf + len, " pid=%ld-%ld", l->min_pid, l->max_pid); else - n += sprintf(buf + n, " pid=%ld", + len += sprintf(buf + len, " pid=%ld", l->min_pid); if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && - n < PAGE_SIZE - 60) { - n += sprintf(buf + n, " cpus="); - n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, + len < PAGE_SIZE - 60) { + len += sprintf(buf + len, " cpus="); + len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, l->cpus); } if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && - n < PAGE_SIZE - 60) { - n += sprintf(buf + n, " nodes="); - n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, + len < PAGE_SIZE - 60) { + len += sprintf(buf + len, " nodes="); + len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, l->nodes); } - n += sprintf(buf + n, "\n"); + len += sprintf(buf + len, "\n"); } free_loc_track(&t); if (!t.count) - n += sprintf(buf, "No data\n"); - return n; + len += sprintf(buf, "No data\n"); + return len; } enum slab_stat_type { @@ -3498,7 +3508,6 @@ static unsigned long slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct page *page; - int node; struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); if (!c) @@ -3510,8 +3519,6 @@ static unsigned long slab_objects(struct kmem_cache *s, continue; if (page) { if (flags & SO_CPU) { - int x = 0; - if (flags & SO_OBJECTS) x = page->inuse; else @@ -3848,24 +3855,24 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf) SLAB_ATTR_RO(free_calls); #ifdef CONFIG_NUMA -static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) +static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->defrag_ratio / 10); + return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); } -static ssize_t defrag_ratio_store(struct kmem_cache *s, +static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { int n = simple_strtoul(buf, NULL, 10); if (n < 100) - s->defrag_ratio = n * 10; + s->remote_node_defrag_ratio = n * 10; return length; } -SLAB_ATTR(defrag_ratio); +SLAB_ATTR(remote_node_defrag_ratio); #endif -static struct attribute * slab_attrs[] = { +static struct attribute *slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, &objs_per_slab_attr.attr, @@ -3893,7 +3900,7 @@ static struct attribute * slab_attrs[] = { &cache_dma_attr.attr, #endif #ifdef CONFIG_NUMA - &defrag_ratio_attr.attr, + &remote_node_defrag_ratio_attr.attr, #endif NULL }; @@ -3940,6 +3947,13 @@ static ssize_t slab_attr_store(struct kobject *kobj, return err; } +static void kmem_cache_release(struct kobject *kobj) +{ + struct kmem_cache *s = to_slab(kobj); + + kfree(s); +} + static struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -3947,6 +3961,7 @@ static struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -4048,6 +4063,7 @@ static void sysfs_slab_remove(struct kmem_cache *s) { kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); + kobject_put(&s->kobj); } /* diff --git a/mm/sparse.c b/mm/sparse.c index a2183cb..f6a43c0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void) } #endif /* CONFIG_MEMORY_HOTPLUG */ -static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) +static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) { unsigned long *usemap; struct mem_section *ms = __nr_to_section(pnum); @@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, return __kmalloc_section_memmap(nr_pages); } -static int vaddr_in_vmalloc_area(void *addr) -{ - if (addr >= (void *)VMALLOC_START && - addr < (void *)VMALLOC_END) - return 1; - return 0; -} - static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { - if (vaddr_in_vmalloc_area(memmap)) + if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap, @@ -41,7 +41,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; * This path almost never happens for VM activity - pages are normally * freed via pagevecs. But it gets used by networking. */ -static void fastcall __page_cache_release(struct page *page) +static void __page_cache_release(struct page *page) { if (PageLRU(page)) { unsigned long flags; @@ -165,7 +165,7 @@ int rotate_reclaimable_page(struct page *page) /* * FIXME: speed this up? */ -void fastcall activate_page(struct page *page) +void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -186,7 +186,7 @@ void fastcall activate_page(struct page *page) * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced */ -void fastcall mark_page_accessed(struct page *page) +void mark_page_accessed(struct page *page) { if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { activate_page(page); @@ -202,7 +202,7 @@ EXPORT_SYMBOL(mark_page_accessed); * lru_cache_add: add a page to the page lists * @page: the page to add */ -void fastcall lru_cache_add(struct page *page) +void lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); @@ -212,7 +212,7 @@ void fastcall lru_cache_add(struct page *page) put_cpu_var(lru_add_pvecs); } -void fastcall lru_cache_add_active(struct page *page) +void lru_cache_add_active(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); diff --git a/mm/swap_state.c b/mm/swap_state.c index b526356..ec42f01 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> @@ -51,26 +52,22 @@ static struct { unsigned long del_total; unsigned long find_success; unsigned long find_total; - unsigned long noent_race; - unsigned long exist_race; } swap_cache_info; void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", + printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, - swap_cache_info.find_success, swap_cache_info.find_total, - swap_cache_info.noent_race, swap_cache_info.exist_race); + swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } /* - * __add_to_swap_cache resembles add_to_page_cache on swapper_space, + * add_to_swap_cache resembles add_to_page_cache on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -static int __add_to_swap_cache(struct page *page, swp_entry_t entry, - gfp_t gfp_mask) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; @@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, set_page_private(page, entry.val); total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); } write_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); @@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) -{ - int error; - - BUG_ON(PageLocked(page)); - if (!swap_duplicate(entry)) { - INC_CACHE_INFO(noent_race); - return -ENOENT; - } - SetPageLocked(page); - error = __add_to_swap_cache(page, entry, GFP_KERNEL); - /* - * Anon pages are already on the LRU, we don't run lru_cache_add here. - */ - if (error) { - ClearPageLocked(page); - swap_free(entry); - if (error == -EEXIST) - INC_CACHE_INFO(exist_race); - return error; - } - INC_CACHE_INFO(add_total); - return 0; -} - /* * This must be called only on pages that have * been verified to be in the swap cache. @@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) int err; BUG_ON(!PageLocked(page)); + BUG_ON(!PageUptodate(page)); for (;;) { entry = get_swap_page(); @@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) /* * Add it to the swap cache and mark it dirty */ - err = __add_to_swap_cache(page, entry, + err = add_to_swap_cache(page, entry, gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ - SetPageUptodate(page); SetPageDirty(page); - INC_CACHE_INFO(add_total); return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ - INC_CACHE_INFO(exist_race); swap_free(entry); continue; default: @@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page) page_cache_release(page); } -/* - * Strange swizzling function only for use by shmem_writepage - */ -int move_to_swap_cache(struct page *page, swp_entry_t entry) -{ - int err = __add_to_swap_cache(page, entry, GFP_ATOMIC); - if (!err) { - remove_from_page_cache(page); - page_cache_release(page); /* pagecache ref */ - if (!swap_duplicate(entry)) - BUG(); - SetPageDirty(page); - INC_CACHE_INFO(add_total); - } else if (err == -EEXIST) - INC_CACHE_INFO(exist_race); - return err; -} - -/* - * Strange swizzling function for shmem_getpage (and shmem_unuse) - */ -int move_from_swap_cache(struct page *page, unsigned long index, - struct address_space *mapping) -{ - int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); - if (!err) { - delete_from_swap_cache(page); - /* shift page from clean_pages to dirty_pages list */ - ClearPageDirty(page); - set_page_dirty(page); - } - return err; -} - /* * If we are the only user, then try to free up the swap cache. * @@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) * A failure return means that either the page allocation failed or that * the swap entry is no longer in use. */ -struct page *read_swap_cache_async(swp_entry_t entry, +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *found_page, *new_page = NULL; @@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry, * Get a new page to read into from swap. */ if (!new_page) { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, addr); + new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ } /* + * Swap entry may have been freed since our caller observed it. + */ + if (!swap_duplicate(entry)) + break; + + /* * Associate the page with swap entry in the swap cache. - * May fail (-ENOENT) if swap entry has been freed since - * our caller observed it. May fail (-EEXIST) if there - * is already a page associated with this entry in the - * swap cache: added by a racing read_swap_cache_async, - * or by try_to_swap_out (or shmem_writepage) re-using - * the just freed swap entry for an existing page. + * May fail (-EEXIST) if there is already a page associated + * with this entry in the swap cache: added by a racing + * read_swap_cache_async, or add_to_swap or shmem_writepage + * re-using the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ - err = add_to_swap_cache(new_page, entry); + SetPageLocked(new_page); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (!err) { /* * Initiate read into locked page and return. @@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry, swap_readpage(NULL, new_page); return new_page; } - } while (err != -ENOENT && err != -ENOMEM); + ClearPageLocked(new_page); + swap_free(entry); + } while (err != -ENOMEM); if (new_page) page_cache_release(new_page); return found_page; } + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @vma: user vma this address belongs to + * @addr: target address for mempolicy + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + int nr_pages; + struct page *page; + unsigned long offset; + unsigned long end_offset; + + /* + * Get starting offset for readaround, and number of pages to read. + * Adjust starting address by readbehind (for NUMA interleave case)? + * No, it's very unlikely that swap layout would follow vma layout, + * more likely that neighbouring swap pages came from the same node: + * so use the same "addr" to choose the same node for each swap read. + */ + nr_pages = valid_swaphandles(entry, &offset); + for (end_offset = offset + nr_pages; offset < end_offset; offset++) { + /* Ok, do the async read-ahead now */ + page = read_swap_cache_async(swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr); + if (!page) + break; + page_cache_release(page); + } + lru_add_drain(); /* Push any new pages onto the LRU now */ + return read_swap_cache_async(entry, gfp_mask, vma, addr); +} diff --git a/mm/swapfile.c b/mm/swapfile.c index f071648..eade24d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -506,9 +506,19 @@ unsigned int count_swap_pages(int type, int free) * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ -static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, +static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { + spinlock_t *ptl; + pte_t *pte; + int found = 1; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + found = 0; + goto out; + } + inc_mm_counter(vma->vm_mm, anon_rss); get_page(page); set_pte_at(vma->vm_mm, addr, pte, @@ -520,6 +530,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, * immediately swapped out again after swapon. */ activate_page(page); +out: + pte_unmap_unlock(pte, ptl); + return found; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -528,22 +541,33 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, { pte_t swp_pte = swp_entry_to_pte(entry); pte_t *pte; - spinlock_t *ptl; int found = 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + /* + * We don't actually need pte lock while scanning for swp_pte: since + * we hold page lock and mmap_sem, swp_pte cannot be inserted into the + * page table while we're scanning; though it could get zapped, and on + * some architectures (e.g. x86_32 with PAE) we might catch a glimpse + * of unmatched parts which look like swp_pte, so unuse_pte must + * recheck under pte lock. Scanning without pte lock lets it be + * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + */ + pte = pte_offset_map(pmd, addr); do { /* * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte++, addr, entry, page); - found = 1; - break; + pte_unmap(pte); + found = unuse_pte(vma, pmd, addr, entry, page); + if (found) + goto out; + pte = pte_offset_map(pmd, addr); } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap(pte - 1); +out: return found; } @@ -730,7 +754,8 @@ static int try_to_unuse(unsigned int type) */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); - page = read_swap_cache_async(entry, NULL, 0); + page = read_swap_cache_async(entry, + GFP_HIGHUSER_MOVABLE, NULL, 0); if (!page) { /* * Either swap_duplicate() failed because entry @@ -789,7 +814,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && + while (*swap_map > 1 && !retval && !shmem && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -821,6 +846,13 @@ static int try_to_unuse(unsigned int type) mmput(start_mm); start_mm = new_start_mm; } + if (shmem) { + /* page has already been unlocked and released */ + if (shmem > 0) + continue; + retval = shmem; + break; + } if (retval) { unlock_page(page); page_cache_release(page); @@ -859,12 +891,6 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. - * - * Note shmem_unuse already deleted a swappage from - * the swap cache, unless the move to filepage failed: - * in which case it left swappage in cache, lowered its - * swap count to pass quickly through the loops above, - * and now we must reincrement count to try again later. */ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { struct writeback_control wbc = { @@ -875,12 +901,8 @@ static int try_to_unuse(unsigned int type) lock_page(page); wait_on_page_writeback(page); } - if (PageSwapCache(page)) { - if (shmem) - swap_duplicate(entry); - else - delete_from_swap_cache(page); - } + if (PageSwapCache(page)) + delete_from_swap_cache(page); /* * So we could skip searching mms once swap count went @@ -1768,31 +1790,48 @@ get_swap_info_struct(unsigned type) */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) { + struct swap_info_struct *si; int our_page_cluster = page_cluster; - int ret = 0, i = 1 << our_page_cluster; - unsigned long toff; - struct swap_info_struct *swapdev = swp_type(entry) + swap_info; + pgoff_t target, toff; + pgoff_t base, end; + int nr_pages = 0; if (!our_page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; - if (!toff) /* first page is swap header */ - toff++, i--; - *offset = toff; + + si = &swap_info[swp_type(entry)]; + target = swp_offset(entry); + base = (target >> our_page_cluster) << our_page_cluster; + end = base + (1 << our_page_cluster); + if (!base) /* first page is swap header */ + base++; spin_lock(&swap_lock); - do { - /* Don't read-ahead past the end of the swap area */ - if (toff >= swapdev->max) + if (end > si->max) /* don't go beyond end of map */ + end = si->max; + + /* Count contiguous allocated slots above our target */ + for (toff = target; ++toff < end; nr_pages++) { + /* Don't read in free or bad pages */ + if (!si->swap_map[toff]) + break; + if (si->swap_map[toff] == SWAP_MAP_BAD) break; + } + /* Count contiguous allocated slots below our target */ + for (toff = target; --toff >= base; nr_pages++) { /* Don't read in free or bad pages */ - if (!swapdev->swap_map[toff]) + if (!si->swap_map[toff]) break; - if (swapdev->swap_map[toff] == SWAP_MAP_BAD) + if (si->swap_map[toff] == SWAP_MAP_BAD) break; - toff++; - ret++; - } while (--i); + } spin_unlock(&swap_lock); - return ret; + + /* + * Indicate starting offset, and return number of pages to get: + * if only 1, say 0, since there's then no readahead to be done. + */ + *offset = ++toff; + return nr_pages? ++nr_pages: 0; } diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index d436a9c..7020836 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } -#if 0 -int shmem_mmap(struct file *file, struct vm_area_struct *vma) -{ - file_accessed(file); -#ifndef CONFIG_MMU - return ramfs_nommu_mmap(file, vma); -#else - return 0; -#endif -} -#endif /* 0 */ - #ifndef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, diff --git a/mm/truncate.c b/mm/truncate.c index cadc156..c35c49e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -21,7 +21,7 @@ /** - * do_invalidatepage - invalidate part of all of a page + * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected * @offset: the index of the truncation point * @@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { - zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); + zero_user_segment(page, partial, PAGE_CACHE_SIZE); if (PagePrivate(page)) do_invalidatepage(page, partial); } @@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page); /* * If truncate cannot remove the fs-private metadata from the page, the page - * becomes anonymous. It will be left on the LRU and may even be mapped into + * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * * We need to bale out if page->mapping is no longer equal to the original @@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return; - cancel_dirty_page(page, PAGE_CACHE_SIZE); - if (PagePrivate(page)) do_invalidatepage(page, 0); + cancel_dirty_page(page, PAGE_CACHE_SIZE); + remove_from_page_cache(page); ClearPageUptodate(page); ClearPageMappedToDisk(page); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af77e17..0536dde 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); +/* + * Map a vmalloc()-space virtual address to the physical page. + */ +struct page *vmalloc_to_page(const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (!pgd_none(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } + } + } + return page; +} +EXPORT_SYMBOL(vmalloc_to_page); + +/* + * Map a vmalloc()-space virtual address to the physical page frame number. + */ +unsigned long vmalloc_to_pfn(const void *vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask) @@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl if (addr > end - size) goto out; } + if ((size + addr) < addr) + goto out; + if (addr > end - size) + goto out; found: area->next = *p; @@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, } /* Caller must hold vmlist_lock */ -static struct vm_struct *__find_vm_area(void *addr) +static struct vm_struct *__find_vm_area(const void *addr) { struct vm_struct *tmp; @@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr) } /* Caller must hold vmlist_lock */ -static struct vm_struct *__remove_vm_area(void *addr) +static struct vm_struct *__remove_vm_area(const void *addr) { struct vm_struct **p, *tmp; @@ -310,7 +352,7 @@ found: * This function returns the found VM area, but using it is NOT safe * on SMP machines, except for its size or flags. */ -struct vm_struct *remove_vm_area(void *addr) +struct vm_struct *remove_vm_area(const void *addr) { struct vm_struct *v; write_lock(&vmlist_lock); @@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr) return v; } -static void __vunmap(void *addr, int deallocate_pages) +static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages) int i; for (i = 0; i < area->nr_pages; i++) { - BUG_ON(!area->pages[i]); - __free_page(area->pages[i]); + struct page *page = area->pages[i]; + + BUG_ON(!page); + __free_page(page); } if (area->flags & VM_VPAGES) @@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages) * * Must not be called in interrupt context. */ -void vfree(void *addr) +void vfree(const void *addr) { BUG_ON(in_interrupt()); __vunmap(addr, 1); @@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree); * * Must not be called in interrupt context. */ -void vunmap(void *addr) +void vunmap(const void *addr) { BUG_ON(in_interrupt()); __vunmap(addr, 0); @@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) +static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot, int node) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } for (i = 0; i < area->nr_pages; i++) { + struct page *page; + if (node < 0) - area->pages[i] = alloc_page(gfp_mask); + page = alloc_page(gfp_mask); else - area->pages[i] = alloc_pages_node(node, gfp_mask, 0); - if (unlikely(!area->pages[i])) { + page = alloc_pages_node(node, gfp_mask, 0); + + if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; goto fail; } + area->pages[i] = page; } if (map_vm_area(area, prot, &pages)) diff --git a/mm/vmstat.c b/mm/vmstat.c index e8d846f..422d960 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states); static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) { - int cpu = 0; + int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { + for_each_cpu_mask(cpu, *cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); - cpu = next_cpu(cpu, *cpumask); - - if (cpu < NR_CPUS) - prefetch(&per_cpu(vm_event_states, cpu)); - - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) ret[i] += this->event[i]; } @@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state); /* * Update the zone counters for one cpu. * + * The cpu specified must be either the current cpu or a processor that + * is not online. If it is the current cpu then the execution thread must + * be pinned to the current cpu. + * * Note that refresh_cpu_vm_stats strives to only access * node local memory. The per cpu pagesets on remote zones are placed * in the memory local to the processor using that pageset. So the @@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu) { struct zone *zone; int i; - unsigned long flags; + int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; for_each_zone(zone) { struct per_cpu_pageset *p; @@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu) for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { + unsigned long flags; + int v; + local_irq_save(flags); - zone_page_state_add(p->vm_stat_diff[i], - zone, i); + v = p->vm_stat_diff[i]; p->vm_stat_diff[i] = 0; + local_irq_restore(flags); + atomic_long_add(v, &zone->vm_stat[i]); + global_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ p->expire = 3; #endif - local_irq_restore(flags); } #ifdef CONFIG_NUMA /* @@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) + if (!p->expire || !p->pcp.count) continue; /* @@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu) if (p->expire) continue; - if (p->pcp[0].count) - drain_zone_pages(zone, p->pcp + 0); - - if (p->pcp[1].count) - drain_zone_pages(zone, p->pcp + 1); + if (p->pcp.count) + drain_zone_pages(zone, &p->pcp); #endif } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (global_diff[i]) + atomic_long_add(global_diff[i], &vm_stat[i]); } #endif @@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - int j; pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); - } + seq_printf(m, + "\n cpu: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, + pageset->pcp.count, + pageset->pcp.high, + pageset->pcp.batch); #ifdef CONFIG_SMP seq_printf(m, "\n vm stats threshold: %d", pageset->stat_threshold); |