diff options
author | Steven Whitehouse <swhiteho@redhat.com> | 2006-03-31 15:34:58 -0500 |
---|---|---|
committer | Steven Whitehouse <swhiteho@redhat.com> | 2006-03-31 15:34:58 -0500 |
commit | 86579dd06deecfa6ac88d5e84e4d63c397cd6f6d (patch) | |
tree | b4475d3ccde53015ad84a06e4e55e64591171b75 /mm | |
parent | 7ea9ea832212c4a755650f7c7cc1ff0b63292a41 (diff) | |
parent | a0f067802576d4eb4c65d40b8ee7d6ea3c81dd61 (diff) | |
download | op-kernel-dev-86579dd06deecfa6ac88d5e84e4d63c397cd6f6d.zip op-kernel-dev-86579dd06deecfa6ac88d5e84e4d63c397cd6f6d.tar.gz |
Merge branch 'master'
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 10 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 41 | ||||
-rw-r--r-- | mm/fadvise.c | 46 | ||||
-rw-r--r-- | mm/filemap.c | 43 | ||||
-rw-r--r-- | mm/highmem.c | 26 | ||||
-rw-r--r-- | mm/hugetlb.c | 286 | ||||
-rw-r--r-- | mm/internal.h | 34 | ||||
-rw-r--r-- | mm/memory.c | 21 | ||||
-rw-r--r-- | mm/mempolicy.c | 151 | ||||
-rw-r--r-- | mm/mempool.c | 50 | ||||
-rw-r--r-- | mm/migrate.c | 655 | ||||
-rw-r--r-- | mm/mmap.c | 16 | ||||
-rw-r--r-- | mm/mmzone.c | 50 | ||||
-rw-r--r-- | mm/mprotect.c | 12 | ||||
-rw-r--r-- | mm/msync.c | 139 | ||||
-rw-r--r-- | mm/nommu.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 64 | ||||
-rw-r--r-- | mm/page_alloc.c | 180 | ||||
-rw-r--r-- | mm/readahead.c | 33 | ||||
-rw-r--r-- | mm/rmap.c | 14 | ||||
-rw-r--r-- | mm/shmem.c | 7 | ||||
-rw-r--r-- | mm/slab.c | 1233 | ||||
-rw-r--r-- | mm/slob.c | 10 | ||||
-rw-r--r-- | mm/swap.c | 66 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 59 | ||||
-rw-r--r-- | mm/util.c | 47 | ||||
-rw-r--r-- | mm/vmscan.c | 888 |
29 files changed, 2603 insertions, 1587 deletions
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS # support for page migration # config MIGRATION - def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM - depends on SWAP + bool "Page migration" + def_bool y if NUMA + depends on SWAP && NUMA + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa..0b8f73f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o $(mmu-y) + prio_tree.o util.o mmzone.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o + diff --git a/mm/bootmem.c b/mm/bootmem.c index 35c3229..d3e3bd2 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so * dma_get_required_mask(), which uses * it, can be an inline function */ +static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* * If we have booted due to a crash, max_pfn will be a very low value. 
We need @@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) return mapsize; } +/* + * link bdata in order + */ +static void link_bootmem(bootmem_data_t *bdata) +{ + bootmem_data_t *ent; + if (list_empty(&bdata_list)) { + list_add(&bdata->list, &bdata_list); + return; + } + /* insert in order */ + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_boot_start < ent->node_boot_start) { + list_add_tail(&bdata->list, &ent->list); + return; + } + } + list_add_tail(&bdata->list, &bdata_list); + return; +} + /* * Called once to set up the allocator itself. @@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - mapsize = ALIGN(mapsize, sizeof(long)); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; + link_bootmem(bdata); /* * Initially all pages are reserved - setup_arch() has to @@ -152,7 +172,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, * * NOTE: This function is _not_ reentrant. 
*/ -static void * __init +void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { @@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void) void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, 0))) + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) return(ptr); /* @@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, LOW32LIMIT))) return(ptr); diff --git a/mm/fadvise.c b/mm/fadvise.c index d257c89..907c392 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -15,6 +15,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/fadvise.h> +#include <linux/writeback.h> #include <linux/syscalls.h> #include <asm/unistd.h> @@ -22,13 +23,36 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. + * + * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file + * offsets `offset' and `offset+len' inclusive. Any pages which are currently + * under writeout are skipped, whether or not they are dirty. + * + * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file + * offsets `offset' and `offset+len'. 
+ * + * By combining these two operations the application may do several things: + * + * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently + * dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push + * all of the currently dirty pages at the disk, wait until they have been + * written. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; - loff_t endbyte; + loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; @@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) endbyte = offset + len; if (!len || endbyte < len) endbyte = -1; + else + endbyte--; /* inclusive */ bdi = mapping->backing_dev_info; @@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* First and last PARTIAL page! */ start_index = offset >> PAGE_CACHE_SHIFT; - end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; + end_index = endbyte >> PAGE_CACHE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; @@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) filemap_flush(mapping); /* First and last FULL page! 
*/ - start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index > start_index) - invalidate_mapping_pages(mapping, start_index, end_index-1); + if (end_index >= start_index) + invalidate_mapping_pages(mapping, start_index, + end_index); + break; + case LINUX_FADV_ASYNC_WRITE: + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); + break; + case LINUX_FADV_WRITE_WAIT: + ret = wait_on_page_writeback_range(mapping, + offset >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); break; default: ret = -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index 7624c26..1120338 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,7 +29,10 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/cpuset.h> #include "filemap.h" +#include "internal.h" + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ @@ -172,7 +175,7 @@ static int sync_page(void *word) * dirty pages that lie within the byte offsets <start, end> * @mapping: address space structure to write * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends + * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as @@ -180,8 +183,8 @@ static int sync_page(void *word) * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. 
*/ -static int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode) +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) { int ret; struct writeback_control wbc = { @@ -210,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end) +static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } @@ -230,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush); * Wait for writeback to complete against pages indexed by start->end * inclusive */ -static int wait_on_page_writeback_range(struct address_space *mapping, +int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; @@ -365,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); +/* + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). 
+ */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { @@ -425,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, return ret; } +#ifdef CONFIG_NUMA +struct page *page_cache_alloc(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x), 0); + } + return alloc_pages(mapping_gfp_mask(x), 0); +} +EXPORT_SYMBOL(page_cache_alloc); + +struct page *page_cache_alloc_cold(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); + } + return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); +} +EXPORT_SYMBOL(page_cache_alloc_cold); +#endif + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of diff --git a/mm/highmem.c b/mm/highmem.c index ce2e7e8..55885f6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,18 +26,14 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/blktrace_api.h> #include <asm/tlbflush.h> static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) { - return alloc_page(gfp_mask | GFP_DMA); -} - -static void page_pool_free(void *page, void *data) -{ - __free_page(page); + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); } /* @@ -50,11 +46,6 @@ static void page_pool_free(void *page, void *data) */ #ifdef CONFIG_HIGHMEM -static void *page_pool_alloc(gfp_t gfp_mask, void *data) -{ - return alloc_page(gfp_mask); -} - static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); @@ -228,7 +219,7 @@ static __init int init_emergency_pool(void) if (!i.totalhigh) return 0; - page_pool = 
mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); + page_pool = mempool_create_page_pool(POOL_SIZE, 0); if (!page_pool) BUG(); printk("highmem bounce pool size: %d pages\n", POOL_SIZE); @@ -271,7 +262,8 @@ int init_emergency_isa_pool(void) if (isa_page_pool) return 0; - isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); if (!isa_page_pool) BUG(); @@ -336,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) bio_put(bio); } -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) { if (bio->bi_size) return 1; @@ -383,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int } static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, - mempool_t *pool) + mempool_t *pool) { struct page *page; struct bio *bio = NULL; @@ -483,6 +475,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) pool = isa_page_pool; } + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + /* * slow path */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5087077..ebad6bb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,24 +13,48 @@ #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/cpuset.h> +#include <linux/mutex.h> #include <asm/page.h> #include <asm/pgtable.h> #include <linux/hugetlb.h> +#include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; - /* * Protects updates 
to hugepage_freelists, nr_huge_pages, and free_huge_pages */ static DEFINE_SPINLOCK(hugetlb_lock); +static void clear_huge_page(struct page *page, unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + cond_resched(); + clear_user_highpage(page + i, addr); + } +} + +static void copy_huge_page(struct page *dst, struct page *src, + unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); + } +} + static void enqueue_huge_page(struct page *page) { int nid = page_to_nid(page); @@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } -static struct page *alloc_fresh_huge_page(void) +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + +static int alloc_fresh_huge_page(void) { static int nid = 0; struct page *page; page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); - nid = (nid + 1) % num_online_nodes(); + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); if (page) { + page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ + return 1; } - return page; + return 0; } -void free_huge_page(struct page *page) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { - BUG_ON(page_count(page)); + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct page *page; + int use_reserve = 0; + unsigned long idx; - INIT_LIST_HEAD(&page->lru); - page[1].lru.next = NULL; /* reset dtor */ + spin_lock(&hugetlb_lock); + + if (vma->vm_flags & 
VM_MAYSHARE) { + + /* idx = radix tree index, i.e. offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. */ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; + } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + + spin_unlock(&hugetlb_lock); + set_page_refcounted(page); + return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). 
+ */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; spin_lock(&hugetlb_lock); - enqueue_huge_page(page); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); + + return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) { + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; struct page *page; - int i; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. 
*/ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); - set_page_count(page, 1); - page[1].lru.next = (void *)free_huge_page; /* set dtor */ - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_user_highpage(&page[i], addr); - return page; } static int __init hugetlb_init(void) { unsigned long i; - struct page *page; if (HPAGE_SHIFT == 0) return 0; @@ -123,12 +266,8 @@ static int __init hugetlb_init(void) INIT_LIST_HEAD(&hugepage_freelists[i]); for (i = 0; i < max_huge_pages; ++i) { - page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) break; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } max_huge_pages = free_huge_pages = nr_huge_pages = i; printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); @@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page) page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); - set_page_count(&page[i], 0); } - set_page_count(page, 1); + page[1].lru.next = NULL; + set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count) static unsigned long set_max_huge_pages(unsigned long count) { while (count > nr_huge_pages) { - struct page *page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) return nr_huge_pages; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - 
spin_unlock(&hugetlb_lock); } if (count >= nr_huge_pages) return nr_huge_pages; @@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { @@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { struct page *old_page, *new_page; - int i, avoidcopy; + int avoidcopy; old_page = pte_page(pte); @@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) - copy_user_highpage(new_page + i, old_page + i, - address + i*PAGE_SIZE); + copy_huge_page(new_page, old_page, address); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -442,6 +572,7 @@ retry: ret = VM_FAULT_OOM; goto out; } + clear_huge_page(page, address); if (vma->vm_flags & VM_SHARED) { int err; @@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + static DEFINE_MUTEX(hugetlb_instantiation_mutex); ptep = huge_pte_alloc(mm, address); if (!ptep) return VM_FAULT_OOM; + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. 
+ */ + mutex_lock(&hugetlb_instantiation_mutex); entry = *ptep; - if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep, write_access); + if (pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + mutex_unlock(&hugetlb_instantiation_mutex); + return ret; + } ret = VM_FAULT_MINOR; @@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); + mutex_unlock(&hugetlb_instantiation_mutex); return ret; } @@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) { - unsigned long vpfn, vaddr = *position; + unsigned long pfn_offset; + unsigned long vaddr = *position; int remainder = *length; - vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (pages) { - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - get_page(page); - pages[i] = page; - } + pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + page = pte_page(*pte); +same_page: + get_page(page); + if (pages) + pages[i] = page + pfn_offset; if (vmas) vmas[i] = vma; vaddr += PAGE_SIZE; - ++vpfn; + ++pfn_offset; --remainder; ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. 
+ */ + goto same_page; + } } spin_unlock(&mm->page_table_lock); *length = remainder; @@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + diff --git a/mm/internal.h b/mm/internal.h index 17256bb..d20e3cc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,23 +8,33 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H -static inline void set_page_refs(struct page *page, int order) +#include <linux/mm.h> + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) { -#ifdef CONFIG_MMU + BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); -#else - int i; +} - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. 
- * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ +static inline void __put_page(struct page *page) +{ + atomic_dec(&page->_count); } extern void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order); + +#endif diff --git a/mm/memory.c b/mm/memory.c index 85e80a5..8d8f525 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); @@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ { unsigned long pfn = pte_pfn(pte); - if (vma->vm_flags & VM_PFNMAP) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; @@ -401,8 +400,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ * we should just do "return pfn_to_page(pfn)", but * in the meantime we check that we get a valid pfn, * and that the resulting page looks ok. - * - * Remove this test eventually! 
*/ if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); @@ -1074,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (pages) { pages[i] = page; + + flush_anon_page(page, start); flush_dcache_page(page); } if (vmas) @@ -1221,9 +1220,7 @@ out: * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow @@ -2357,10 +2354,8 @@ int make_pages_present(unsigned long addr, unsigned long end) if (!vma) return -1; write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b21869a..dec8249 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,7 @@ #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/migrate.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -95,11 +96,8 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -/* The number of pages to migrate per call to migrate_pages() */ -#define MIGRATE_CHUNK_SIZE 256 - -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) 
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct vm_area_struct *first, *vma, *prev; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return ERR_PTR(-ENODEV); - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); + err = migrate_prep(); + if (err) + return ERR_PTR(err); } first = find_vma(mm, start); @@ -431,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } + +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. 
+ */ + +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} + +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); +} + /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -443,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; @@ -550,92 +573,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (isolate_lru_page(page)) - list_add_tail(&page->lru, pagelist); - } -} - -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -static int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. 
- */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); - } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); } /* @@ -742,8 +691,23 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) return err; return busy; + } +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} +#endif + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -808,6 +772,7 @@ long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } + if (!list_empty(&pagelist)) putback_lru_pages(&pagelist); @@ -947,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* * Check if this process has the right to modify the specified * process. 
The right exists if the process has administrative - * capabilities, superuser priviledges or the same + * capabilities, superuser privileges or the same * userid as the target process. */ if ((current->euid != task->suid) && (current->euid != task->uid) && diff --git a/mm/mempool.c b/mm/mempool.c index 1a99b80..fe6e052 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize); */ void mempool_destroy(mempool_t *pool) { - if (pool->curr_nr != pool->min_nr) - BUG(); /* There were outstanding elements */ + /* Check for outstanding elements */ + BUG_ON(pool->curr_nr != pool->min_nr); free_pool(pool); } EXPORT_SYMBOL(mempool_destroy); @@ -278,14 +278,56 @@ EXPORT_SYMBOL(mempool_free); */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); + +/* + * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory + * specfied by pool_data + */ +void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)(long)pool_data; + return kmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kmalloc); + +void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + return kzalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kzalloc); + +void mempool_kfree(void *element, void *pool_data) +{ + kfree(element); +} +EXPORT_SYMBOL(mempool_kfree); + +/* + * A simple mempool-backed page allocator that allocates pages + * of the order specified by pool_data. 
+ */ +void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) +{ + int order = (int)(long)pool_data; + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL(mempool_alloc_pages); + +void mempool_free_pages(void *element, void *pool_data) +{ + int order = (int)(long)pool_data; + __free_pages(element, order); +} +EXPORT_SYMBOL(mempool_free_pages); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 0000000..09f6e4a --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,655 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> + * Hirokazu Takahashi <taka@valinux.co.jp> + * Dave Hansen <haveblue@us.ibm.com> + * Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/migrate.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/buffer_head.h> /* for try_to_release_page(), + buffer_heads_over_limit */ +#include <linux/mm_inline.h> +#include <linux/pagevec.h> +#include <linux/rmap.h> +#include <linux/topology.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/swapops.h> + +#include "internal.h" + +#include "internal.h" + +/* The maximum number of pages to take off the LRU for migration */ +#define MIGRATE_CHUNK_SIZE 256 + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * Isolate one page from the LRU lists. If successful put it onto + * the indicated list with elevated page count. + * + * Result: + * -EBUSY: page not on LRU list + * 0: page removed from LRU list and added to the specified list. 
+ */ +int isolate_lru_page(struct page *page, struct list_head *pagelist) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 0; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + list_add_tail(&page->lru, pagelist); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * migrate_prep() needs to be called after we have compiled the list of pages + * to be migrated using isolate_lru_page() but before we begin a series of calls + * to migrate_pages(). + */ +int migrate_prep(void) +{ + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return -ENODEV; + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. 
+ */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. 
+ * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. 
+ */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. 
+ */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + int retry; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. + */ + rc = -EAGAIN; + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto next; + + /* + * Only wait on writeback if we have already done a pass where + * we we may have triggered writeouts for lots of pages. + */ + if (pass > 0) { + wait_on_page_writeback(page); + } else { + if (PageWriteback(page)) + goto unlock_page; + } + + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. 
+ */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { + rc = -ENOMEM; + goto unlock_page; + } + } + + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + + /* + * Pages are properly locked and writeback is complete. + * Try to migrate the page. + */ + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). 
+ */ + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } + list_move(&page->lru, moved); + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + return nr_failed + retry; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Migrate the list 'pagelist' of pages to a certain destination. 
+ * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) +{ + LIST_HEAD(newlist); + LIST_HEAD(moved); + LIST_HEAD(failed); + int err = 0; + unsigned long offset = 0; + int nr_pages; + struct page *page; + struct list_head *p; + +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); + + if (!page) { + err = -ENOMEM; + goto out; + } + list_add_tail(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE) + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} @@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. 
*/ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) @@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, const unsigned long stack_flags = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); -#ifdef CONFIG_HUGETLB - if (flags & VM_HUGETLB) { - if (!(flags & VM_DONTCOPY)) - mm->shared_vm += pages; - return; - } -#endif /* CONFIG_HUGETLB */ - if (file) { mm->shared_vm += pages; if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) @@ -1048,12 +1040,11 @@ munmap_back: * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { error = -ENOMEM; goto unacct_error; } - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_start = addr; @@ -1904,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_start = addr; diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 0000000..b022370 --- /dev/null +++ b/mm/mmzone.c @@ -0,0 +1,50 @@ +/* + * linux/mm/mmzone.c + * + * management codes for pgdats and zones. 
+ */ + + +#include <linux/config.h> +#include <linux/stddef.h> +#include <linux/mmzone.h> +#include <linux/module.h> + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +EXPORT_SYMBOL(first_online_pgdat); + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} +EXPORT_SYMBOL(next_online_pgdat); + + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} +EXPORT_SYMBOL(next_zone); + diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b857..4c14d42 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -166,7 +166,10 @@ success: */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, newprot); + else + change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ @@ -9,20 +9,24 @@ */ #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/fs.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/hugetlb.h> +#include <linux/writeback.h> +#include <linux/file.h> #include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> -static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; spinlock_t *ptl; int progress = 0; + unsigned long ret = 0; again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -43,58 +47,64 @@ again: if (!page) continue; if (ptep_clear_flush_dirty(vma, addr, pte) || - page_test_and_clear_dirty(page)) - set_page_dirty(page); + page_test_and_clear_dirty(page)) + ret += set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); cond_resched(); if (addr != end) goto again; + return ret; } -static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; + unsigned long ret = 0; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - msync_pte_range(vma, pmd, addr, next); + ret += msync_pte_range(vma, pmd, addr, next); } while (pmd++, addr = next, addr != end); + return ret; } -static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end) 
{ pud_t *pud; unsigned long next; + unsigned long ret = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - msync_pmd_range(vma, pud, addr, next); + ret += msync_pmd_range(vma, pud, addr, next); } while (pud++, addr = next, addr != end); + return ret; } -static void msync_page_range(struct vm_area_struct *vma, +static unsigned long msync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; + unsigned long ret = 0; /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). */ if (vma->vm_flags & VM_HUGETLB) - return; + return 0; BUG_ON(addr >= end); pgd = pgd_offset(vma->vm_mm, addr); @@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - msync_pud_range(vma, pgd, addr, next); + ret += msync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); + return ret; } /* @@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma, * write out the dirty pages and wait on the writeout and check the result. * Or the application may run fadvise(FADV_DONTNEED) against the fd to start * async writeout immediately. - * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. 
*/ -static int msync_interval(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, int flags) +static int msync_interval(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, int flags, + unsigned long *nr_pages_dirtied) { - int ret = 0; struct file *file = vma->vm_file; if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) return -EBUSY; - if (file && (vma->vm_flags & VM_SHARED)) { - msync_page_range(vma, addr, end); - - if (flags & MS_SYNC) { - struct address_space *mapping = file->f_mapping; - int err; - - ret = filemap_fdatawrite(mapping); - if (file->f_op && file->f_op->fsync) { - /* - * We don't take i_mutex here because mmap_sem - * is already held. - */ - err = file->f_op->fsync(file,file->f_dentry,1); - if (err && !ret) - ret = err; - } - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; - } - } - return ret; + if (file && (vma->vm_flags & VM_SHARED)) + *nr_pages_dirtied = msync_page_range(vma, addr, end); + return 0; } asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; struct vm_area_struct *vma; - int unmapped_error, error = -EINVAL; - - if (flags & MS_SYNC) - current->flags |= PF_SYNCWRITE; + int unmapped_error = 0; + int error = -EINVAL; + int done = 0; - down_read(&current->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (start & ~PAGE_MASK) @@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ + down_read(&current->mm->mmap_sem); + if (flags & MS_SYNC) + current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); - unmapped_error = 0; - for (;;) { - /* Still start < end. */ + if (!vma) { error = -ENOMEM; - if (!vma) - goto out; + goto out_unlock; + } + do { + unsigned long nr_pages_dirtied = 0; + struct file *file; + /* Here start < vma->vm_end. 
*/ if (start < vma->vm_start) { unmapped_error = -ENOMEM; @@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) /* Here vma->vm_start <= start < vma->vm_end. */ if (end <= vma->vm_end) { if (start < end) { - error = msync_interval(vma, start, end, flags); + error = msync_interval(vma, start, end, flags, + &nr_pages_dirtied); if (error) - goto out; + goto out_unlock; } error = unmapped_error; - goto out; + done = 1; + } else { + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = msync_interval(vma, start, vma->vm_end, flags, + &nr_pages_dirtied); + if (error) + goto out_unlock; } - /* Here vma->vm_start <= start < vma->vm_end < end. */ - error = msync_interval(vma, start, vma->vm_end, flags); - if (error) - goto out; + file = vma->vm_file; start = vma->vm_end; - vma = vma->vm_next; - } -out: - up_read(&current->mm->mmap_sem); + if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { + get_file(file); + up_read(&current->mm->mmap_sem); + balance_dirty_pages_ratelimited_nr(file->f_mapping, + nr_pages_dirtied); + fput(file); + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, start); + } else if ((flags & MS_SYNC) && file && + (vma->vm_flags & VM_SHARED)) { + get_file(file); + up_read(&current->mm->mmap_sem); + error = do_fsync(file, 0); + fput(file); + down_read(&current->mm->mmap_sem); + if (error) + goto out_unlock; + vma = find_vma(current->mm, start); + } else { + vma = vma->vm_next; + } + } while (vma && !done); +out_unlock: current->flags &= ~PF_SYNCWRITE; + up_read(&current->mm->mmap_sem); +out: return error; } @@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) /* * kmalloc doesn't like __GFP_HIGHMEM for some reason */ - return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } struct page * vmalloc_to_page(void *addr) @@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * - note that this may not 
return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL); + base = kmalloc(len, GFP_KERNEL|__GFP_COMP); if (!base) goto enomem; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 945559f..893d767 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -75,12 +75,12 @@ int vm_dirty_ratio = 40; * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_interval = 5 * HZ; /* * The longest number of centiseconds for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_interval = 30 * HZ; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100; int block_dump; /* - * Flag that puts the machine in "laptop mode". + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; @@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping) } /** - * balance_dirty_pages_ratelimited - balance dirty memory state + * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied + * @nr_pages: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. 
*/ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(int, ratelimits) = 0; - long ratelimit; + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - if (get_cpu_var(ratelimits)++ >= ratelimit) { - __get_cpu_var(ratelimits) = 0; - put_cpu_var(ratelimits); + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); balance_dirty_pages(mapping); return; } - put_cpu_var(ratelimits); + preempt_enable(); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { @@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per dirty_writeback_centisecs. But if a writeback event - * takes longer than a dirty_writeback_centisecs interval, then leave a + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. 
So we'll only write back @@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); get_writeback_state(&wbs); - oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; - next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; + next_jif = start_jif + dirty_writeback_interval; nr_to_write = wbs.nr_dirty + wbs.nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { @@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg) } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; - if (dirty_writeback_centisecs) + if (dirty_writeback_interval) mod_timer(&wb_timer, next_jif); } @@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_centisecs) { + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) { mod_timer(&wb_timer, - jiffies + (dirty_writeback_centisecs * HZ) / 100); - } else { + jiffies + dirty_writeback_interval); + } else { del_timer(&wb_timer); } return 0; @@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused) */ void laptop_io_completion(void) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -544,7 +551,7 @@ void __init page_writeback_init(void) if (vm_dirty_ratio <= 0) vm_dirty_ratio = 1; } - mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } @@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { - int ret = 0; - if (!TestSetPageDirty(page)) { struct 
address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page) I_DIRTY_PAGES); } } + return 1; } - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page) return (*spd)(page); return __set_page_dirty_buffers(page); } - if (!PageDirty(page)) - SetPageDirty(page); + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } return 0; } EXPORT_SYMBOL(set_page_dirty); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd48..dc523a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,13 +49,11 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; -static void fastcall free_hot_cold_page(struct page *page, int cold); static void __free_pages_ok(struct page *page, unsigned int order); /* @@ -190,7 +188,7 @@ static void prep_compound_page(struct page *page, unsigned long order) for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - SetPageCompound(p); + __SetPageCompound(p); set_page_private(p, (unsigned long)page); } } @@ -209,10 +207,24 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageCompound(p) | (page_private(p) != (unsigned long)page))) bad_page(page); - ClearPageCompound(p); + __ClearPageCompound(p); } } +static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. 
+ */ + BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + /* * function for dealing with page's order in buddy system. * zone->lock is already acquired when we use these. @@ -423,11 +435,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) mutex_debug_check_no_locks_freed(page_address(page), PAGE_SIZE<<order); -#ifndef CONFIG_MMU - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); -#endif - for (i = 0 ; i < (1 << order) ; ++i) reserved += free_pages_check(page + i); if (reserved) @@ -448,28 +455,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - - free_hot_cold_page(page, 0); + set_page_refcounted(page); + __free_page(page); } else { - LIST_HEAD(list); int loop; + prefetchw(page); for (loop = 0; loop < BITS_PER_LONG; loop++) { struct page *p = &page[loop]; - if (loop + 16 < BITS_PER_LONG) - prefetchw(p + 16); + if (loop + 1 < BITS_PER_LONG) + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } - arch_free_page(page, order); - - mod_page_state(pgfree, 1 << order); - - list_add(&page->lru, &list); - kernel_map_pages(page, 1 << order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); + set_page_refcounted(page); + __free_pages(page, order); } } @@ -507,7 +509,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -536,8 +538,15 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 
1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + return 0; } @@ -593,13 +602,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * Called from the slab reaper to drain pagesets on a particular node that * belong to the currently executing processor. + * Note that this function must be called with the thread pinned to + * a single processor. */ void drain_node_pages(int nodeid) { int i, z; unsigned long flags; - local_irq_save(flags); for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; @@ -609,11 +619,14 @@ void drain_node_pages(int nodeid) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; + if (pcp->count) { + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } } } - local_irq_restore(flags); } #endif @@ -743,13 +756,22 @@ void fastcall free_cold_page(struct page *page) free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1<<order) sub-pages: page[0..n] + * Each sub-page must be freed individually. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. 
+ */ +void split_page(struct page *page, unsigned int order) { int i; - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); - for(i = 0; i < (1 << order); i++) - clear_highpage(page + i); + BUG_ON(PageCompound(page)); + BUG_ON(!page_count(page)); + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); } /* @@ -795,14 +817,8 @@ again: put_cpu(); BUG_ON(bad_range(zone, page)); - if (prep_new_page(page, order)) + if (prep_new_page(page, order, gfp_flags)) goto again; - - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); - - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); return page; failed: @@ -926,7 +942,8 @@ restart: goto got_pg; do { - wakeup_kswapd(*z, order); + if (cpuset_zone_allowed(*z, gfp_mask)) + wakeup_kswapd(*z, order); } while (*(++z)); /* @@ -1183,7 +1200,7 @@ unsigned int nr_free_highpages (void) pg_data_t *pgdat; unsigned int pages = 0; - for_each_pgdat(pgdat) + for_each_online_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; return pages; @@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { - int cpu = 0; + unsigned cpu; memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - if (!cpu_isset(cpu, *cpumask)) - continue; + for_each_cpu_mask(cpu, *cpumask) { + unsigned long *in; + unsigned long *out; + unsigned off; + unsigned next_cpu; in = (unsigned long *)&per_cpu(page_states, cpu); - cpu = next_cpu(cpu, *cpumask); - - if (likely(cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, cpu)); + next_cpu = next_cpu(cpu, *cpumask); + if (likely(next_cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, next_cpu)); out = (unsigned long *)ret; for (off = 0; off < nr; off++) @@ -1327,7 +1342,7 @@ void get_zone_counts(unsigned long *active, *active = 0; 
*inactive = 0; *free = 0; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long l, m, n; __get_zone_counts(&l, &m, &n, pgdat); *active += l; @@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); @@ -2013,8 +2028,9 @@ static __meminit void zone_pcp_init(struct zone *zone) setup_pageset(zone_pcp(zone,cpu), batch); #endif } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + if (zone->present_pages) + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); } static __meminit void init_currently_empty_zone(struct zone *zone, @@ -2025,7 +2041,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone, zone_wait_table_init(zone, size); pgdat->nr_zones = zone_idx(zone) + 1; - zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); @@ -2153,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos) { pg_data_t *pgdat; loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) --node; return pgdat; @@ -2165,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) pg_data_t *pgdat = (pg_data_t *)arg; (*pos)++; - return pgdat->pgdat_next; + return next_online_pgdat(pgdat); } static void frag_stop(struct seq_file *m, void *arg) @@ -2466,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void) struct pglist_data *pgdat; int j, idx; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = 
pgdat->node_zones + j; unsigned long present_pages = zone->present_pages; @@ -2685,8 +2701,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); } - /* rounded up to nearest power of 2 in size */ - numentries = 1UL << (long_log2(numentries) + 1); + numentries = roundup_pow_of_two(numentries); /* limit allocation size to 1/16 total memory by default */ if (max == 0) { @@ -2729,3 +2744,44 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +/* + * pfn <-> page translation. out-of-line version. + * (see asm-generic/memory_model.h) + */ +#if defined(CONFIG_FLATMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return mem_map + (pfn - ARCH_PFN_OFFSET); +} +unsigned long page_to_pfn(struct page *page) +{ + return (page - mem_map) + ARCH_PFN_OFFSET; +} +#elif defined(CONFIG_DISCONTIGMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + int nid = arch_pfn_to_nid(pfn); + return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); +} +unsigned long page_to_pfn(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; +} +#elif defined(CONFIG_SPARSEMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; +} + +unsigned long page_to_pfn(struct page *page) +{ + long section_id = page_to_section(page); + return page - __section_mem_map_addr(__nr_to_section(section_id)); +} +#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ diff --git a/mm/readahead.c b/mm/readahead.c index 9f0b982..ba7db81 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -53,13 +53,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } +static inline void 
reset_ahead_window(struct file_ra_state *ra) +{ + /* + * ... but preserve ahead_start + ahead_size value, + * see 'recheck:' label in page_cache_readahead(). + * Note: We never use ->ahead_size as rvalue without + * checking ->ahead_start != 0 first. + */ + ra->ahead_size += ra->ahead_start; + ra->ahead_start = 0; +} + static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; ra->flags = 0; ra->size = 0; - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); return; } @@ -73,10 +84,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); - if (newsize <= max / 64) - newsize = newsize * newsize; + if (newsize <= max / 32) + newsize = newsize * 4; else if (newsize <= max / 4) - newsize = max / 4; + newsize = newsize * 2; else newsize = max; return newsize; @@ -427,8 +438,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, * congestion. The ahead window will any way be closed * in case we failed due to excessive page cache hits. */ - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); } return ret; @@ -521,11 +531,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * If we get here we are doing sequential IO and this was not the first * occurence (ie we have an existing window) */ - if (ra->ahead_start == 0) { /* no ahead window yet */ if (!make_ahead_window(mapping, filp, ra, 0)) - goto out; + goto recheck; } + /* * Already have an ahead window, check if we crossed into it. * If so, shift windows and issue a new ahead window. 
@@ -537,11 +547,16 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); +recheck: + /* prev_page shouldn't overrun the ahead window */ + ra->prev_page = min(ra->prev_page, + ra->ahead_start + ra->ahead_size - 1); } out: return ra->prev_page + 1; } +EXPORT_SYMBOL_GPL(page_cache_readahead); /* * handle_ra_miss() is called when it is known that a page which should have @@ -56,13 +56,11 @@ #include <asm/tlbflush.h> -//#define RMAP_DEBUG /* can be enabled only for debugging */ - -kmem_cache_t *anon_vma_cachep; +struct kmem_cache *anon_vma_cachep; static inline void validate_anon_vma(struct vm_area_struct *find_vma) { -#ifdef RMAP_DEBUG +#ifdef CONFIG_DEBUG_VM struct anon_vma *anon_vma = find_vma->anon_vma; struct vm_area_struct *vma; unsigned int mapcount = 0; @@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +static void anon_vma_ctor(void *data, struct kmem_cache *cachep, + unsigned long flags) { if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { @@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (page_mapcount(page) < 0) { +#ifdef CONFIG_DEBUG_VM + if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! 
(%d)\n", page_mapcount(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); } - +#endif BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, @@ -875,7 +875,7 @@ redirty: } #ifdef CONFIG_NUMA -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) { char *nodelist = strchr(value, ':'); int err = 1; @@ -2119,7 +2119,7 @@ failed: return err; } -static kmem_cache_t *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +static void init_once(void *foo, struct kmem_cache *cachep, + unsigned long flags) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; @@ -50,7 +50,7 @@ * The head array is strictly LIFO and should improve the cache hit rates. * On SMP, it additionally reduces the spinlock operations. * - * The c_cpuarray may not be read with enabled local interrupts - + * The c_cpuarray may not be read with enabled local interrupts - * it's changed with a smp_call_function(). 
* * SMP synchronization: @@ -94,6 +94,7 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compiler.h> +#include <linux/cpuset.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/kallsyms.h> @@ -170,15 +171,15 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif /* @@ -203,7 +204,8 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* Max number of objs-per-slab for caches which use off-slab slabs. * Needed to avoid a possible looping condition in cache_grow(). @@ -266,16 +268,17 @@ struct array_cache { unsigned int batchcount; unsigned int touched; spinlock_t lock; - void *entry[0]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - * [0] is for gcc 2.95. It should really be []. - */ + void *entry[0]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + * [0] is for gcc 2.95. It should really be []. + */ }; -/* bootstrap: The caches do not work without cpuarrays anymore, - * but the cpuarrays are allocated from the generic caches... 
+/* + * bootstrap: The caches do not work without cpuarrays anymore, but the + * cpuarrays are allocated from the generic caches... */ #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { @@ -291,13 +294,13 @@ struct kmem_list3 { struct list_head slabs_full; struct list_head slabs_free; unsigned long free_objects; - unsigned long next_reap; - int free_touched; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ spinlock_t list_lock; struct array_cache *shared; /* shared per node */ struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ }; /* @@ -310,10 +313,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define SIZE_L3 (1 + MAX_NUMNODES) /* - * This function must be completely optimized away if - * a constant is passed to it. Mostly the same as - * what is in linux/slab.h except it returns an - * index. + * This function must be completely optimized away if a constant is passed to + * it. Mostly the same as what is in linux/slab.h except it returns an index. 
*/ static __always_inline int index_of(const size_t size) { @@ -351,14 +352,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) parent->free_touched = 0; } -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ } while (0) -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ @@ -373,28 +374,30 @@ static void kmem_list3_init(struct kmem_list3 *parent) struct kmem_cache { /* 1) per-cpu data, touched during every alloc/free */ struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ unsigned int batchcount; unsigned int limit; unsigned int shared; + unsigned int buffer_size; -/* 2) touched by every alloc & free from the backend */ +/* 3) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - spinlock_t spinlock; -/* 3) cache_grow/shrink */ + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ + +/* 4) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsigned int gfporder; /* force GFP flags, e.g. 
GFP_DMA */ gfp_t gfpflags; - size_t colour; /* cache colouring range */ + size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ struct kmem_cache *slabp_cache; unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ + unsigned int dflags; /* dynamic flags */ /* constructor func */ void (*ctor) (void *, struct kmem_cache *, unsigned long); @@ -402,11 +405,11 @@ struct kmem_cache { /* de-constructor func */ void (*dtor) (void *, struct kmem_cache *, unsigned long); -/* 4) cache creation/removal */ +/* 5) cache creation/removal */ const char *name; struct list_head next; -/* 5) statistics */ +/* 6) statistics */ #if STATS unsigned long num_active; unsigned long num_allocations; @@ -438,8 +441,9 @@ struct kmem_cache { #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) #define BATCHREFILL_LIMIT 16 -/* Optimization question: fewer reaps means less - * probability for unnessary cpucache drain/refill cycles. +/* + * Optimization question: fewer reaps means less probability for unnessary + * cpucache drain/refill cycles. * * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. 
@@ -453,17 +457,19 @@ struct kmem_cache { #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) #define STATS_INC_REAPED(x) ((x)->reaped++) -#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) #define STATS_INC_ERR(x) ((x)->errors++) #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) #define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_SET_FREEABLE(x, i) \ - do { if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) - +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) @@ -478,9 +484,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) #define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) \ - do { } while (0) - +#define STATS_SET_FREEABLE(x, i) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) #define STATS_INC_FREEHIT(x) do { } while (0) @@ -488,7 +492,8 @@ struct kmem_cache { #endif #if DEBUG -/* Magic nums for obj red zoning. +/* + * Magic nums for obj red zoning. * Placed in the first word before and the first word after an obj. */ #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ @@ -499,7 +504,8 @@ struct kmem_cache { #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ -/* memory layout of objects: +/* + * memory layout of objects: * 0 : objp * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. 
This ensures that * the end of an object is aligned with the end of the real @@ -508,7 +514,8 @@ struct kmem_cache { * redzone word. * cachep->obj_offset: The real object. * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] */ static int obj_offset(struct kmem_cache *cachep) { @@ -552,8 +559,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif /* - * Maximum size of an obj (in 2^order pages) - * and absolute limit for the gfp order. + * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp + * order. */ #if defined(CONFIG_LARGE_ALLOCS) #define MAX_OBJ_ORDER 13 /* up to 32Mb */ @@ -573,9 +580,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define BREAK_GFP_ORDER_LO 0 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; -/* Functions for storing/retrieving the cachep and or slab from the - * global 'mem_map'. These are used to find the slab an obj belongs to. - * With kfree(), these are used to find the cache which an obj belongs to. +/* + * Functions for storing/retrieving the cachep and or slab from the page + * allocator. These are used to find the slab an obj belongs to. With kfree(), + * these are used to find the cache which an obj belongs to. 
*/ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) { @@ -584,6 +592,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct kmem_cache *)page->lru.next; } @@ -594,6 +604,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct slab *)page->lru.prev; } @@ -609,7 +621,21 @@ static inline struct slab *virt_to_slab(const void *obj) return page_get_slab(page); } -/* These are the default caches for kmalloc. Custom caches can have other sizes. */ +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, + unsigned int idx) +{ + return slab->s_mem + cache->buffer_size * idx; +} + +static inline unsigned int obj_to_index(struct kmem_cache *cache, + struct slab *slab, void *obj) +{ + return (unsigned)(obj - slab->s_mem) / cache->buffer_size; +} + +/* + * These are the default caches for kmalloc. Custom caches can have other sizes. 
+ */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include <linux/kmalloc_sizes.h> @@ -642,8 +668,6 @@ static struct kmem_cache cache_cache = { .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, .buffer_size = sizeof(struct kmem_cache), - .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG .obj_size = sizeof(struct kmem_cache), @@ -655,8 +679,8 @@ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* - * vm_enough_memory() looks at this to determine how many - * slab-allocated pages are possibly freeable under pressure + * vm_enough_memory() looks at this to determine how many slab-allocated pages + * are possibly freeable under pressure * * SLAB_RECLAIM_ACCOUNT turns this on per-slab */ @@ -675,7 +699,8 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); static void enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); static int __node_shrink(struct kmem_cache *cachep, int node); @@ -685,7 +710,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) +static inline struct kmem_cache *__find_general_cachep(size_t size, + gfp_t gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -720,8 +746,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); } -/* Calculate the number of objects and left-over bytes for a given - buffer size. */ +/* + * Calculate the number of objects and left-over bytes for a given buffer size. 
+ */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, size_t align, int flags, size_t *left_over, unsigned int *num) @@ -782,7 +809,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); @@ -804,7 +832,7 @@ static void init_reap_node(int cpu) node = next_node(cpu_to_node(cpu), node_online_map); if (node == MAX_NUMNODES) - node = 0; + node = first_node(node_online_map); __get_cpu_var(reap_node) = node; } @@ -870,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } +/* + * Transfer objects in one arraycache to another. + * Locking must be handled by the caller. + * + * Return the number of entries transferred. 
+ */ +static int transfer_objects(struct array_cache *to, + struct array_cache *from, unsigned int max) +{ + /* Figure out how many entries to transfer */ + int nr = min(min(from->avail, max), to->limit - to->avail); + + if (!nr) + return 0; + + memcpy(to->entry + to->avail, from->entry + from->avail -nr, + sizeof(void *) *nr); + + from->avail -= nr; + to->avail += nr; + to->touched = 1; + return nr; +} + #ifdef CONFIG_NUMA static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -906,10 +959,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) if (!ac_ptr) return; - for_each_node(i) kfree(ac_ptr[i]); - kfree(ac_ptr); } @@ -920,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (ac->avail) { spin_lock(&rl3->list_lock); + /* + * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects + * into the free lists and getting them back later. 
+ */ + transfer_objects(rl3->shared, ac, ac->limit); + free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; spin_unlock(&rl3->list_lock); @@ -935,15 +993,16 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail) { - spin_lock_irq(&ac->lock); + + if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { __drain_alien_cache(cachep, ac, node); spin_unlock_irq(&ac->lock); } } } -static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) +static void drain_alien_cache(struct kmem_cache *cachep, + struct array_cache **alien) { int i = 0; struct array_cache *ac; @@ -986,20 +1045,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: mutex_lock(&cache_chain_mutex); - /* we need to do this right in the beginning since + /* + * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. * kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ list_for_each_entry(cachep, &cache_chain, next) { - /* setup the size64 kmemlist for cpu before we can + /* + * Set up the size64 kmemlist for cpu before we can * begin anything. 
Make sure some other cpu on this * node has not already allocated this */ if (!cachep->nodelists[node]) { - if (!(l3 = kmalloc_node(memsize, - GFP_KERNEL, node))) + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) goto bad; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + @@ -1015,13 +1076,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, spin_lock_irq(&cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->nodelists[node]->list_lock); } - /* Now we can go ahead with allocating the shared array's - & array cache's */ + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1041,7 +1104,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!alien) goto bad; cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; BUG_ON(!l3); @@ -1061,7 +1123,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, } #endif spin_unlock_irq(&l3->list_lock); - kfree(shared); free_alien_cache(alien); } @@ -1083,7 +1144,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1150,7 +1210,7 @@ free_array_cache: #endif } return NOTIFY_OK; - bad: +bad: mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1160,7 +1220,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 
+ int nodeid) { struct kmem_list3 *ptr; @@ -1175,8 +1236,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no local_irq_enable(); } -/* Initialisation. - * Called after the gfp() functions have been enabled, and before smp_init(). +/* + * Initialisation. Called after the page allocator have been initialised and + * before smp_init(). */ void __init kmem_cache_init(void) { @@ -1201,9 +1263,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct kmem_cache - * structures of all caches, except cache_cache itself: cache_cache - * is statically allocated. + * 1) initialize the cache_cache cache: it contains the struct + * kmem_cache structures of all caches, except cache_cache itself: + * cache_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. @@ -1226,7 +1288,8 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; - cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, + cache_line_size()); for (order = 0; order < MAX_ORDER; order++) { cache_estimate(order, cache_cache.buffer_size, @@ -1245,24 +1308,26 @@ void __init kmem_cache_init(void) sizes = malloc_sizes; names = cache_names; - /* Initialize the caches that provide memory for the array cache - * and the kmem_list3 structures first. - * Without this, further allocations will bug + /* + * Initialize the caches that provide memory for the array cache and the + * kmem_list3 structures first. Without this, further allocations will + * bug. 
*/ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_PANIC), NULL, NULL); + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); - if (INDEX_AC != INDEX_L3) + if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, - NULL); + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } while (sizes->cs_size != ULONG_MAX) { /* @@ -1272,13 +1337,13 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ - if (!sizes->cs_cachep) + if (!sizes->cs_cachep) { sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS - | SLAB_PANIC), - NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } /* Inc off-slab bufctl limit until the ceiling is hit. */ if (!(OFF_SLAB(sizes->cs_cachep))) { @@ -1287,13 +1352,11 @@ void __init kmem_cache_init(void) } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_CACHE_DMA | - SLAB_PANIC), NULL, - NULL); - + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + SLAB_PANIC, + NULL, NULL); sizes++; names++; } @@ -1345,20 +1408,22 @@ void __init kmem_cache_init(void) struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + enable_cpucache(cachep); mutex_unlock(&cache_chain_mutex); } /* Done! 
*/ g_cpucache_up = FULL; - /* Register a cpu startup notifier callback - * that initializes cpu_cache_get for all new cpus + /* + * Register a cpu startup notifier callback that initializes + * cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); - /* The reap timers are started later, with a module init call: - * That part of the kernel is not yet operational. + /* + * The reap timers are started later, with a module init call: That part + * of the kernel is not yet operational. */ } @@ -1366,16 +1431,13 @@ static int __init cpucache_init(void) { int cpu; - /* - * Register the timers that return unneeded - * pages to gfp. + /* + * Register the timers that return unneeded pages to the page allocator */ for_each_online_cpu(cpu) - start_cpu_timer(cpu); - + start_cpu_timer(cpu); return 0; } - __initcall(cpucache_init); /* @@ -1402,7 +1464,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) atomic_add(i, &slab_reclaim_pages); add_page_state(nr_slab, i); while (i--) { - SetPageSlab(page); + __SetPageSlab(page); page++; } return addr; @@ -1418,8 +1480,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) const unsigned long nr_freed = i; while (i--) { - if (!TestClearPageSlab(page)) - BUG(); + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); page++; } sub_page_state(nr_slab, nr_freed); @@ -1489,9 +1551,8 @@ static void dump_line(char *data, int offset, int limit) { int i; printk(KERN_ERR "%03x:", offset); - for (i = 0; i < limit; i++) { + for (i = 0; i < limit; i++) printk(" %02x", (unsigned char)data[offset + i]); - } printk("\n"); } #endif @@ -1505,15 +1566,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) if (cachep->flags & SLAB_RED_ZONE) { printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { printk(KERN_ERR 
"Last user: [<%p>]", - *dbg_userword(cachep, objp)); + *dbg_userword(cachep, objp)); print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); + (unsigned long)*dbg_userword(cachep, objp)); printk("\n"); } realobj = (char *)objp + obj_offset(cachep); @@ -1546,8 +1607,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: start=%p, len=%d\n", + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1568,18 +1629,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) * exist: */ struct slab *slabp = virt_to_slab(objp); - int objnr; + unsigned int objnr; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); if (objnr) { - objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -1591,22 +1652,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #if DEBUG /** - * slab_destroy_objs - call the registered destructor for each object in - * a slab that is to be destroyed. + * slab_destroy_objs - destroy a slab and its objects + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * + * Call the registered destructor for each object in a slab that is being + * destroyed. 
*/ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 - && OFF_SLAB(cachep)) + if (cachep->buffer_size % PAGE_SIZE == 0 && + OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, - 1); + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -1631,7 +1695,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); (cachep->dtor) (objp, cachep, 0); } } @@ -1639,9 +1703,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) #endif /** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. - * The cache-lock is not held/needed. + * Before calling the slab must have been unlinked from the cache. The + * cache-lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { @@ -1662,8 +1730,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -/* For setting up all the kmem_list3s for cache whose buffer_size is same - as size of kmem_list3. */ +/* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. 
+ */ static void set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1689,13 +1759,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(struct kmem_cache *cachep, +static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { size_t left_over = 0; int gfporder; - for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { + for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { unsigned int num; size_t remainder; @@ -1730,12 +1800,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, /* * Acceptable internal fragmentation? */ - if ((left_over * 8) <= (PAGE_SIZE << gfporder)) + if (left_over * 8 <= (PAGE_SIZE << gfporder)) break; } return left_over; } +static void setup_cpu_cache(struct kmem_cache *cachep) +{ + if (g_cpucache_up == FULL) { + enable_cpucache(cachep); + return; + } + if (g_cpucache_up == NONE) { + /* + * Note: the first kmem_cache_create must create the cache + * that's used by kmalloc(24), otherwise the creation of + * further caches will BUG(). + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + + /* + * If the cache that's used by kmalloc(sizeof(kmem_list3)) is + * the first cache, then we need to set up all its list3s, + * otherwise the creation of further caches will BUG(). 
+ */ + set_up_list3s(cachep, SIZE_AC); + if (INDEX_AC == INDEX_L3) + g_cpucache_up = PARTIAL_L3; + else + g_cpucache_up = PARTIAL_AC; + } else { + cachep->array[smp_processor_id()] = + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + if (g_cpucache_up == PARTIAL_AC) { + set_up_list3s(cachep, SIZE_L3); + g_cpucache_up = PARTIAL_L3; + } else { + int node; + for_each_online_node(node) { + cachep->nodelists[node] = + kmalloc_node(sizeof(struct kmem_list3), + GFP_KERNEL, node); + BUG_ON(!cachep->nodelists[node]); + kmem_list3_init(cachep->nodelists[node]); + } + } + } + cachep->nodelists[numa_node_id()]->next_reap = + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; +} + /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -1751,9 +1875,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, * and the @dtor is run before the pages are handed back. * * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting - * unloaded. - * + * the module calling this has to destroy the cache before getting unloaded. + * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -1762,16 +1885,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check * for buffer overruns. * - * %SLAB_NO_REAP - Don't automatically reap this cache when we're under - * memory pressure. - * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. 
*/ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), + unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; @@ -1781,12 +1902,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Sanity checks... these are all serious usage bugs. */ - if ((!name) || - in_interrupt() || - (size < BYTES_PER_WORD) || + if (!name || in_interrupt() || (size < BYTES_PER_WORD) || (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { - printk(KERN_ERR "%s: Early error in slab %s\n", - __FUNCTION__, name); + printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, + name); BUG(); } @@ -1840,8 +1959,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 - || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) + if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -1853,13 +1971,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG_ON(dtor); /* - * Always checks flags, a caller might be expecting debug - * support which isn't available. + * Always checks flags, a caller might be expecting debug support which + * isn't available. */ if (flags & ~CREATE_MASK) BUG(); - /* Check that size is in terms of words. This is needed to avoid + /* + * Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. 
*/ @@ -1868,12 +1987,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, size &= ~(BYTES_PER_WORD - 1); } - /* calculate out the final buffer alignment: */ + /* calculate the final buffer alignment: */ + /* 1) arch recommendation: can be overridden for debug */ if (flags & SLAB_HWCACHE_ALIGN) { - /* Default alignment: as specified by the arch code. - * Except if an object is really small, then squeeze multiple - * objects into one cacheline. + /* + * Default alignment: as specified by the arch code. Except if + * an object is really small, then squeeze multiple objects into + * one cacheline. */ ralign = cache_line_size(); while (size <= ralign / 2) @@ -1893,16 +2014,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign > BYTES_PER_WORD) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } - /* 4) Store it. Note that the debug code below can reduce + /* + * 4) Store it. Note that the debug code below can reduce * the alignment to BYTES_PER_WORD. */ align = ralign; /* Get cache's description obj. */ - cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto oops; - memset(cachep, 0, sizeof(struct kmem_cache)); #if DEBUG cachep->obj_size = size; @@ -1978,7 +2099,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfpflags = 0; if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; - spin_lock_init(&cachep->spinlock); cachep->buffer_size = size; if (flags & CFLGS_OFF_SLAB) @@ -1988,64 +2108,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->name = name; - if (g_cpucache_up == FULL) { - enable_cpucache(cachep); - } else { - if (g_cpucache_up == NONE) { - /* Note: the first kmem_cache_create must create - * the cache that's used by kmalloc(24), otherwise - * the creation of further caches will BUG(). 
- */ - cachep->array[smp_processor_id()] = - &initarray_generic.cache; - - /* If the cache that's used by - * kmalloc(sizeof(kmem_list3)) is the first cache, - * then we need to set up all its list3s, otherwise - * the creation of further caches will BUG(). - */ - set_up_list3s(cachep, SIZE_AC); - if (INDEX_AC == INDEX_L3) - g_cpucache_up = PARTIAL_L3; - else - g_cpucache_up = PARTIAL_AC; - } else { - cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - - if (g_cpucache_up == PARTIAL_AC) { - set_up_list3s(cachep, SIZE_L3); - g_cpucache_up = PARTIAL_L3; - } else { - int node; - for_each_online_node(node) { - - cachep->nodelists[node] = - kmalloc_node(sizeof - (struct kmem_list3), - GFP_KERNEL, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep-> - nodelists[node]); - } - } - } - cachep->nodelists[numa_node_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - BUG_ON(!cpu_cache_get(cachep)); - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; - cachep->batchcount = 1; - cachep->limit = BOOT_CPUCACHE_ENTRIES; - } + setup_cpu_cache(cachep); /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); - oops: +oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); @@ -2089,30 +2156,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -/* - * Waits for all CPUs to execute func(). 
- */ -static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) -{ - check_irq_on(); - preempt_disable(); - - local_irq_disable(); - func(arg); - local_irq_enable(); - - if (smp_call_function(func, arg, 1, 1)) - BUG(); - - preempt_enable(); -} - -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node); +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, + int force, int node); static void do_drain(void *arg) { - struct kmem_cache *cachep = (struct kmem_cache *) arg; + struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_node_id(); @@ -2129,14 +2179,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - smp_call_function_all_cpus(do_drain, cachep); + on_each_cpu(do_drain, cachep, 1, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { - spin_lock_irq(&l3->list_lock); - drain_array_locked(cachep, l3->shared, 1, node); - spin_unlock_irq(&l3->list_lock); + drain_array(cachep, l3, l3->shared, 1, node); if (l3->alien) drain_alien_cache(cachep, l3->alien); } @@ -2260,16 +2308,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) /* NUMA: free the list3 structures */ for_each_online_node(i) { - if ((l3 = cachep->nodelists[i])) { + l3 = cachep->nodelists[i]; + if (l3) { kfree(l3->shared); free_alien_cache(l3->alien); kfree(l3); } } kmem_cache_free(&cache_cache, cachep); - unlock_cpu_hotplug(); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2292,7 +2339,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->inuse = 0; slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; - return slabp; } @@ -2307,7 +2353,7 @@ static void cache_init_objs(struct kmem_cache *cachep, int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); #if DEBUG 
/* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2320,9 +2366,9 @@ static void cache_init_objs(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_INACTIVE; } /* - * Constructors are not allowed to allocate memory from - * the same cache which they are a constructor for. - * Otherwise, deadlock. They must also be threaded. + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) cachep->ctor(objp + obj_offset(cachep), cachep, @@ -2336,8 +2382,8 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) - && cachep->flags & SLAB_POISON) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); #else @@ -2352,18 +2398,16 @@ static void cache_init_objs(struct kmem_cache *cachep, static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & SLAB_DMA) { - if (!(cachep->gfpflags & GFP_DMA)) - BUG(); - } else { - if (cachep->gfpflags & GFP_DMA) - BUG(); - } + if (flags & SLAB_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, + int nodeid) { - void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); + void *objp = index_to_obj(cachep, slabp, slabp->free); kmem_bufctl_t next; slabp->inuse++; @@ -2377,18 +2421,18 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, - int nodeid) +static 
void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, + void *objp, int nodeid) { - unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; + unsigned int objnr = obj_to_index(cachep, slabp, objp); #if DEBUG /* Verify that the slab belongs to the intended node */ WARN_ON(slabp->nodeid != nodeid); - if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { + if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + "'%s', objp %p\n", cachep->name, objp); BUG(); } #endif @@ -2397,14 +2441,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob slabp->inuse--; } -static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) +static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, + void *objp) { int i; struct page *page; /* Nasty!!!!!! I hope this is OK. */ - i = 1 << cachep->gfporder; page = virt_to_page(objp); + + i = 1; + if (likely(!PageCompound(page))) + i <<= cachep->gfporder; do { page_set_cache(page, cachep); page_set_slab(page, slabp); @@ -2425,8 +2473,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) unsigned long ctor_flags; struct kmem_list3 *l3; - /* Be lazy and only check for valid flags here, - * keeping it out of the critical path in kmem_cache_alloc(). + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). */ if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) BUG(); @@ -2467,14 +2516,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ kmem_flagcheck(cachep, flags); - /* Get mem for the objs. - * Attempt to allocate a physical page from 'nodeid', + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. 
*/ - if (!(objp = kmem_getpages(cachep, flags, nodeid))) + objp = kmem_getpages(cachep, flags, nodeid); + if (!objp) goto failed; /* Get slab management. */ - if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); + if (!slabp) goto opps1; slabp->nodeid = nodeid; @@ -2493,9 +2545,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) l3->free_objects += cachep->num; spin_unlock(&l3->list_lock); return 1; - opps1: +opps1: kmem_freepages(cachep, objp); - failed: +failed: if (local_flags & __GFP_WAIT) local_irq_disable(); return 0; @@ -2538,8 +2590,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, page = virt_to_page(objp); if (page_get_cache(page) != cachep) { - printk(KERN_ERR - "mismatch in kmem_cache_free: expected cache %p, got %p\n", + printk(KERN_ERR "mismatch in kmem_cache_free: expected " + "cache %p, got %p\n", page_get_cache(page), cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); printk(KERN_ERR "%p is %s.\n", page_get_cache(page), @@ -2549,13 +2601,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE - || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || + *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR "%p: redzone 1:0x%lx, " + "redzone 2:0x%lx.\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -2565,15 +2616,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - 
objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); + BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* Need to call the slab's constructor so the - * caller can perform a verify of its state (debugging). - * Called without the cache-lock held. + /* + * Need to call the slab's constructor so the caller can + * perform a verify of its state (debugging). Called without + * the cache-lock held. */ cachep->ctor(objp + obj_offset(cachep), cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); @@ -2584,9 +2636,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, */ cachep->dtor(objp + obj_offset(cachep), cachep, 0); } +#ifdef CONFIG_DEBUG_SLAB_LEAK + slab_bufctl(slabp)[objnr] = BUFCTL_FREE; +#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); @@ -2612,14 +2667,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) goto bad; } if (entries != cachep->num - slabp->inuse) { - bad: - printk(KERN_ERR - "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); +bad: + printk(KERN_ERR "slab: Internal list corruption detected in " + "cache '%s'(%d), slabp %p(%d). 
Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); for (i = 0; i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); i++) { - if ((i % 16) == 0) + if (i % 16 == 0) printk("\n%03x:", i); printk(" %02x", ((unsigned char *)slabp)[i]); } @@ -2641,12 +2696,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); ac = cpu_cache_get(cachep); - retry: +retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { - /* if there was little recent activity on this - * cache, then perform only a partial refill. - * Otherwise we could generate refill bouncing. + /* + * If there was little recent activity on this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. */ batchcount = BATCHREFILL_LIMIT; } @@ -2655,20 +2711,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); - if (l3->shared) { - struct array_cache *shared_array = l3->shared; - if (shared_array->avail) { - if (batchcount > shared_array->avail) - batchcount = shared_array->avail; - shared_array->avail -= batchcount; - ac->avail = batchcount; - memcpy(ac->entry, - &(shared_array->entry[shared_array->avail]), - sizeof(void *) * batchcount); - shared_array->touched = 1; - goto alloc_done; - } - } + /* See if we can refill from the shared array */ + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + goto alloc_done; + while (batchcount > 0) { struct list_head *entry; struct slab *slabp; @@ -2702,29 +2748,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) list_add(&slabp->list, &l3->slabs_partial); } - must_grow: +must_grow: l3->free_objects -= ac->avail; - alloc_done: +alloc_done: spin_unlock(&l3->list_lock); if (unlikely(!ac->avail)) { int x; x = cache_grow(cachep, flags, numa_node_id()); - // cache_grow can reenable interrupts, then ac could change. 
+ /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); - if (!x && ac->avail == 0) // no objects in sight? abort + if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; - if (!ac->avail) // objects refilled by interrupt? + if (!ac->avail) /* objects refilled by interrupt? */ goto retry; } ac->touched = 1; return ac->entry[--ac->avail]; } -static inline void -cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); #if DEBUG @@ -2733,8 +2779,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) } #if DEBUG -static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, - void *objp, void *caller) +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, void *caller) { if (!objp) return objp; @@ -2754,19 +2800,28 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags *dbg_userword(cachep, objp) = caller; if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE - || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } +#ifdef CONFIG_DEBUG_SLAB_LEAK + { + struct slab *slabp; + unsigned objnr; + + slabp = page_get_slab(virt_to_page(objp)); + objnr = 
(unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; + } +#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; @@ -2788,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; #ifdef CONFIG_NUMA - if (unlikely(current->mempolicy && !in_interrupt())) { - int nid = slab_node(current->mempolicy); - - if (nid != numa_node_id()) - return __cache_alloc_node(cachep, flags, nid); + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + objp = alternate_node_alloc(cachep, flags); + if (objp != NULL) + return objp; } #endif @@ -2809,8 +2863,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +static __always_inline void *__cache_alloc(struct kmem_cache *cachep, + gfp_t flags, void *caller) { unsigned long save_flags; void *objp; @@ -2828,9 +2882,32 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) #ifdef CONFIG_NUMA /* + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. 
+ */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt()) + return NULL; + nid_alloc = nid_here = numa_node_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_mem_spread_node(); + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); + if (nid_alloc != nid_here) + return __cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct list_head *entry; struct slab *slabp; @@ -2841,7 +2918,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node l3 = cachep->nodelists[nodeid]; BUG_ON(!l3); - retry: +retry: check_irq_off(); spin_lock(&l3->list_lock); entry = l3->slabs_partial.next; @@ -2868,16 +2945,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == BUFCTL_END) { + if (slabp->free == BUFCTL_END) list_add(&slabp->list, &l3->slabs_full); - } else { + else list_add(&slabp->list, &l3->slabs_partial); - } spin_unlock(&l3->list_lock); goto done; - must_grow: +must_grow: spin_unlock(&l3->list_lock); x = cache_grow(cachep, flags, nodeid); @@ -2885,7 +2961,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node return NULL; goto retry; - done: +done: return obj; } #endif @@ -2958,7 +3034,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } free_block(cachep, ac->entry, batchcount, node); - free_done: +free_done: #if STATS { int i = 0; @@ -2979,16 +3055,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) #endif spin_unlock(&l3->list_lock); ac->avail -= 
batchcount; - memmove(ac->entry, &(ac->entry[batchcount]), - sizeof(void *) * ac->avail); + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } /* - * __cache_free - * Release an obj back to its cache. If the obj has a constructed - * state, it must be in this state _before_ it is released. - * - * Called with disabled ints. + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp) { @@ -3007,9 +3079,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; - struct kmem_list3 *l3 = - cachep->nodelists[numa_node_id()]; + struct kmem_list3 *l3; + l3 = cachep->nodelists[numa_node_id()]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -3056,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** + * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * @cache: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache and set the allocated memory to zero. + * The flags are only relevant if the cache has no available objects. + */ +void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) +{ + void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); + if (ret) + memset(ret, 0, obj_size(cache)); + return ret; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + +/** * kmem_ptr_validate - check if an untrusted pointer might * be a slab entry. 
* @cachep: the cache we're checking against @@ -3093,7 +3182,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; - out: +out: return 0; } @@ -3119,7 +3208,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) local_irq_save(save_flags); if (nodeid == -1 || nodeid == numa_node_id() || - !cachep->nodelists[nodeid]) + !cachep->nodelists[nodeid]) ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); @@ -3148,6 +3237,7 @@ EXPORT_SYMBOL(kmalloc_node); * kmalloc - allocate memory * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. + * @caller: function caller for debug tracking of the caller * * kmalloc is the normal method of allocating memory * in the kernel. @@ -3181,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return __cache_alloc(cachep, flags, caller); } -#ifndef CONFIG_DEBUG_SLAB void *__kmalloc(size_t size, gfp_t flags) { +#ifndef CONFIG_DEBUG_SLAB return __do_kmalloc(size, flags, NULL); +#else + return __do_kmalloc(size, flags, __builtin_return_address(0)); +#endif } EXPORT_SYMBOL(__kmalloc); -#else - +#ifdef CONFIG_DEBUG_SLAB void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) { return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); - #endif #ifdef CONFIG_SMP @@ -3220,7 +3311,7 @@ void *__alloc_percpu(size_t size) * and we have no way of figuring out how to fix the array * that we have allocated then.... 
*/ - for_each_cpu(i) { + for_each_possible_cpu(i) { int node = cpu_to_node(i); if (node_online(node)) @@ -3236,7 +3327,7 @@ void *__alloc_percpu(size_t size) /* Catch derefs w/o wrappers */ return (void *)(~(unsigned long)pdata); - unwind_oom: +unwind_oom: while (--i >= 0) { if (!cpu_possible(i)) continue; @@ -3307,7 +3398,7 @@ void free_percpu(const void *objp) /* * We allocate for all cpus so we cannot use for online cpu here. */ - for_each_cpu(i) + for_each_possible_cpu(i) kfree(p->ptrs[i]); kfree(p); } @@ -3327,61 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep) EXPORT_SYMBOL_GPL(kmem_cache_name); /* - * This initializes kmem_list3 for all nodes. + * This initializes kmem_list3 or resizes varioius caches for all nodes. */ static int alloc_kmemlist(struct kmem_cache *cachep) { int node; struct kmem_list3 *l3; - int err = 0; + struct array_cache *new_shared; + struct array_cache **new_alien; for_each_online_node(node) { - struct array_cache *nc = NULL, *new; - struct array_cache **new_alien = NULL; -#ifdef CONFIG_NUMA - if (!(new_alien = alloc_alien_cache(node, cachep->limit))) + + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) goto fail; -#endif - if (!(new = alloc_arraycache(node, (cachep->shared * - cachep->batchcount), - 0xbaadf00d))) + + new_shared = alloc_arraycache(node, + cachep->shared*cachep->batchcount, + 0xbaadf00d); + if (!new_shared) { + free_alien_cache(new_alien); goto fail; - if ((l3 = cachep->nodelists[node])) { + } + + l3 = cachep->nodelists[node]; + if (l3) { + struct array_cache *shared = l3->shared; spin_lock_irq(&l3->list_lock); - if ((nc = cachep->nodelists[node]->shared)) - free_block(cachep, nc->entry, nc->avail, node); + if (shared) + free_block(cachep, shared->entry, + shared->avail, node); - l3->shared = new; - if (!cachep->nodelists[node]->alien) { + l3->shared = new_shared; + if (!l3->alien) { l3->alien = new_alien; new_alien = NULL; } l3->free_limit = (1 + nr_cpus_node(node)) * - 
cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); - kfree(nc); + kfree(shared); free_alien_cache(new_alien); continue; } - if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node))) + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + if (!l3) { + free_alien_cache(new_alien); + kfree(new_shared); goto fail; + } kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - l3->shared = new; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + l3->shared = new_shared; l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; cachep->nodelists[node] = l3; } - return err; - fail: - err = -ENOMEM; - return err; + return 0; + +fail: + if (!cachep->next.next) { + /* Cache is not active yet. Roll back what we did */ + node--; + while (node >= 0) { + if (cachep->nodelists[node]) { + l3 = cachep->nodelists[node]; + + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + cachep->nodelists[node] = NULL; + } + node--; + } + } + return -ENOMEM; } struct ccupdate_struct { @@ -3391,7 +3507,7 @@ struct ccupdate_struct { static void do_ccupdate_local(void *info) { - struct ccupdate_struct *new = (struct ccupdate_struct *)info; + struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); @@ -3401,16 +3517,17 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, - int shared) +/* Always called with the cache_chain_mutex held */ +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared) { struct ccupdate_struct new; int i, err; memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new.new[i] = - alloc_arraycache(cpu_to_node(i), limit, batchcount); + new.new[i] = 
alloc_arraycache(cpu_to_node(i), limit, + batchcount); if (!new.new[i]) { for (i--; i >= 0; i--) kfree(new.new[i]); @@ -3419,14 +3536,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount } new.cachep = cachep; - smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); check_irq_on(); - spin_lock(&cachep->spinlock); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - spin_unlock(&cachep->spinlock); for_each_online_cpu(i) { struct array_cache *ccold = new.new[i]; @@ -3447,15 +3562,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount return 0; } +/* Called with cache_chain_mutex held always */ static void enable_cpucache(struct kmem_cache *cachep) { int err; int limit, shared; - /* The head array serves three purposes: + /* + * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm * - reduce the number of spinlock operations. - * - reduce the number of linked list operations on the slab and + * - reduce the number of linked list operations on the slab and * bufctl chains: array operations are cheaper. * The numbers are guessed, we should auto-tune as described by * Bonwick. @@ -3471,7 +3588,8 @@ static void enable_cpucache(struct kmem_cache *cachep) else limit = 120; - /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound * allocation behaviour: Most allocs on one cpu, most free operations * on another cpu. For these cases, an efficient object passing between * cpus is necessary. This is provided by a shared array. The array @@ -3486,9 +3604,9 @@ static void enable_cpucache(struct kmem_cache *cachep) #endif #if DEBUG - /* With debugging enabled, large batchcount lead to excessively - * long periods with disabled local interrupts. 
Limit the - * batchcount + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount */ if (limit > 32) limit = 32; @@ -3499,23 +3617,32 @@ static void enable_cpucache(struct kmem_cache *cachep) cachep->name, -err); } -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node) +/* + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. Note that the l3 listlock also protects the array_cache + * if drain_array() is used on the shared array. + */ +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { int tofree; - check_spinlock_acquired_node(cachep, node); + if (!ac || !ac->avail) + return; if (ac->touched && !force) { ac->touched = 0; - } else if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) { - tofree = (ac->avail + 1) / 2; + } else { + spin_lock_irq(&l3->list_lock); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + free_block(cachep, ac->entry, tofree, node); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); } - free_block(cachep, ac->entry, tofree, node); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); + spin_unlock_irq(&l3->list_lock); } } @@ -3528,13 +3655,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * - * If we cannot acquire the cache chain mutex then just give up - we'll - * try again on the next iteration. + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. 
*/ static void cache_reap(void *unused) { struct list_head *walk; struct kmem_list3 *l3; + int node = numa_node_id(); if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ @@ -3550,65 +3678,72 @@ static void cache_reap(void *unused) struct slab *slabp; searchp = list_entry(walk, struct kmem_cache, next); - - if (searchp->flags & SLAB_NO_REAP) - goto next; - check_irq_on(); - l3 = searchp->nodelists[numa_node_id()]; + /* + * We only take the l3 lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ + l3 = searchp->nodelists[node]; + reap_alien(searchp, l3); - spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, cpu_cache_get(searchp), 0, - numa_node_id()); + drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. + */ if (time_after(l3->next_reap, jiffies)) - goto next_unlock; + goto next; l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - if (l3->shared) - drain_array_locked(searchp, l3->shared, 0, - numa_node_id()); + drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) { l3->free_touched = 0; - goto next_unlock; + goto next; } - tofree = - (l3->free_limit + 5 * searchp->num - - 1) / (5 * searchp->num); + tofree = (l3->free_limit + 5 * searchp->num - 1) / + (5 * searchp->num); do { + /* + * Do not lock if there are no free blocks. + */ + if (list_empty(&l3->slabs_free)) + break; + + spin_lock_irq(&l3->list_lock); p = l3->slabs_free.next; - if (p == &(l3->slabs_free)) + if (p == &(l3->slabs_free)) { + spin_unlock_irq(&l3->list_lock); break; + } slabp = list_entry(p, struct slab, list); BUG_ON(slabp->inuse); list_del(&slabp->list); STATS_INC_REAPED(searchp); - /* Safe to drop the lock. The slab is no longer - * linked to the cache. - * searchp cannot disappear, we hold + /* + * Safe to drop the lock. 
The slab is no longer linked + * to the cache. searchp cannot disappear, we hold * cache_chain_lock */ l3->free_objects -= searchp->num; spin_unlock_irq(&l3->list_lock); slab_destroy(searchp, slabp); - spin_lock_irq(&l3->list_lock); } while (--tofree > 0); - next_unlock: - spin_unlock_irq(&l3->list_lock); - next: +next: cond_resched(); } check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - /* Setup the next iteration */ + /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } @@ -3658,8 +3793,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) { struct kmem_cache *cachep = p; ++*pos; - return cachep->next.next == &cache_chain ? NULL - : list_entry(cachep->next.next, struct kmem_cache, next); + return cachep->next.next == &cache_chain ? + NULL : list_entry(cachep->next.next, struct kmem_cache, next); } static void s_stop(struct seq_file *m, void *p) @@ -3681,7 +3816,6 @@ static int s_show(struct seq_file *m, void *p) int node; struct kmem_list3 *l3; - spin_lock(&cachep->spinlock); active_objs = 0; num_slabs = 0; for_each_online_node(node) { @@ -3748,7 +3882,9 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); + %4lu %4lu %4lu %4lu", allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees); } /* cpu stats */ { @@ -3762,7 +3898,6 @@ static int s_show(struct seq_file *m, void *p) } #endif seq_putc(m, '\n'); - spin_unlock(&cachep->spinlock); return 0; } @@ -3820,13 +3955,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, - next); + struct kmem_cache *cachep; + cachep = list_entry(p, struct kmem_cache, next); if 
(!strcmp(cachep->name, kbuf)) { - if (limit < 1 || - batchcount < 1 || - batchcount > limit || shared < 0) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { res = 0; } else { res = do_tune_cpucache(cachep, limit, @@ -3840,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, res = count; return res; } + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void *leaks_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + mutex_lock(&cache_chain_mutex); + p = cache_chain.next; + while (n--) { + p = p->next; + if (p == &cache_chain) + return NULL; + } + return list_entry(p, struct kmem_cache, next); +} + +static inline int add_caller(unsigned long *n, unsigned long v) +{ + unsigned long *p; + int l; + if (!v) + return 1; + l = n[1]; + p = n + 2; + while (l) { + int i = l/2; + unsigned long *q = p + 2 * i; + if (*q == v) { + q[1]++; + return 1; + } + if (*q > v) { + l = i; + } else { + p = q + 2; + l -= i + 1; + } + } + if (++n[1] == n[0]) + return 0; + memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); + p[0] = v; + p[1] = 1; + return 1; +} + +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +{ + void *p; + int i; + if (n[0] == n[1]) + return; + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { + if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + return; + } +} + +static void show_symbol(struct seq_file *m, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char *modname; + const char *name; + unsigned long offset, size; + char namebuf[KSYM_NAME_LEN+1]; + + name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + + if (name) { + seq_printf(m, "%s+%#lx/%#lx", name, offset, size); + if (modname) + seq_printf(m, " [%s]", modname); + return; + } +#endif + seq_printf(m, "%p", (void *)address); +} + +static int leaks_show(struct seq_file *m, void 
*p) +{ + struct kmem_cache *cachep = p; + struct list_head *q; + struct slab *slabp; + struct kmem_list3 *l3; + const char *name; + unsigned long *n = m->private; + int node; + int i; + + if (!(cachep->flags & SLAB_STORE_USER)) + return 0; + if (!(cachep->flags & SLAB_RED_ZONE)) + return 0; + + /* OK, we can do it */ + + n[1] = 0; + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + check_irq_on(); + spin_lock_irq(&l3->list_lock); + + list_for_each(q, &l3->slabs_full) { + slabp = list_entry(q, struct slab, list); + handle_slab(n, cachep, slabp); + } + list_for_each(q, &l3->slabs_partial) { + slabp = list_entry(q, struct slab, list); + handle_slab(n, cachep, slabp); + } + spin_unlock_irq(&l3->list_lock); + } + name = cachep->name; + if (n[0] == n[1]) { + /* Increase the buffer size */ + mutex_unlock(&cache_chain_mutex); + m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + if (!m->private) { + /* Too bad, we are really out */ + m->private = n; + mutex_lock(&cache_chain_mutex); + return -ENOMEM; + } + *(unsigned long *)m->private = n[0] * 2; + kfree(n); + mutex_lock(&cache_chain_mutex); + /* Now make sure this entry will be retried */ + m->count = m->size; + return 0; + } + for (i = 0; i < n[1]; i++) { + seq_printf(m, "%s: %lu ", name, n[2*i+3]); + show_symbol(m, n[2*i+2]); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations slabstats_op = { + .start = leaks_start, + .next = s_next, + .stop = s_stop, + .show = leaks_show, +}; +#endif #endif /** @@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) +{ + void *ret = kmem_cache_alloc(c, flags); + if (ret) + memset(ret, 0, c->size); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + void kmem_cache_free(struct kmem_cache *c, void *b) { if (c->dtor) @@ -209,19 +209,18 @@ int lru_add_drain_all(void) */ void fastcall 
__page_cache_release(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); + if (PageLRU(page)) { + unsigned long flags; + struct zone *zone = page_zone(page); - spin_lock_irqsave(&zone->lru_lock, flags); - if (TestClearPageLRU(page)) + spin_lock_irqsave(&zone->lru_lock, flags); + BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); del_page_from_lru(zone, page); - if (page_count(page) != 0) - page = NULL; - spin_unlock_irqrestore(&zone->lru_lock, flags); - if (page) - free_hot_page(page); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + free_hot_page(page); } - EXPORT_SYMBOL(__page_cache_release); /* @@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; - struct zone *pagezone; if (unlikely(PageCompound(page))) { if (zone) { @@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) if (!put_page_testzero(page)) continue; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - if (TestClearPageLRU(page)) + if (PageLRU(page)) { + struct zone *pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); del_page_from_lru(zone, page); - if (page_count(page) == 0) { - if (!pagevec_add(&pages_to_free, page)) { + } + + if (!pagevec_add(&pages_to_free, page)) { + if (zone) { spin_unlock_irq(&zone->lru_lock); - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - zone = NULL; /* No lock is held */ + zone = NULL; } - } + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); + } } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; 
spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); - if (TestSetPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(PageActive(page)); + SetPageActive(page); add_page_to_active_list(zone, page); } if (zone) @@ -510,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc) spin_lock(&fbc->lock); ret = fbc->count; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { long *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3..d7af296 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> +#include <linux/migrate.h> #include <asm/pgtable.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d..39aa9d1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; struct swap_list_t swap_list = {-1, -1}; -struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER-1; + si->cluster_next = offset-SWAPFILE_CLUSTER+1; goto cluster; } if (unlikely(--latency_ration < 0)) { @@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry) } } +#ifdef CONFIG_SOFTWARE_SUSPEND +/* + * Find the swap type that corresponds to given device (if any) + * + * This is needed for software suspend and is done in such a way that 
inode + * aliasing is allowed. Returns the swap_info index, or -ENODEV if none. + */ +int swap_type_of(dev_t device) +{ + int i; + + spin_lock(&swap_lock); + for (i = 0; i < nr_swapfiles; i++) { + struct inode *inode; + + if (!(swap_info[i].flags & SWP_WRITEOK)) + continue; + if (!device) { + spin_unlock(&swap_lock); + return i; + } + inode = swap_info[i].swap_file->f_dentry->d_inode; + if (S_ISBLK(inode->i_mode) && + device == MKDEV(imajor(inode), iminor(inode))) { + spin_unlock(&swap_lock); + return i; + } + } + spin_unlock(&swap_lock); + return -ENODEV; +} + +/* + * Return either the total number of swap pages of given type, or the number + * of free pages of that type (depending on @free) + * + * This is needed for software suspend + */ +unsigned int count_swap_pages(int type, int free) +{ + unsigned int n = 0; + + if (type < nr_swapfiles) { + spin_lock(&swap_lock); + if (swap_info[type].flags & SWP_WRITEOK) { + n = swap_info[type].pages; + if (free) + n -= swap_info[type].inuse_pages; + } + spin_unlock(&swap_lock); + } + return n; +} +#endif + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -1,20 +1,22 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/err.h> +#include <asm/uaccess.h> /** - * kzalloc - allocate memory. The memory is set to zero. + * __kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. 
*/ -void *kzalloc(size_t size, gfp_t flags) +void *__kzalloc(size_t size, gfp_t flags) { - void *ret = kmalloc(size, flags); + void *ret = ____kmalloc(size, flags); if (ret) memset(ret, 0, size); return ret; } -EXPORT_SYMBOL(kzalloc); +EXPORT_SYMBOL(__kzalloc); /* * kstrdup - allocate space for and copy an existing string @@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp) return NULL; len = strlen(s) + 1; - buf = kmalloc(len, gfp); + buf = ____kmalloc(len, gfp); if (buf) memcpy(buf, s, len); return buf; } EXPORT_SYMBOL(kstrdup); + +/* + * strndup_user - duplicate an existing string from user space + * + * @s: The string to duplicate + * @n: Maximum number of bytes to copy, including the trailing NUL. + */ +char *strndup_user(const char __user *s, long n) +{ + char *p; + long length; + + length = strnlen_user(s, n); + + if (!length) + return ERR_PTR(-EFAULT); + + if (length > n) + return ERR_PTR(-EINVAL); + + p = kmalloc(length, GFP_KERNEL); + + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, s, length)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + p[length - 1] = '\0'; + + return p; +} +EXPORT_SYMBOL(strndup_user); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3a..acdf001 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,39 +33,21 @@ #include <linux/cpuset.h> #include <linux/notifier.h> #include <linux/rwsem.h> +#include <linux/delay.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; +#include "internal.h" struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - /* Incremented by the number of inactive pages that 
were scanned */ unsigned long nr_scanned; - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - unsigned long nr_mapped; /* From page_state */ - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - /* This context's GFP mask */ gfp_t gfp_mask; @@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - int ret = 0; + unsigned long ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; @@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -414,14 +398,15 @@ cannot_free: } /* - * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed + * shrink_page_list() returns the number of reclaimed pages */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - int reclaimed = 0; + unsigned long nr_reclaimed = 0; cond_resched(); 
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; + if (PageAnon(page) && !PageSwapCache(page)) if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; - } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - /* - * No unmapping if we do not swap - */ - if (!sc->may_swap) - goto keep_locked; - switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; @@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) free_it: unlock_page(page); - reclaimed++; + nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) __pagevec_release_nonlru(&freed_pvec); continue; @@ -579,483 +555,8 @@ keep: if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); - sc->nr_reclaimed += reclaimed; - return reclaimed; -} - -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. 
- */ -int putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - int count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; + return nr_reclaimed; } -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> - * Hirokazu Takahashi <taka@valinux.co.jp> - * Dave Hansen <haveblue@us.ibm.com> - * Christoph Lameter <clameter@sgi.com> - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. 
- */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. 
- * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. - */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. 
- * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); - return 0; -} -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. - */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - int retry; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. 
- */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). 
- */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (TestClearPageLRU(page)) { - ret = 1; - get_page(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; -} -#endif /* * zone->lru_lock is heavily contended. Some of the functions that @@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page) * * returns how many pages were moved onto *@dst. 
*/ -static int isolate_lru_pages(int nr_to_scan, struct list_head *src, - struct list_head *dst, int *scanned) +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) { - int nr_taken = 0; + unsigned long nr_taken = 0; struct page *page; - int scan = 0; + unsigned long scan; - while (scan++ < nr_to_scan && !list_empty(src)) { + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + list_del(&page->lru); - if (get_page_testone(page)) { + target = src; + if (likely(get_page_unless_zero(page))) { /* - * It is being freed elsewhere + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + ClearPageLRU(page); + target = dst; nr_taken++; - } + } /* else it is being freed elsewhere */ + + list_add(&page->lru, target); } *scanned = scan; @@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, } /* - * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed + * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number + * of reclaimed pages */ -static void shrink_cache(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; + unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0) { + do { struct page *page; - int nr_taken; - int nr_scan; - int nr_freed; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; nr_taken = isolate_lru_pages(sc->swap_cluster_max, &zone->inactive_list, @@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - - max_scan -= nr_scan; - nr_freed = shrink_list(&page_list, sc); - + nr_scanned += nr_scan; + nr_freed = shrink_page_list(&page_list, sc); + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); @@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) __mod_page_state_zone(zone, pgscan_direct, nr_scan); __mod_page_state_zone(zone, pgsteal, nr_freed); + if (nr_taken == 0) + goto done; + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. 
*/ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) add_page_to_active_list(zone, page); @@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - } - spin_unlock_irq(&zone->lru_lock); + } while (nr_scanned < max_scan); + spin_unlock(&zone->lru_lock); done: + local_irq_enable(); pagevec_release(&pvec); + return nr_reclaimed; } /* @@ -1188,13 +697,12 @@ done: * The downside is that we have to touch page->_count against each page. * But we had to alter page->flags anyway. */ -static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { - int pgmoved; + unsigned long pgmoved; int pgdeactivate = 0; - int pgscanned; - int nr_pages = sc->nr_to_scan; + unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) struct pagevec pvec; int reclaim_mapped = 0; - if (unlikely(sc->may_swap)) { + if (sc->may_swap) { long mapped_ratio; long distress; long swap_tendency; @@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - if (TestSetPageLRU(page)) - BUG(); - if (!TestClearPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { @@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) 
while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; @@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void -shrink_zone(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; + unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; atomic_inc(&zone->reclaim_in_progress); @@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. */ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + zone->nr_scan_active += (zone->nr_active >> priority) + 1; nr_active = zone->nr_scan_active; if (nr_active >= sc->swap_cluster_max) zone->nr_scan_active = 0; else nr_active = 0; - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; nr_inactive = zone->nr_scan_inactive; if (nr_inactive >= sc->swap_cluster_max) zone->nr_scan_inactive = 0; @@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) while (nr_active || nr_inactive) { if (nr_active) { - sc->nr_to_scan = min(nr_active, + nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); + nr_active -= nr_to_scan; + shrink_active_list(nr_to_scan, zone, sc); } if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, + nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); + 
nr_inactive -= nr_to_scan; + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); } } throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); + return nr_reclaimed; } /* @@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void -shrink_caches(struct zone **zones, struct scan_control *sc) +static unsigned long shrink_zones(int priority, struct zone **zones, + struct scan_control *sc) { + unsigned long nr_reclaimed = 0; int i; for (i = 0; zones[i] != NULL; i++) { @@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - shrink_zone(zone, sc); + nr_reclaimed += shrink_zone(priority, zone, sc); } + return nr_reclaimed; } /* @@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
*/ -int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; - int total_scanned = 0, total_reclaimed = 0; + unsigned long total_scanned = 0; + unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; unsigned long lru_pages = 0; int i; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + }; inc_page_state(allocstall); @@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = SWAP_CLUSTER_MAX; if (!priority) disable_swap_token(); - shrink_caches(zones, &sc); + nr_reclaimed += shrink_zones(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; + nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } total_scanned += sc.nr_scanned; - total_reclaimed += sc.nr_reclaimed; - if (total_reclaimed >= sc.swap_cluster_max) { + if (nr_reclaimed >= sc.swap_cluster_max) { ret = 1; goto out; } @@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { + if (total_scanned > sc.swap_cluster_max + + sc.swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 
0 : total_scanned); sc.may_writepage = 1; } @@ -1528,22 +1042,26 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) { - int to_free = nr_pages; + unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; - int total_scanned, total_reclaimed; + unsigned long total_scanned; + unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + }; loop_again: total_scanned = 0; - total_reclaimed = 0; - sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + nr_reclaimed = 0; + sc.may_writepage = !laptop_mode, sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1624,15 +1142,11 @@ scan: if (zone->prev_priority > priority) zone->prev_priority = priority; sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = nr_pages? 
nr_pages : SWAP_CLUSTER_MAX; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; + nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; @@ -1645,10 +1159,10 @@ scan: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) + total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > total_reclaimed) + if (nr_pages && to_free > nr_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ @@ -1665,7 +1179,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) + if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) break; } out: @@ -1679,7 +1193,7 @@ out: goto loop_again; } - return total_reclaimed; + return nr_reclaimed; } /* @@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) * Try to free `nr_pages' of memory, system-wide. Returns the number of freed * pages. 
*/ -int shrink_all_memory(int nr_pages) +unsigned long shrink_all_memory(unsigned long nr_pages) { pg_data_t *pgdat; - int nr_to_free = nr_pages; - int ret = 0; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; + unsigned retry = 2; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; - for_each_pgdat(pgdat) { - int freed; +repeat: + for_each_online_pgdat(pgdat) { + unsigned long freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; - if (nr_to_free <= 0) + if ((long)nr_to_free <= 0) break; } + if (retry-- && ret < nr_pages) { + blk_congestion_wait(WRITE, HZ/5); + goto repeat; + } current->reclaim_state = NULL; return ret; } @@ -1808,14 +1329,13 @@ int shrink_all_memory(int nr_pages) away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { pg_data_t *pgdat; cpumask_t mask; if (action == CPU_ONLINE) { - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { mask = node_to_cpumask(pgdat->node_id); if (any_online_cpu(mask) != NR_CPUS) /* One of our CPUs online: restore mask */ @@ -1829,10 +1349,17 @@ static int __devinit cpu_callback(struct notifier_block *nfb, static int __init kswapd_init(void) { pg_data_t *pgdat; + swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + for_each_online_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + read_lock(&tasklist_lock); + pgdat->kswapd = find_task_by_pid(pid); + read_unlock(&tasklist_lock); + } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1874,46 +1401,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; /* * Try to free up some pages from this zone through reclaim. 
*/ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc; - cpumask_t mask; - int node_id; - - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) - return 0; - - if (!(gfp_mask & __GFP_WAIT) || - zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0 || - (p->flags & PF_MEMALLOC)) - return 0; - - node_id = zone->zone_pgdat->node_id; - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) - return 0; - - sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); - sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = ZONE_RECLAIM_PRIORITY + 1; - sc.nr_mapped = read_page_state(nr_mapped); - sc.gfp_mask = gfp_mask; + int priority; + unsigned long nr_reclaimed = 0; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .nr_mapped = read_page_state(nr_mapped), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + }; disable_swap_token(); - - nr_pages = 1 << order; - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -1928,17 +1433,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing priorities * until we have enough memory freed. 
*/ + priority = ZONE_RECLAIM_PRIORITY; do { - sc.priority--; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); - } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); - - if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* - * shrink_slab does not currently allow us to determine - * how many pages were freed in the zone. So we just - * shake the slab and then go offnode for a single allocation. + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we just shake the slab + * a bit and then go off node for this particular allocation + * despite possibly having freed enough memory to allocate in + * this zone. If we freed local memory then the next + * allocations will be local again. * * shrink_slab will free memory on all zones and may take * a long time. @@ -1949,10 +1457,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - if (sc.nr_reclaimed == 0) + if (nr_reclaimed == 0) { + /* + * We were unable to reclaim enough pages to stay on node. We + * now allow off node accesses for a certain time period before + * trying again to reclaim pages from the local zone. + */ zone->last_unsuccessful_zone_reclaim = jiffies; + } - return sc.nr_reclaimed >= nr_pages; + return nr_reclaimed >= nr_pages; } -#endif +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + cpumask_t mask; + int node_id; + + /* + * Do not reclaim if there was a recent unsuccessful attempt at zone + * reclaim. In that case we let allocations go off node for the + * zone_reclaim_interval. Otherwise we would scan for each off-node + * page allocation. 
+ */ + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + /* + * Avoid concurrent zone reclaims, do not reclaim in a zone that does + * not have reclaimable pages and if we should not delay the allocation + * then do not scan. + */ + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0 || + (current->flags & PF_MEMALLOC)) + return 0; + + /* + * Only run zone reclaim on the local zone or on zones that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + return __zone_reclaim(zone, gfp_mask, order); +} +#endif |