From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 14 Mar 2006 19:50:19 -0800 Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED page migration currently simply retries a couple of times if try_to_unmap() fails without inspecting the return code. However, SWAP_FAIL indicates that the page is in a vma that has the VM_LOCKED flag set (if ignore_refs ==1). We can check for that return code and avoid retrying the migration. migrate_page_remove_references() now needs to return a reason why the failure occured. So switch migrate_page_remove_references to use -Exx style error messages. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ccf763..4fe7e3a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -700,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage, * the page. */ if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return 1; + return -EAGAIN; /* * Establish swap ptes for anonymous pages or destroy pte @@ -721,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage, * If the page was not migrated then the PageSwapCache bit * is still set and the operation may continue. */ - try_to_unmap(page, 1); + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> Permanent failure */ + return -EPERM; /* * Give up if we were unable to remove all mappings. */ if (page_mapcount(page)) - return 1; + return -EAGAIN; write_lock_irq(&mapping->tree_lock); @@ -738,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage, if (!page_mapping(page) || page_count(page) != nr_refs || *radix_pointer != page) { write_unlock_irq(&mapping->tree_lock); - return 1; + return -EAGAIN; } /* @@ -813,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy); */ int migrate_page(struct page *newpage, struct page *page) { + int rc; + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - if (migrate_page_remove_references(newpage, page, 2)) - return -EAGAIN; + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; migrate_page_copy(newpage, page); -- cgit v1.1 From 74c002410548c7cb1744b45d17a5fa21da515b63 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 14 Mar 2006 19:50:21 -0800 Subject: [PATCH] Consistent capabilites associated with MPOL_MOVE_ALL It seems that setting scheduling policy and priorities is also the kind of thing that might be performed in apps that also use the NUMA API, so it would seem consistent to use CAP_SYS_NICE for NUMA also. So use CAP_SYS_NICE for controlling migration permissions. Signed-off-by: Christoph Lameter Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 954981b..2a82060 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -748,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len, MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || mode > MPOL_MAX) return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) @@ -942,20 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, */ if ((current->euid != task->suid) && (current->euid != task->uid) && (current->uid != task->suid) && (current->uid != task->uid) && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } err = do_migrate_pages(mm, &old, &new, - capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: mmput(mm); return err; -- cgit v1.1 From 90036ee5938d89638e80f4d0d0700d0f2dbd4a6a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 16 Mar 2006 23:03:59 -0800 Subject: [PATCH] page migration: Fail with error if swap not setup Currently the migration of anonymous pages will silently fail if no swap is setup. This patch makes page migration functions check for available swap and fail with -ENODEV if no swap space is available. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2a82060..b21869a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -330,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; - /* Clear the LRU lists so pages can be isolated */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return ERR_PTR(-ENODEV); + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ lru_add_drain_all(); + } first = find_vma(mm, start); if (!first) -- cgit v1.1 From 5b40dc780ed996162f3af8712eb03beb24dcdbef Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 16 Mar 2006 23:04:07 -0800 Subject: [PATCH] fix race in pagevec_strip? We can call try_to_release_page() with PagePrivate off and a valid page->mapping This may cause all sorts of trouble for the filesystem *_releasepage() handlers. XFS bombs out in that case. Lock the page before checking for page private. Signed-off-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index e9ec06d..b524ea9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -393,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec) struct page *page = pvec->pages[i]; if (PagePrivate(page) && !TestSetPageLocked(page)) { - try_to_release_page(page, 0); + if (PagePrivate(page)) + try_to_release_page(page, 0); unlock_page(page); } } -- cgit v1.1 From 6f5e6b9e69bf043074a0edabe3d271899c34eb79 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 16 Mar 2006 23:04:09 -0800 Subject: [PATCH] fix free swap cache latency Lee Revell reported 28ms latency when process with lots of swapped memory exits. 2.6.15 introduced a latency regression when unmapping: in accounting the zap_work latency breaker, pte_none counted 1, pte_present PAGE_SIZE, but a swap entry counted nothing at all. We think of pages present as the slow case, but Lee's trace shows that free_swap_and_cache's radix tree lookup can make a lot of work - and we could have been doing it many thousands of times without a latency break. Move the zap_work update up to account swap entries like pages present. This does account non-linear pte_file entries, and unmap_mapping_range skipping over swap entries, by the same amount even though they're quick: but neither of those cases deserves complicating the code (and they're treated no worse than they were in 2.6.14). Signed-off-by: Hugh Dickins Acked-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9abc600..85e80a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -623,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, (*zap_work)--; continue; } + + (*zap_work) -= PAGE_SIZE; + if (pte_present(ptent)) { struct page *page; - (*zap_work) -= PAGE_SIZE; - page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* -- cgit v1.1 From b40607fc02f8248828d52d88f91b7d68df1933b0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:07:39 -0800 Subject: [PATCH] __get_page_state() cpumask cleanup and fix __get_page_state() has an open-coded for_each_cpu_mask() loop in it. Tidy that up, then notice that the code was buggy: while (cpu < NR_CPUS) { unsigned long *in, *out, off; if (!cpu_isset(cpu, *cpumask)) continue; an obvious infinite loop. I guess we just never call it with a holey cpu mask. Even after my cpumask size-reduction work, this patch increases code size :( Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd48..6177586 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1214,24 +1214,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { - int cpu = 0; + unsigned cpu; memset(ret, 0, nr * sizeof(unsigned long)); cpus_and(*cpumask, *cpumask, cpu_online_map); - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - if (!cpu_isset(cpu, *cpumask)) - continue; + for_each_cpu_mask(cpu, *cpumask) { + unsigned long *in; + unsigned long *out; + unsigned off; + unsigned next_cpu; in = (unsigned long *)&per_cpu(page_states, cpu); - cpu = next_cpu(cpu, *cpumask); - - if (likely(cpu < NR_CPUS)) - prefetch(&per_cpu(page_states, cpu)); + next_cpu = next_cpu(cpu, *cpumask); + if (likely(next_cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, next_cpu)); out = (unsigned long *)ret; for (off = 0; off < nr; off++) -- cgit v1.1 From 46453a6e194a8c55fe6cf3dc8e1c4f24e2abc013 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:07:58 -0800 Subject: [PATCH] mm: never ClearPageLRU released pages If vmscan finds a zero refcount page on the lru list, never ClearPageLRU it. This means the release code need not hold ->lru_lock to stabilise PageLRU, so that lock may be skipped entirely when releasing !PageLRU pages (because we know PageLRU won't have been temporarily cleared by vmscan, which was previously guaranteed by holding the lock to synchronise against vmscan). Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 52 +++++++++++++++++++++++++++------------------------- mm/vmscan.c | 18 +++++++++++------- 2 files changed, 38 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index b524ea9..3045a0f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,19 +209,18 @@ int lru_add_drain_all(void) */ void fastcall __page_cache_release(struct page *page) { - unsigned long flags; - struct zone *zone = page_zone(page); + if (PageLRU(page)) { + unsigned long flags; + struct zone *zone = page_zone(page); - spin_lock_irqsave(&zone->lru_lock, flags); - if (TestClearPageLRU(page)) + spin_lock_irqsave(&zone->lru_lock, flags); + if (!TestClearPageLRU(page)) + BUG(); del_page_from_lru(zone, page); - if (page_count(page) != 0) - page = NULL; - spin_unlock_irqrestore(&zone->lru_lock, flags); - if (page) - free_hot_page(page); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + free_hot_page(page); } - EXPORT_SYMBOL(__page_cache_release); /* @@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; - struct zone *pagezone; if (unlikely(PageCompound(page))) { if (zone) { @@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) if (!put_page_testzero(page)) continue; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - if (TestClearPageLRU(page)) + if (PageLRU(page)) { + struct zone *pagezone = page_zone(page); + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + if (!TestClearPageLRU(page)) + BUG(); del_page_from_lru(zone, page); - if (page_count(page) == 0) { - if (!pagevec_add(&pages_to_free, page)) { + } + + if (!pagevec_add(&pages_to_free, page)) { + if (zone) { spin_unlock_irq(&zone->lru_lock); - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - zone = NULL; /* No lock is held */ + zone = NULL; } - } + __pagevec_free(&pages_to_free); + pagevec_reinit(&pages_to_free); + } } if (zone) spin_unlock_irq(&zone->lru_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3a..acb7611 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1085,21 +1085,25 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - if (!TestClearPageLRU(page)) - BUG(); list_del(&page->lru); - if (get_page_testone(page)) { + if (unlikely(get_page_testone(page))) { /* * It is being freed elsewhere */ __put_page(page); - SetPageLRU(page); list_add(&page->lru, src); continue; - } else { - list_add(&page->lru, dst); - nr_taken++; } + + /* + * Be careful not to clear PageLRU until after we're sure + * the page is not being freed elsewhere -- the page release + * code relies on it. + */ + if (!TestClearPageLRU(page)) + BUG(); + list_add(&page->lru, dst); + nr_taken++; } *scanned = scan; -- cgit v1.1 From 8d438f96d2b8eade6cbcd8adfc22dae6f5cbd6c0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:07:59 -0800 Subject: [PATCH] mm: PageLRU no testset PG_lru is protected by zone->lru_lock. It does not need TestSet/TestClear operations. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 16 ++++++++-------- mm/vmscan.c | 20 +++++++++++--------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 3045a0f..985324e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -214,8 +214,8 @@ void fastcall __page_cache_release(struct page *page) struct zone *zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + ClearPageLRU(page); del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -265,8 +265,8 @@ void release_pages(struct page **pages, int nr, int cold) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (!TestClearPageLRU(page)) - BUG(); + BUG_ON(!PageLRU(page)); + ClearPageLRU(page); del_page_from_lru(zone, page); } @@ -345,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -372,8 +372,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); if (TestSetPageActive(page)) BUG(); add_page_to_active_list(zone, page); diff --git a/mm/vmscan.c b/mm/vmscan.c index acb7611..40fb378 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1042,9 +1042,10 @@ int isolate_lru_page(struct page *page) if (PageLRU(page)) { struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (TestClearPageLRU(page)) { + if (PageLRU(page)) { ret = 1; get_page(page); + ClearPageLRU(page); if (PageActive(page)) del_page_from_active_list(zone, page); else @@ -1085,6 +1086,8 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); + BUG_ON(!PageLRU(page)); + list_del(&page->lru); if (unlikely(get_page_testone(page))) { /* @@ -1100,8 +1103,7 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, * the page is not being freed elsewhere -- the page release * code relies on it. */ - if (!TestClearPageLRU(page)) - BUG(); + ClearPageLRU(page); list_add(&page->lru, dst); nr_taken++; } @@ -1156,8 +1158,8 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) add_page_to_active_list(zone, page); @@ -1276,8 +1278,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); if (!TestClearPageActive(page)) BUG(); list_move(&page->lru, &zone->inactive_list); @@ -1305,8 +1307,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; -- cgit v1.1 From 4c84cacfa424264f7ad5287298d3ea4a3e935278 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:00 -0800 Subject: [PATCH] mm: PageActive no testset PG_active is protected by zone->lru_lock, it does not need TestSet/TestClear operations. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 4 ++-- mm/vmscan.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 985324e..cf88226 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -374,8 +374,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec) } BUG_ON(PageLRU(page)); SetPageLRU(page); - if (TestSetPageActive(page)) - BUG(); + BUG_ON(PageActive(page)); + SetPageActive(page); add_page_to_active_list(zone, page); } if (zone) diff --git a/mm/vmscan.c b/mm/vmscan.c index 40fb378..8e477b1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1280,8 +1280,9 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) prefetchw_prev_lru_page(page, &l_inactive, flags); BUG_ON(PageLRU(page)); SetPageLRU(page); - if (!TestClearPageActive(page)) - BUG(); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { -- cgit v1.1 From 674539115cc88473f623581e1d53c0e2ecef2179 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:00 -0800 Subject: [PATCH] mm: less atomic ops In the page release paths, we can be sure that nobody will mess with our page->flags because the refcount has dropped to 0. So no need for atomic operations here. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index cf88226..91b7e20 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -215,7 +215,7 @@ void fastcall __page_cache_release(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); BUG_ON(!PageLRU(page)); - ClearPageLRU(page); + __ClearPageLRU(page); del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -266,7 +266,7 @@ void release_pages(struct page **pages, int nr, int cold) spin_lock_irq(&zone->lru_lock); } BUG_ON(!PageLRU(page)); - ClearPageLRU(page); + __ClearPageLRU(page); del_page_from_lru(zone, page); } -- cgit v1.1 From 5e9dace8d386def04219134d7160e8a778824764 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:01 -0800 Subject: [PATCH] mm: page_alloc less atomics More atomic operation removal from page allocator Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6177586..1029198 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -190,7 +190,7 @@ static void prep_compound_page(struct page *page, unsigned long order) for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - SetPageCompound(p); + __SetPageCompound(p); set_page_private(p, (unsigned long)page); } } @@ -209,7 +209,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageCompound(p) | (page_private(p) != (unsigned long)page))) bad_page(page); - ClearPageCompound(p); + __ClearPageCompound(p); } } -- cgit v1.1 From f205b2fe62d321403525065a4cb31b6bff1bbe53 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:02 -0800 Subject: [PATCH] mm: slab less atomics Atomic operation removal from slab Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d0bd7f0..5988adf 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1402,7 +1402,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) atomic_add(i, &slab_reclaim_pages); add_page_state(nr_slab, i); while (i--) { - SetPageSlab(page); + __SetPageSlab(page); page++; } return addr; @@ -1418,8 +1418,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) const unsigned long nr_freed = i; while (i--) { - if (!TestClearPageSlab(page)) - BUG(); + BUG_ON(!PageSlab(page)); + __ClearPageSlab(page); page++; } sub_page_state(nr_slab, nr_freed); -- cgit v1.1 From 7c8ee9a86340db686cd4314e9944dc9b6111bda9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:03 -0800 Subject: [PATCH] mm: simplify vmscan vs release refcounting The VM has an interesting race where a page refcount can drop to zero, but it is still on the LRU lists for a short time. This was solved by testing a 0->1 refcount transition when picking up pages from the LRU, and dropping the refcount in that case. Instead, use atomic_add_unless to ensure we never pick up a 0 refcount page from the LRU, thus a 0 refcount page will never have its refcount elevated until it is allocated again. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e477b1..e21bab4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1083,29 +1083,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, int scan = 0; while (scan++ < nr_to_scan && !list_empty(src)) { + struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); BUG_ON(!PageLRU(page)); list_del(&page->lru); - if (unlikely(get_page_testone(page))) { + target = src; + if (likely(get_page_unless_zero(page))) { /* - * It is being freed elsewhere + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. */ - __put_page(page); - list_add(&page->lru, src); - continue; - } + ClearPageLRU(page); + target = dst; + nr_taken++; + } /* else it is being freed elsewhere */ - /* - * Be careful not to clear PageLRU until after we're sure - * the page is not being freed elsewhere -- the page release - * code relies on it. - */ - ClearPageLRU(page); - list_add(&page->lru, dst); - nr_taken++; + list_add(&page->lru, target); } *scanned = scan; -- cgit v1.1 From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:05 -0800 Subject: [PATCH] mm: split highorder pages Have an explicit mm call to split higher order pages into individual pages. Should help to avoid bugs and be more explicit about the code's intention. Signed-off-by: Nick Piggin Cc: Russell King Cc: David Howells Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Zankel Signed-off-by: Yoichi Yuasa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +--- mm/page_alloc.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 85e80a5..6af555c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1221,9 +1221,7 @@ out: * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself - * (which is mainly an issue of doing "set_page_count(page, 1)" for - * each sub-page, and then freeing them one by one when you free - * them rather than freeing it as a compound page). + * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1029198..fc65e87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -752,6 +752,28 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_MMU +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1< 0 path. Saves a branch -- cgit v1.1 From 545b1ea9bfa5a8ca9af33d63144bd4f2faaea8dd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:07 -0800 Subject: [PATCH] mm: cleanup bootmem The bootmem code added to page_alloc.c duplicated some page freeing code that it really doesn't need to because it is not so performance critical. While we're here, make prefetching work properly by actually prefetching the page we're about to use before prefetching ahead to the next one (ie. get the most important transaction started first). Also prefetch just a single page ahead rather than leaving a gap of 16. Jack Steiner reported no problems with SGI's ia64 simulator. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc65e87..7aa0181 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; -static void fastcall free_hot_cold_page(struct page *page, int cold); static void __free_pages_ok(struct page *page, unsigned int order); /* @@ -448,28 +447,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - - free_hot_cold_page(page, 0); + set_page_refs(page, 0); + __free_page(page); } else { - LIST_HEAD(list); int loop; + prefetchw(page); for (loop = 0; loop < BITS_PER_LONG; loop++) { struct page *p = &page[loop]; - if (loop + 16 < BITS_PER_LONG) - prefetchw(p + 16); + if (loop + 1 < BITS_PER_LONG) + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } - arch_free_page(page, order); - - mod_page_state(pgfree, 1 << order); - - list_add(&page->lru, &list); - kernel_map_pages(page, 1 << order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); + set_page_refs(page, order); + __free_pages(page, order); } } -- cgit v1.1 From a482289d46587ffcda4c85aab109fb74910d7a48 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:08 -0800 Subject: [PATCH] hugepage allocator cleanup Insert "fresh" huge pages into the hugepage allocator by the same means as they are freed back into it. This reduces code size and allows enqueue_huge_page to be inlined into the hugepage free fastpath. Eliminate occurances of hugepages on the free list with non-zero refcount. This can allow stricter refcount checks in future. Also required for lockless pagecache. Signed-off-by: Nick Piggin "This patch also eliminates a leak "cleaned up" by re-clobbering the refcount on every allocation from the hugepage freelists. With respect to the lockless pagecache, the crucial aspect is to eliminate unconditional set_page_count() to 0 on pages with potentially nonzero refcounts, though closer inspection suggests the assignments removed are entirely spurious." Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5087077..39d49ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -64,7 +64,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } -static struct page *alloc_fresh_huge_page(void) +static int alloc_fresh_huge_page(void) { static int nid = 0; struct page *page; @@ -72,12 +72,15 @@ static struct page *alloc_fresh_huge_page(void) HUGETLB_PAGE_ORDER); nid = (nid + 1) % num_online_nodes(); if (page) { + page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ + return 1; } - return page; + return 0; } void free_huge_page(struct page *page) @@ -85,7 +88,6 @@ void free_huge_page(struct page *page) BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); - page[1].lru.next = NULL; /* reset dtor */ spin_lock(&hugetlb_lock); enqueue_huge_page(page); @@ -105,7 +107,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) } spin_unlock(&hugetlb_lock); set_page_count(page, 1); - page[1].lru.next = (void *)free_huge_page; /* set dtor */ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) clear_user_highpage(&page[i], addr); return page; @@ -114,7 +115,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) static int __init hugetlb_init(void) { unsigned long i; - struct page *page; if (HPAGE_SHIFT == 0) return 0; @@ -123,12 +123,8 @@ static int __init hugetlb_init(void) INIT_LIST_HEAD(&hugepage_freelists[i]); for (i = 0; i < max_huge_pages; ++i) { - page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) break; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } max_huge_pages = free_huge_pages = nr_huge_pages = i; printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); @@ -154,8 +150,8 @@ static void update_and_free_page(struct page *page) page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); - set_page_count(&page[i], 0); } + page[1].lru.next = NULL; set_page_count(page, 1); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -188,12 +184,8 @@ static inline void try_to_free_low(unsigned long count) static unsigned long set_max_huge_pages(unsigned long count) { while (count > nr_huge_pages) { - struct page *page = alloc_fresh_huge_page(); - if (!page) + if (!alloc_fresh_huge_page()) return nr_huge_pages; - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); } if (count >= nr_huge_pages) return nr_huge_pages; -- cgit v1.1 From 8fea4e96a8f29ccc34c244f54574680ce9b43631 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 22 Mar 2006 00:08:10 -0800 Subject: [PATCH] slab: object to index mapping cleanup Clean up the object to index mapping that has been spread around mm/slab.c. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5988adf..3d18b71 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -609,6 +609,18 @@ static inline struct slab *virt_to_slab(const void *obj) return page_get_slab(page); } +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, + unsigned int idx) +{ + return slab->s_mem + cache->buffer_size * idx; +} + +static inline unsigned int obj_to_index(struct kmem_cache *cache, + struct slab *slab, void *obj) +{ + return (unsigned)(obj - slab->s_mem) / cache->buffer_size; +} + /* These are the default caches for kmalloc. Custom caches can have other sizes. */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, @@ -1568,18 +1580,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) * exist: */ struct slab *slabp = virt_to_slab(objp); - int objnr; + unsigned int objnr; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); if (objnr) { - objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; + objp = index_to_obj(cachep, slabp, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -1598,7 +1610,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1631,7 +1643,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) if (cachep->dtor) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); (cachep->dtor) (objp, cachep, 0); } } @@ -2307,7 +2319,7 @@ static void cache_init_objs(struct kmem_cache *cachep, int i; for (i = 0; i < cachep->num; i++) { - void *objp = slabp->s_mem + cachep->buffer_size * i; + void *objp = index_to_obj(cachep, slabp, i); #if DEBUG /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) @@ -2363,7 +2375,7 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) { - void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); + void *objp = index_to_obj(cachep, slabp, slabp->free); kmem_bufctl_t next; slabp->inuse++; @@ -2380,7 +2392,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, int nodeid) { - unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; + unsigned int objnr = obj_to_index(cachep, slabp, objp); #if DEBUG /* Verify that the slab belongs to the intended node */ @@ -2565,10 +2577,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = caller; - objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + objnr = obj_to_index(cachep, slabp, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); + BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); if (cachep->flags & SLAB_DEBUG_INITIAL) { /* Need to call the slab's constructor so the -- cgit v1.1 From f30cf7d13eee420f5249b4d7709b46570098ab92 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 22 Mar 2006 00:08:11 -0800 Subject: [PATCH] slab: extract setup_cpu_cache Extract setup_cpu_cache() function from kmem_cache_create() to make the latter a little less complex. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 109 +++++++++++++++++++++++++++++++------------------------------- 1 file changed, 55 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3d18b71..4d5c4b9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1748,6 +1748,60 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } +static void setup_cpu_cache(struct kmem_cache *cachep) +{ + if (g_cpucache_up == FULL) { + enable_cpucache(cachep); + return; + } + if (g_cpucache_up == NONE) { + /* + * Note: the first kmem_cache_create must create the cache + * that's used by kmalloc(24), otherwise the creation of + * further caches will BUG(). + */ + cachep->array[smp_processor_id()] = &initarray_generic.cache; + + /* + * If the cache that's used by kmalloc(sizeof(kmem_list3)) is + * the first cache, then we need to set up all its list3s, + * otherwise the creation of further caches will BUG(). + */ + set_up_list3s(cachep, SIZE_AC); + if (INDEX_AC == INDEX_L3) + g_cpucache_up = PARTIAL_L3; + else + g_cpucache_up = PARTIAL_AC; + } else { + cachep->array[smp_processor_id()] = + kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); + + if (g_cpucache_up == PARTIAL_AC) { + set_up_list3s(cachep, SIZE_L3); + g_cpucache_up = PARTIAL_L3; + } else { + int node; + for_each_online_node(node) { + cachep->nodelists[node] = + kmalloc_node(sizeof(struct kmem_list3), + GFP_KERNEL, node); + BUG_ON(!cachep->nodelists[node]); + kmem_list3_init(cachep->nodelists[node]); + } + } + } + cachep->nodelists[numa_node_id()]->next_reap = + jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; +} + /** * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -2000,60 +2054,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->name = name; - if (g_cpucache_up == FULL) { - enable_cpucache(cachep); - } else { - if (g_cpucache_up == NONE) { - /* Note: the first kmem_cache_create must create - * the cache that's used by kmalloc(24), otherwise - * the creation of further caches will BUG(). - */ - cachep->array[smp_processor_id()] = - &initarray_generic.cache; - - /* If the cache that's used by - * kmalloc(sizeof(kmem_list3)) is the first cache, - * then we need to set up all its list3s, otherwise - * the creation of further caches will BUG(). - */ - set_up_list3s(cachep, SIZE_AC); - if (INDEX_AC == INDEX_L3) - g_cpucache_up = PARTIAL_L3; - else - g_cpucache_up = PARTIAL_AC; - } else { - cachep->array[smp_processor_id()] = - kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - - if (g_cpucache_up == PARTIAL_AC) { - set_up_list3s(cachep, SIZE_L3); - g_cpucache_up = PARTIAL_L3; - } else { - int node; - for_each_online_node(node) { - - cachep->nodelists[node] = - kmalloc_node(sizeof - (struct kmem_list3), - GFP_KERNEL, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep-> - nodelists[node]); - } - } - } - cachep->nodelists[numa_node_id()]->next_reap = - jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - BUG_ON(!cpu_cache_get(cachep)); - cpu_cache_get(cachep)->avail = 0; - cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - cpu_cache_get(cachep)->batchcount = 1; - cpu_cache_get(cachep)->touched = 0; - cachep->batchcount = 1; - cachep->limit = BOOT_CPUCACHE_ENTRIES; - } + setup_cpu_cache(cachep); /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); -- cgit v1.1 From a737b3e2fcf96f576fa3e2e382236d9ee94f383f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:11 -0800 Subject: [PATCH] slab cleanup slab.c has become a bit revolting again. Try to repair it. - Coding style fixes - Don't do assignments-in-if-statements. - Don't typecast assignments to/from void* Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 596 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 304 insertions(+), 292 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4d5c4b9..7b6f9f1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -50,7 +50,7 @@ * The head array is strictly LIFO and should improve the cache hit rates. * On SMP, it additionally reduces the spinlock operations. * - * The c_cpuarray may not be read with enabled local interrupts - + * The c_cpuarray may not be read with enabled local interrupts - * it's changed with a smp_call_function(). * * SMP synchronization: @@ -266,16 +266,17 @@ struct array_cache { unsigned int batchcount; unsigned int touched; spinlock_t lock; - void *entry[0]; /* - * Must have this definition in here for the proper - * alignment of array_cache. Also simplifies accessing - * the entries. - * [0] is for gcc 2.95. It should really be []. - */ + void *entry[0]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + * [0] is for gcc 2.95. It should really be []. + */ }; -/* bootstrap: The caches do not work without cpuarrays anymore, - * but the cpuarrays are allocated from the generic caches... +/* + * bootstrap: The caches do not work without cpuarrays anymore, but the + * cpuarrays are allocated from the generic caches... */ #define BOOT_CPUCACHE_ENTRIES 1 struct arraycache_init { @@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define SIZE_L3 (1 + MAX_NUMNODES) /* - * This function must be completely optimized away if - * a constant is passed to it. Mostly the same as - * what is in linux/slab.h except it returns an - * index. + * This function must be completely optimized away if a constant is passed to + * it. Mostly the same as what is in linux/slab.h except it returns an index. */ static __always_inline int index_of(const size_t size) { @@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) parent->free_touched = 0; } -#define MAKE_LIST(cachep, listp, slab, nodeid) \ - do { \ - INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ } while (0) -#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ - do { \ +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ @@ -379,8 +378,8 @@ struct kmem_cache { unsigned int buffer_size; /* 2) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; - unsigned int flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ + unsigned int flags; /* constant flags */ + unsigned int num; /* # of objs per slab */ spinlock_t spinlock; /* 3) cache_grow/shrink */ @@ -390,11 +389,11 @@ struct kmem_cache { /* force GFP flags, e.g. GFP_DMA */ gfp_t gfpflags; - size_t colour; /* cache colouring range */ + size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ struct kmem_cache *slabp_cache; unsigned int slab_size; - unsigned int dflags; /* dynamic flags */ + unsigned int dflags; /* dynamic flags */ /* constructor func */ void (*ctor) (void *, struct kmem_cache *, unsigned long); @@ -438,8 +437,9 @@ struct kmem_cache { #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) #define BATCHREFILL_LIMIT 16 -/* Optimization question: fewer reaps means less - * probability for unnessary cpucache drain/refill cycles. +/* + * Optimization question: fewer reaps means less probability for unnessary + * cpucache drain/refill cycles. * * OTOH the cpuarrays can contain lots of objects, * which could lock up otherwise freeable slabs. @@ -453,17 +453,19 @@ struct kmem_cache { #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) #define STATS_INC_REAPED(x) ((x)->reaped++) -#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ - (x)->high_mark = (x)->num_active; \ - } while (0) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) #define STATS_INC_ERR(x) ((x)->errors++) #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) #define STATS_INC_NODEFREES(x) ((x)->node_frees++) -#define STATS_SET_FREEABLE(x, i) \ - do { if ((x)->max_freeable < i) \ - (x)->max_freeable = i; \ - } while (0) - +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) @@ -478,9 +480,7 @@ struct kmem_cache { #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) #define STATS_INC_NODEFREES(x) do { } while (0) -#define STATS_SET_FREEABLE(x, i) \ - do { } while (0) - +#define STATS_SET_FREEABLE(x, i) do { } while (0) #define STATS_INC_ALLOCHIT(x) do { } while (0) #define STATS_INC_ALLOCMISS(x) do { } while (0) #define STATS_INC_FREEHIT(x) do { } while (0) @@ -488,7 +488,8 @@ struct kmem_cache { #endif #if DEBUG -/* Magic nums for obj red zoning. +/* + * Magic nums for obj red zoning. * Placed in the first word before and the first word after an obj. */ #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ @@ -499,7 +500,8 @@ struct kmem_cache { #define POISON_FREE 0x6b /* for use-after-free poisoning */ #define POISON_END 0xa5 /* end-byte of poisoning */ -/* memory layout of objects: +/* + * memory layout of objects: * 0 : objp * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that * the end of an object is aligned with the end of the real @@ -508,7 +510,8 @@ struct kmem_cache { * redzone word. * cachep->obj_offset: The real object. * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] - * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] + * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] */ static int obj_offset(struct kmem_cache *cachep) { @@ -552,8 +555,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif /* - * Maximum size of an obj (in 2^order pages) - * and absolute limit for the gfp order. + * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp + * order. */ #if defined(CONFIG_LARGE_ALLOCS) #define MAX_OBJ_ORDER 13 /* up to 32Mb */ @@ -573,9 +576,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #define BREAK_GFP_ORDER_LO 0 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; -/* Functions for storing/retrieving the cachep and or slab from the - * global 'mem_map'. These are used to find the slab an obj belongs to. - * With kfree(), these are used to find the cache which an obj belongs to. +/* + * Functions for storing/retrieving the cachep and or slab from the page + * allocator. These are used to find the slab an obj belongs to. With kfree(), + * these are used to find the cache which an obj belongs to. */ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) { @@ -621,7 +625,9 @@ static inline unsigned int obj_to_index(struct kmem_cache *cache, return (unsigned)(obj - slab->s_mem) / cache->buffer_size; } -/* These are the default caches for kmalloc. Custom caches can have other sizes. */ +/* + * These are the default caches for kmalloc. Custom caches can have other sizes. + */ struct cache_sizes malloc_sizes[] = { #define CACHE(x) { .cs_size = (x) }, #include @@ -667,8 +673,8 @@ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* - * vm_enough_memory() looks at this to determine how many - * slab-allocated pages are possibly freeable under pressure + * vm_enough_memory() looks at this to determine how many slab-allocated pages + * are possibly freeable under pressure * * SLAB_RECLAIM_ACCOUNT turns this on per-slab */ @@ -687,7 +693,8 @@ static enum { static DEFINE_PER_CPU(struct work_struct, reap_work); -static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node); static void enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); static int __node_shrink(struct kmem_cache *cachep, int node); @@ -697,7 +704,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) +static inline struct kmem_cache *__find_general_cachep(size_t size, + gfp_t gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -732,8 +740,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); } -/* Calculate the number of objects and left-over bytes for a given - buffer size. */ +/* + * Calculate the number of objects and left-over bytes for a given buffer size. + */ static void cache_estimate(unsigned long gfporder, size_t buffer_size, size_t align, int flags, size_t *left_over, unsigned int *num) @@ -794,7 +803,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) -static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) { printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); @@ -918,10 +928,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) if (!ac_ptr) return; - for_each_node(i) kfree(ac_ptr[i]); - kfree(ac_ptr); } @@ -955,7 +963,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) } } -static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) +static void drain_alien_cache(struct kmem_cache *cachep, + struct array_cache **alien) { int i = 0; struct array_cache *ac; @@ -998,20 +1007,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: mutex_lock(&cache_chain_mutex); - /* we need to do this right in the beginning since + /* + * We need to do this right in the beginning since * alloc_arraycache's are going to use this list. * kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ list_for_each_entry(cachep, &cache_chain, next) { - /* setup the size64 kmemlist for cpu before we can + /* + * Set up the size64 kmemlist for cpu before we can * begin anything. Make sure some other cpu on this * node has not already allocated this */ if (!cachep->nodelists[node]) { - if (!(l3 = kmalloc_node(memsize, - GFP_KERNEL, node))) + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) goto bad; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + @@ -1027,13 +1038,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, spin_lock_irq(&cachep->nodelists[node]->list_lock); cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; spin_unlock_irq(&cachep->nodelists[node]->list_lock); } - /* Now we can go ahead with allocating the shared array's - & array cache's */ + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1053,7 +1066,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, if (!alien) goto bad; cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; BUG_ON(!l3); @@ -1073,7 +1085,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, } #endif spin_unlock_irq(&l3->list_lock); - kfree(shared); free_alien_cache(alien); } @@ -1095,7 +1106,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, /* fall thru */ case CPU_UP_CANCELED: mutex_lock(&cache_chain_mutex); - list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; @@ -1162,7 +1172,7 @@ free_array_cache: #endif } return NOTIFY_OK; - bad: +bad: mutex_unlock(&cache_chain_mutex); return NOTIFY_BAD; } @@ -1172,7 +1182,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) +static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) { struct kmem_list3 *ptr; @@ -1187,8 +1198,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no local_irq_enable(); } -/* Initialisation. - * Called after the gfp() functions have been enabled, and before smp_init(). +/* + * Initialisation. Called after the page allocator have been initialised and + * before smp_init(). */ void __init kmem_cache_init(void) { @@ -1213,9 +1225,9 @@ void __init kmem_cache_init(void) /* Bootstrap is tricky, because several objects are allocated * from caches that do not exist yet: - * 1) initialize the cache_cache cache: it contains the struct kmem_cache - * structures of all caches, except cache_cache itself: cache_cache - * is statically allocated. + * 1) initialize the cache_cache cache: it contains the struct + * kmem_cache structures of all caches, except cache_cache itself: + * cache_cache is statically allocated. * Initially an __init data area is used for the head array and the * kmem_list3 structures, it's replaced with a kmalloc allocated * array at the end of the bootstrap. @@ -1238,7 +1250,8 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; - cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, + cache_line_size()); for (order = 0; order < MAX_ORDER; order++) { cache_estimate(order, cache_cache.buffer_size, @@ -1257,24 +1270,26 @@ void __init kmem_cache_init(void) sizes = malloc_sizes; names = cache_names; - /* Initialize the caches that provide memory for the array cache - * and the kmem_list3 structures first. - * Without this, further allocations will bug + /* + * Initialize the caches that provide memory for the array cache and the + * kmem_list3 structures first. Without this, further allocations will + * bug. */ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_PANIC), NULL, NULL); + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); - if (INDEX_AC != INDEX_L3) + if (INDEX_AC != INDEX_L3) { sizes[INDEX_L3].cs_cachep = - kmem_cache_create(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, - NULL); + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } while (sizes->cs_size != ULONG_MAX) { /* @@ -1284,13 +1299,13 @@ void __init kmem_cache_init(void) * Note for systems short on memory removing the alignment will * allow tighter packing of the smaller caches. */ - if (!sizes->cs_cachep) + if (!sizes->cs_cachep) { sizes->cs_cachep = kmem_cache_create(names->name, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS - | SLAB_PANIC), - NULL, NULL); + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_PANIC, + NULL, NULL); + } /* Inc off-slab bufctl limit until the ceiling is hit. */ if (!(OFF_SLAB(sizes->cs_cachep))) { @@ -1299,13 +1314,11 @@ void __init kmem_cache_init(void) } sizes->cs_dmacachep = kmem_cache_create(names->name_dma, - sizes->cs_size, - ARCH_KMALLOC_MINALIGN, - (ARCH_KMALLOC_FLAGS | - SLAB_CACHE_DMA | - SLAB_PANIC), NULL, - NULL); - + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| + SLAB_PANIC, + NULL, NULL); sizes++; names++; } @@ -1357,20 +1370,22 @@ void __init kmem_cache_init(void) struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + enable_cpucache(cachep); mutex_unlock(&cache_chain_mutex); } /* Done! */ g_cpucache_up = FULL; - /* Register a cpu startup notifier callback - * that initializes cpu_cache_get for all new cpus + /* + * Register a cpu startup notifier callback that initializes + * cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); - /* The reap timers are started later, with a module init call: - * That part of the kernel is not yet operational. + /* + * The reap timers are started later, with a module init call: That part + * of the kernel is not yet operational. */ } @@ -1378,16 +1393,13 @@ static int __init cpucache_init(void) { int cpu; - /* - * Register the timers that return unneeded - * pages to gfp. + /* + * Register the timers that return unneeded pages to the page allocator */ for_each_online_cpu(cpu) - start_cpu_timer(cpu); - + start_cpu_timer(cpu); return 0; } - __initcall(cpucache_init); /* @@ -1501,9 +1513,8 @@ static void dump_line(char *data, int offset, int limit) { int i; printk(KERN_ERR "%03x:", offset); - for (i = 0; i < limit; i++) { + for (i = 0; i < limit; i++) printk(" %02x", (unsigned char)data[offset + i]); - } printk("\n"); } #endif @@ -1517,15 +1528,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) if (cachep->flags & SLAB_RED_ZONE) { printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); + *dbg_userword(cachep, objp)); print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); + (unsigned long)*dbg_userword(cachep, objp)); printk("\n"); } realobj = (char *)objp + obj_offset(cachep); @@ -1558,8 +1569,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print header */ if (lines == 0) { printk(KERN_ERR - "Slab corruption: start=%p, len=%d\n", - realobj, size); + "Slab corruption: start=%p, len=%d\n", + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1614,11 +1625,10 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 - && OFF_SLAB(cachep)) + if (cachep->buffer_size % PAGE_SIZE == 0 && + OFF_SLAB(cachep)) kernel_map_pages(virt_to_page(objp), - cachep->buffer_size / PAGE_SIZE, - 1); + cachep->buffer_size / PAGE_SIZE, 1); else check_poison_obj(cachep, objp); #else @@ -1650,10 +1660,10 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) } #endif -/** +/* * Destroy all the objs in a slab, and release the mem back to the system. - * Before calling the slab must have been unlinked from the cache. - * The cache-lock is not held/needed. + * Before calling the slab must have been unlinked from the cache. The + * cache-lock is not held/needed. */ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { @@ -1674,8 +1684,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -/* For setting up all the kmem_list3s for cache whose buffer_size is same - as size of kmem_list3. */ +/* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ static void set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -1701,13 +1713,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) * high order pages for slabs. When the gfp() functions are more friendly * towards high-order requests, this should be changed. */ -static inline size_t calculate_slab_order(struct kmem_cache *cachep, +static size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { size_t left_over = 0; int gfporder; - for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { + for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { unsigned int num; size_t remainder; @@ -1742,7 +1754,7 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, /* * Acceptable internal fragmentation? */ - if ((left_over * 8) <= (PAGE_SIZE << gfporder)) + if (left_over * 8 <= (PAGE_SIZE << gfporder)) break; } return left_over; @@ -1817,9 +1829,8 @@ static void setup_cpu_cache(struct kmem_cache *cachep) * and the @dtor is run before the pages are handed back. * * @name must be valid until the cache is destroyed. This implies that - * the module calling this has to destroy the cache before getting - * unloaded. - * + * the module calling this has to destroy the cache before getting unloaded. + * * The flags are * * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) @@ -1837,7 +1848,8 @@ static void setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), + unsigned long flags, + void (*ctor)(void*, struct kmem_cache *, unsigned long), void (*dtor)(void*, struct kmem_cache *, unsigned long)) { size_t left_over, slab_size, ralign; @@ -1847,12 +1859,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Sanity checks... these are all serious usage bugs. */ - if ((!name) || - in_interrupt() || - (size < BYTES_PER_WORD) || + if (!name || in_interrupt() || (size < BYTES_PER_WORD) || (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { - printk(KERN_ERR "%s: Early error in slab %s\n", - __FUNCTION__, name); + printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, + name); BUG(); } @@ -1906,8 +1916,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 - || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) + if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; @@ -1919,13 +1928,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, BUG_ON(dtor); /* - * Always checks flags, a caller might be expecting debug - * support which isn't available. + * Always checks flags, a caller might be expecting debug support which + * isn't available. */ if (flags & ~CREATE_MASK) BUG(); - /* Check that size is in terms of words. This is needed to avoid + /* + * Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ @@ -1934,12 +1944,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, size &= ~(BYTES_PER_WORD - 1); } - /* calculate out the final buffer alignment: */ + /* calculate the final buffer alignment: */ + /* 1) arch recommendation: can be overridden for debug */ if (flags & SLAB_HWCACHE_ALIGN) { - /* Default alignment: as specified by the arch code. - * Except if an object is really small, then squeeze multiple - * objects into one cacheline. + /* + * Default alignment: as specified by the arch code. Except if + * an object is really small, then squeeze multiple objects into + * one cacheline. */ ralign = cache_line_size(); while (size <= ralign / 2) @@ -1959,7 +1971,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign > BYTES_PER_WORD) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } - /* 4) Store it. Note that the debug code below can reduce + /* + * 4) Store it. Note that the debug code below can reduce * the alignment to BYTES_PER_WORD. */ align = ralign; @@ -2058,7 +2071,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); - oops: +oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'\n", name); @@ -2109,7 +2122,6 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) { check_irq_on(); preempt_disable(); - local_irq_disable(); func(arg); local_irq_enable(); @@ -2120,12 +2132,12 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) preempt_enable(); } -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node); +static void drain_array_locked(struct kmem_cache *cachep, + struct array_cache *ac, int force, int node); static void do_drain(void *arg) { - struct kmem_cache *cachep = (struct kmem_cache *) arg; + struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_node_id(); @@ -2273,16 +2285,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) /* NUMA: free the list3 structures */ for_each_online_node(i) { - if ((l3 = cachep->nodelists[i])) { + l3 = cachep->nodelists[i]; + if (l3) { kfree(l3->shared); free_alien_cache(l3->alien); kfree(l3); } } kmem_cache_free(&cache_cache, cachep); - unlock_cpu_hotplug(); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2305,7 +2316,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, slabp->inuse = 0; slabp->colouroff = colour_off; slabp->s_mem = objp + colour_off; - return slabp; } @@ -2333,9 +2343,9 @@ static void cache_init_objs(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_INACTIVE; } /* - * Constructors are not allowed to allocate memory from - * the same cache which they are a constructor for. - * Otherwise, deadlock. They must also be threaded. + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) cachep->ctor(objp + obj_offset(cachep), cachep, @@ -2349,8 +2359,8 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) - && cachep->flags & SLAB_POISON) + if ((cachep->buffer_size % PAGE_SIZE) == 0 && + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); #else @@ -2365,16 +2375,14 @@ static void cache_init_objs(struct kmem_cache *cachep, static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) { - if (flags & SLAB_DMA) { - if (!(cachep->gfpflags & GFP_DMA)) - BUG(); - } else { - if (cachep->gfpflags & GFP_DMA) - BUG(); - } + if (flags & SLAB_DMA) + BUG_ON(!(cachep->gfpflags & GFP_DMA)); + else + BUG_ON(cachep->gfpflags & GFP_DMA); } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, + int nodeid) { void *objp = index_to_obj(cachep, slabp, slabp->free); kmem_bufctl_t next; @@ -2390,8 +2398,8 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, - int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, + void *objp, int nodeid) { unsigned int objnr = obj_to_index(cachep, slabp, objp); @@ -2401,7 +2409,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + "'%s', objp %p\n", cachep->name, objp); BUG(); } #endif @@ -2410,7 +2418,8 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob slabp->inuse--; } -static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) +static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, + void *objp) { int i; struct page *page; @@ -2438,8 +2447,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) unsigned long ctor_flags; struct kmem_list3 *l3; - /* Be lazy and only check for valid flags here, - * keeping it out of the critical path in kmem_cache_alloc(). + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). */ if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) BUG(); @@ -2480,14 +2490,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ kmem_flagcheck(cachep, flags); - /* Get mem for the objs. - * Attempt to allocate a physical page from 'nodeid', + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. */ - if (!(objp = kmem_getpages(cachep, flags, nodeid))) + objp = kmem_getpages(cachep, flags, nodeid); + if (!objp) goto failed; /* Get slab management. */ - if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); + if (!slabp) goto opps1; slabp->nodeid = nodeid; @@ -2506,9 +2519,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) l3->free_objects += cachep->num; spin_unlock(&l3->list_lock); return 1; - opps1: +opps1: kmem_freepages(cachep, objp); - failed: +failed: if (local_flags & __GFP_WAIT) local_irq_disable(); return 0; @@ -2551,8 +2564,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, page = virt_to_page(objp); if (page_get_cache(page) != cachep) { - printk(KERN_ERR - "mismatch in kmem_cache_free: expected cache %p, got %p\n", + printk(KERN_ERR "mismatch in kmem_cache_free: expected " + "cache %p, got %p\n", page_get_cache(page), cachep); printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); printk(KERN_ERR "%p is %s.\n", page_get_cache(page), @@ -2562,13 +2575,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, slabp = page_get_slab(page); if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_ACTIVE - || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", + if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || + *dbg_redzone2(cachep, objp) != RED_ACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR "%p: redzone 1:0x%lx, " + "redzone 2:0x%lx.\n", objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); } @@ -2584,9 +2596,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* Need to call the slab's constructor so the - * caller can perform a verify of its state (debugging). - * Called without the cache-lock held. + /* + * Need to call the slab's constructor so the caller can + * perform a verify of its state (debugging). Called without + * the cache-lock held. */ cachep->ctor(objp + obj_offset(cachep), cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); @@ -2599,7 +2612,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, } if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { + if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { store_stackinfo(cachep, objp, (unsigned long)caller); kernel_map_pages(virt_to_page(objp), cachep->buffer_size / PAGE_SIZE, 0); @@ -2625,14 +2638,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) goto bad; } if (entries != cachep->num - slabp->inuse) { - bad: - printk(KERN_ERR - "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse); +bad: + printk(KERN_ERR "slab: Internal list corruption detected in " + "cache '%s'(%d), slabp %p(%d). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse); for (i = 0; i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); i++) { - if ((i % 16) == 0) + if (i % 16 == 0) printk("\n%03x:", i); printk(" %02x", ((unsigned char *)slabp)[i]); } @@ -2654,12 +2667,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); ac = cpu_cache_get(cachep); - retry: +retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { - /* if there was little recent activity on this - * cache, then perform only a partial refill. - * Otherwise we could generate refill bouncing. + /* + * If there was little recent activity on this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. */ batchcount = BATCHREFILL_LIMIT; } @@ -2715,29 +2729,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) list_add(&slabp->list, &l3->slabs_partial); } - must_grow: +must_grow: l3->free_objects -= ac->avail; - alloc_done: +alloc_done: spin_unlock(&l3->list_lock); if (unlikely(!ac->avail)) { int x; x = cache_grow(cachep, flags, numa_node_id()); - // cache_grow can reenable interrupts, then ac could change. + /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); - if (!x && ac->avail == 0) // no objects in sight? abort + if (!x && ac->avail == 0) /* no objects in sight? abort */ return NULL; - if (!ac->avail) // objects refilled by interrupt? + if (!ac->avail) /* objects refilled by interrupt? */ goto retry; } ac->touched = 1; return ac->entry[--ac->avail]; } -static inline void -cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) { might_sleep_if(flags & __GFP_WAIT); #if DEBUG @@ -2746,8 +2760,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) } #if DEBUG -static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, - void *objp, void *caller) +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, void *caller) { if (!objp) return objp; @@ -2767,15 +2781,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags *dbg_userword(cachep, objp) = caller; if (cachep->flags & SLAB_RED_ZONE) { - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE - || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, - "double free, or memory outside" - " object was overwritten"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); printk(KERN_ERR - "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; @@ -2822,8 +2835,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) return objp; } -static __always_inline void * -__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) +static __always_inline void *__cache_alloc(struct kmem_cache *cachep, + gfp_t flags, void *caller) { unsigned long save_flags; void *objp; @@ -2843,7 +2856,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) /* * A interface to enable slab creation on nodeid */ -static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct list_head *entry; struct slab *slabp; @@ -2854,7 +2868,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node l3 = cachep->nodelists[nodeid]; BUG_ON(!l3); - retry: +retry: check_irq_off(); spin_lock(&l3->list_lock); entry = l3->slabs_partial.next; @@ -2881,16 +2895,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node /* move slabp to correct slabp list: */ list_del(&slabp->list); - if (slabp->free == BUFCTL_END) { + if (slabp->free == BUFCTL_END) list_add(&slabp->list, &l3->slabs_full); - } else { + else list_add(&slabp->list, &l3->slabs_partial); - } spin_unlock(&l3->list_lock); goto done; - must_grow: +must_grow: spin_unlock(&l3->list_lock); x = cache_grow(cachep, flags, nodeid); @@ -2898,7 +2911,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node return NULL; goto retry; - done: +done: return obj; } #endif @@ -2971,7 +2984,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } free_block(cachep, ac->entry, batchcount, node); - free_done: +free_done: #if STATS { int i = 0; @@ -2992,16 +3005,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) #endif spin_unlock(&l3->list_lock); ac->avail -= batchcount; - memmove(ac->entry, &(ac->entry[batchcount]), - sizeof(void *) * ac->avail); + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } /* - * __cache_free - * Release an obj back to its cache. If the obj has a constructed - * state, it must be in this state _before_ it is released. - * - * Called with disabled ints. + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. */ static inline void __cache_free(struct kmem_cache *cachep, void *objp) { @@ -3020,9 +3029,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) if (unlikely(slabp->nodeid != numa_node_id())) { struct array_cache *alien = NULL; int nodeid = slabp->nodeid; - struct kmem_list3 *l3 = - cachep->nodelists[numa_node_id()]; + struct kmem_list3 *l3; + l3 = cachep->nodelists[numa_node_id()]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -3106,7 +3115,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) if (unlikely(page_get_cache(page) != cachep)) goto out; return 1; - out: +out: return 0; } @@ -3132,7 +3141,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) local_irq_save(save_flags); if (nodeid == -1 || nodeid == numa_node_id() || - !cachep->nodelists[nodeid]) + !cachep->nodelists[nodeid]) ptr = ____cache_alloc(cachep, flags); else ptr = __cache_alloc_node(cachep, flags, nodeid); @@ -3249,7 +3258,7 @@ void *__alloc_percpu(size_t size) /* Catch derefs w/o wrappers */ return (void *)(~(unsigned long)pdata); - unwind_oom: +unwind_oom: while (--i >= 0) { if (!cpu_possible(i)) continue; @@ -3352,18 +3361,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep) struct array_cache *nc = NULL, *new; struct array_cache **new_alien = NULL; #ifdef CONFIG_NUMA - if (!(new_alien = alloc_alien_cache(node, cachep->limit))) + new_alien = alloc_alien_cache(node, cachep->limit); + if (!new_alien) goto fail; #endif - if (!(new = alloc_arraycache(node, (cachep->shared * - cachep->batchcount), - 0xbaadf00d))) + new = alloc_arraycache(node, cachep->shared*cachep->batchcount, + 0xbaadf00d); + if (!new) goto fail; - if ((l3 = cachep->nodelists[node])) { - + l3 = cachep->nodelists[node]; + if (l3) { spin_lock_irq(&l3->list_lock); - if ((nc = cachep->nodelists[node]->shared)) + nc = cachep->nodelists[node]->shared; + if (nc) free_block(cachep, nc->entry, nc->avail, node); l3->shared = new; @@ -3372,27 +3383,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep) new_alien = NULL; } l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); kfree(nc); free_alien_cache(new_alien); continue; } - if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), - GFP_KERNEL, node))) + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); + if (!l3) goto fail; kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; l3->shared = new; l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; + cachep->batchcount + cachep->num; cachep->nodelists[node] = l3; } return err; - fail: +fail: err = -ENOMEM; return err; } @@ -3404,7 +3415,7 @@ struct ccupdate_struct { static void do_ccupdate_local(void *info) { - struct ccupdate_struct *new = (struct ccupdate_struct *)info; + struct ccupdate_struct *new = info; struct array_cache *old; check_irq_off(); @@ -3414,16 +3425,16 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } -static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, - int shared) +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared) { struct ccupdate_struct new; int i, err; memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new.new[i] = - alloc_arraycache(cpu_to_node(i), limit, batchcount); + new.new[i] = alloc_arraycache(cpu_to_node(i), limit, + batchcount); if (!new.new[i]) { for (i--; i >= 0; i--) kfree(new.new[i]); @@ -3465,10 +3476,11 @@ static void enable_cpucache(struct kmem_cache *cachep) int err; int limit, shared; - /* The head array serves three purposes: + /* + * The head array serves three purposes: * - create a LIFO ordering, i.e. return objects that are cache-warm * - reduce the number of spinlock operations. - * - reduce the number of linked list operations on the slab and + * - reduce the number of linked list operations on the slab and * bufctl chains: array operations are cheaper. * The numbers are guessed, we should auto-tune as described by * Bonwick. @@ -3484,7 +3496,8 @@ static void enable_cpucache(struct kmem_cache *cachep) else limit = 120; - /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound * allocation behaviour: Most allocs on one cpu, most free operations * on another cpu. For these cases, an efficient object passing between * cpus is necessary. This is provided by a shared array. The array @@ -3499,9 +3512,9 @@ static void enable_cpucache(struct kmem_cache *cachep) #endif #if DEBUG - /* With debugging enabled, large batchcount lead to excessively - * long periods with disabled local interrupts. Limit the - * batchcount + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount */ if (limit > 32) limit = 32; @@ -3512,8 +3525,8 @@ static void enable_cpucache(struct kmem_cache *cachep) cachep->name, -err); } -static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, - int force, int node) +static void drain_array_locked(struct kmem_cache *cachep, + struct array_cache *ac, int force, int node) { int tofree; @@ -3522,9 +3535,8 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac ac->touched = 0; } else if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) { + if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - } free_block(cachep, ac->entry, tofree, node); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), @@ -3541,8 +3553,8 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac * - clear the per-cpu caches for this CPU. * - return freeable pages to the main free memory pool. * - * If we cannot acquire the cache chain mutex then just give up - we'll - * try again on the next iteration. + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. */ static void cache_reap(void *unused) { @@ -3590,9 +3602,8 @@ static void cache_reap(void *unused) goto next_unlock; } - tofree = - (l3->free_limit + 5 * searchp->num - - 1) / (5 * searchp->num); + tofree = (l3->free_limit + 5 * searchp->num - 1) / + (5 * searchp->num); do { p = l3->slabs_free.next; if (p == &(l3->slabs_free)) @@ -3603,9 +3614,9 @@ static void cache_reap(void *unused) list_del(&slabp->list); STATS_INC_REAPED(searchp); - /* Safe to drop the lock. The slab is no longer - * linked to the cache. - * searchp cannot disappear, we hold + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. searchp cannot disappear, we hold * cache_chain_lock */ l3->free_objects -= searchp->num; @@ -3613,15 +3624,15 @@ static void cache_reap(void *unused) slab_destroy(searchp, slabp); spin_lock_irq(&l3->list_lock); } while (--tofree > 0); - next_unlock: +next_unlock: spin_unlock_irq(&l3->list_lock); - next: +next: cond_resched(); } check_irq_on(); mutex_unlock(&cache_chain_mutex); next_reap_node(); - /* Setup the next iteration */ + /* Set up the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } @@ -3671,8 +3682,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) { struct kmem_cache *cachep = p; ++*pos; - return cachep->next.next == &cache_chain ? NULL - : list_entry(cachep->next.next, struct kmem_cache, next); + return cachep->next.next == &cache_chain ? + NULL : list_entry(cachep->next.next, struct kmem_cache, next); } static void s_stop(struct seq_file *m, void *p) @@ -3761,7 +3772,9 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); + %4lu %4lu %4lu %4lu", allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees); } /* cpu stats */ { @@ -3833,13 +3846,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, mutex_lock(&cache_chain_mutex); res = -EINVAL; list_for_each(p, &cache_chain) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, - next); + struct kmem_cache *cachep; + cachep = list_entry(p, struct kmem_cache, next); if (!strcmp(cachep->name, kbuf)) { - if (limit < 1 || - batchcount < 1 || - batchcount > limit || shared < 0) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { res = 0; } else { res = do_tune_cpucache(cachep, limit, -- cgit v1.1 From b5d8ca7c50826c0b456b4a646875dc573adfde2b Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Wed, 22 Mar 2006 00:08:12 -0800 Subject: [PATCH] slab: remove cachep->spinlock Remove cachep->spinlock. Locking has moved to the kmem_list3 and most of the structures protected earlier by cachep->spinlock is now protected by the l3->list_lock. slab cache tunables like batchcount are accessed always with the cache_chain_mutex held. Patch tested on SMP and NUMA kernels with dbench processes running, constant onlining/offlining, and constant cache tuning, all at the same time. Signed-off-by: Ravikiran Thirumalai Cc: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7b6f9f1..2cd8020 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -372,17 +372,19 @@ static void kmem_list3_init(struct kmem_list3 *parent) struct kmem_cache { /* 1) per-cpu data, touched during every alloc/free */ struct array_cache *array[NR_CPUS]; +/* 2) Cache tunables. Protected by cache_chain_mutex */ unsigned int batchcount; unsigned int limit; unsigned int shared; + unsigned int buffer_size; -/* 2) touched by every alloc & free from the backend */ +/* 3) touched by every alloc & free from the backend */ struct kmem_list3 *nodelists[MAX_NUMNODES]; + unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ - spinlock_t spinlock; -/* 3) cache_grow/shrink */ +/* 4) cache_grow/shrink */ /* order of pgs per slab (2^n) */ unsigned int gfporder; @@ -401,11 +403,11 @@ struct kmem_cache { /* de-constructor func */ void (*dtor) (void *, struct kmem_cache *, unsigned long); -/* 4) cache creation/removal */ +/* 5) cache creation/removal */ const char *name; struct list_head next; -/* 5) statistics */ +/* 6) statistics */ #if STATS unsigned long num_active; unsigned long num_allocations; @@ -661,7 +663,6 @@ static struct kmem_cache cache_cache = { .shared = 1, .buffer_size = sizeof(struct kmem_cache), .flags = SLAB_NO_REAP, - .spinlock = SPIN_LOCK_UNLOCKED, .name = "kmem_cache", #if DEBUG .obj_size = sizeof(struct kmem_cache), @@ -2057,7 +2058,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfpflags = 0; if (flags & SLAB_CACHE_DMA) cachep->gfpflags |= GFP_DMA; - spin_lock_init(&cachep->spinlock); cachep->buffer_size = size; if (flags & CFLGS_OFF_SLAB) @@ -3425,6 +3425,7 @@ static void do_ccupdate_local(void *info) new->new[smp_processor_id()] = old; } +/* Always called with the cache_chain_mutex held */ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { @@ -3446,11 +3447,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); check_irq_on(); - spin_lock(&cachep->spinlock); cachep->batchcount = batchcount; cachep->limit = limit; cachep->shared = shared; - spin_unlock(&cachep->spinlock); for_each_online_cpu(i) { struct array_cache *ccold = new.new[i]; @@ -3471,6 +3470,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return 0; } +/* Called with cache_chain_mutex held always */ static void enable_cpucache(struct kmem_cache *cachep) { int err; @@ -3705,7 +3705,6 @@ static int s_show(struct seq_file *m, void *p) int node; struct kmem_list3 *l3; - spin_lock(&cachep->spinlock); active_objs = 0; num_slabs = 0; for_each_online_node(node) { @@ -3788,7 +3787,6 @@ static int s_show(struct seq_file *m, void *p) } #endif seq_putc(m, '\n'); - spin_unlock(&cachep->spinlock); return 0; } -- cgit v1.1 From fcc234f888ba2365c44ba0507eb8a18eebf1f594 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 22 Mar 2006 00:08:13 -0800 Subject: [PATCH] mm: kill kmem_cache_t usage We have struct kmem_cache now so use it instead of the old typedef. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 ++-- mm/mempool.c | 4 ++-- mm/rmap.c | 5 +++-- mm/shmem.c | 5 +++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b21869a..96195dc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -98,8 +98,8 @@ /* The number of pages to migrate per call to migrate_pages() */ #define MIGRATE_CHUNK_SIZE 256 -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) diff --git a/mm/mempool.c b/mm/mempool.c index 1a99b80..f71893e 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free); */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { - kmem_cache_t *mem = (kmem_cache_t *) pool_data; + struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); diff --git a/mm/rmap.c b/mm/rmap.c index 67f0e20..134aef9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -58,7 +58,7 @@ //#define RMAP_DEBUG /* can be enabled only for debugging */ -kmem_cache_t *anon_vma_cachep; +struct kmem_cache *anon_vma_cachep; static inline void validate_anon_vma(struct vm_area_struct *find_vma) { @@ -166,7 +166,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +static void anon_vma_ctor(void *data, struct kmem_cache *cachep, + unsigned long flags) { if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { diff --git a/mm/shmem.c b/mm/shmem.c index 7c455fb..f523a15 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2119,7 +2119,7 @@ failed: return err; } -static kmem_cache_t *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +static void init_once(void *foo, struct kmem_cache *cachep, + unsigned long flags) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; -- cgit v1.1 From 911851e6ee6ac4e26f07be342a89632f78494fef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Mar 2006 00:08:14 -0800 Subject: [PATCH] slab: fix kernel-doc warnings Fix kernel-doc warnings in mm/slab.c. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2cd8020..5c25749 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1615,8 +1615,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #if DEBUG /** - * slab_destroy_objs - call the registered destructor for each object in - * a slab that is to be destroyed. + * slab_destroy_objs - destroy a slab and its objects + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * + * Call the registered destructor for each object in a slab that is being + * destroyed. */ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) { @@ -1661,7 +1665,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) } #endif -/* +/** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed + * @slabp: slab pointer being destroyed + * * Destroy all the objs in a slab, and release the mem back to the system. * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. @@ -3170,6 +3178,7 @@ EXPORT_SYMBOL(kmalloc_node); * kmalloc - allocate memory * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. + * @caller: function caller for debug tracking of the caller * * kmalloc is the normal method of allocating memory * in the kernel. -- cgit v1.1 From ac2b898ca6fb06196a26869c23b66afe7944e52e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:08:15 -0800 Subject: [PATCH] slab: Remove SLAB_NO_REAP option SLAB_NO_REAP is documented as an option that will cause this slab not to be reaped under memory pressure. However, that is not what happens. The only thing that SLAB_NO_REAP controls at the moment is the reclaim of the unused slab elements that were allocated in batch in cache_reap(). Cache_reap() is run every few seconds independently of memory pressure. Could we remove the whole thing? Its only used by three slabs anyways and I cannot find a reason for having this option. There is an additional problem with SLAB_NO_REAP. If set then the recovery of objects from alien caches is switched off. Objects not freed on the same node where they were initially allocated will only be reused if a certain amount of objects accumulates from one alien node (not very likely) or if the cache is explicitly shrunk. (Strangely __cache_shrink does not check for SLAB_NO_REAP) Getting rid of SLAB_NO_REAP fixes the problems with alien cache freeing. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5c25749..2423550 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -170,12 +170,12 @@ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ - SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) #else -# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ +# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU) @@ -662,7 +662,6 @@ static struct kmem_cache cache_cache = { .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, .buffer_size = sizeof(struct kmem_cache), - .flags = SLAB_NO_REAP, .name = "kmem_cache", #if DEBUG .obj_size = sizeof(struct kmem_cache), @@ -1848,9 +1847,6 @@ static void setup_cpu_cache(struct kmem_cache *cachep) * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check * for buffer overruns. * - * %SLAB_NO_REAP - Don't automatically reap this cache when we're under - * memory pressure. - * * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware * cacheline. This can be beneficial if you're counting cycles as closely * as davem. @@ -3584,10 +3580,6 @@ static void cache_reap(void *unused) struct slab *slabp; searchp = list_entry(walk, struct kmem_cache, next); - - if (searchp->flags & SLAB_NO_REAP) - goto next; - check_irq_on(); l3 = searchp->nodelists[numa_node_id()]; @@ -3635,7 +3627,6 @@ static void cache_reap(void *unused) } while (--tofree > 0); next_unlock: spin_unlock_irq(&l3->list_lock); -next: cond_resched(); } check_irq_on(); -- cgit v1.1 From a07fa3944bf924881450884224cbb2f1269cb9fa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:17 -0800 Subject: [PATCH] slab: use on_each_cpu() Slab duplicates on_each_cpu(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2423550..f477acf 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2119,23 +2119,6 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -/* - * Waits for all CPUs to execute func(). - */ -static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) -{ - check_irq_on(); - preempt_disable(); - local_irq_disable(); - func(arg); - local_irq_enable(); - - if (smp_call_function(func, arg, 1, 1)) - BUG(); - - preempt_enable(); -} - static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, int force, int node); @@ -2158,7 +2141,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - smp_call_function_all_cpus(do_drain, cachep); + on_each_cpu(do_drain, cachep, 1, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -3449,7 +3432,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new.cachep = cachep; - smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); + on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); check_irq_on(); cachep->batchcount = batchcount; -- cgit v1.1 From 8695949a1d7c99e039595db00af8e0fe4722307d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:08:18 -0800 Subject: [PATCH] Thin out scan_control: remove nr_to_scan and priority Make nr_to_scan and priority a parameter instead of putting it into scan control. This allows various small optimizations and IMHO makes the code easier to read. Signed-off-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 59 +++++++++++++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e21bab4..f7c4f37 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -52,9 +52,6 @@ typedef enum { } pageout_t; struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -63,9 +60,6 @@ struct scan_control { unsigned long nr_mapped; /* From page_state */ - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - /* This context's GFP mask */ gfp_t gfp_mask; @@ -1112,11 +1106,10 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ -static void shrink_cache(struct zone *zone, struct scan_control *sc) +static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; pagevec_init(&pvec, 1); @@ -1192,12 +1185,11 @@ done: * But we had to alter page->flags anyway. */ static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc) { int pgmoved; int pgdeactivate = 0; int pgscanned; - int nr_pages = sc->nr_to_scan; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1332,10 +1324,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void -shrink_zone(struct zone *zone, struct scan_control *sc) +shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; + unsigned long nr_to_scan; atomic_inc(&zone->reclaim_in_progress); @@ -1343,14 +1336,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. */ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + zone->nr_scan_active += (zone->nr_active >> priority) + 1; nr_active = zone->nr_scan_active; if (nr_active >= sc->swap_cluster_max) zone->nr_scan_active = 0; else nr_active = 0; - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; nr_inactive = zone->nr_scan_inactive; if (nr_inactive >= sc->swap_cluster_max) zone->nr_scan_inactive = 0; @@ -1359,17 +1352,17 @@ shrink_zone(struct zone *zone, struct scan_control *sc) while (nr_active || nr_inactive) { if (nr_active) { - sc->nr_to_scan = min(nr_active, + nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); + nr_active -= nr_to_scan; + refill_inactive_zone(nr_to_scan, zone, sc); } if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, + nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); + nr_inactive -= nr_to_scan; + shrink_cache(nr_to_scan, zone, sc); } } @@ -1395,7 +1388,7 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * scan then give up on it. */ static void -shrink_caches(struct zone **zones, struct scan_control *sc) +shrink_caches(int priority, struct zone **zones, struct scan_control *sc) { int i; @@ -1408,14 +1401,14 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - shrink_zone(zone, sc); + shrink_zone(priority, zone, sc); } } @@ -1462,11 +1455,10 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = priority; sc.swap_cluster_max = SWAP_CLUSTER_MAX; if (!priority) disable_swap_token(); - shrink_caches(zones, &sc); + shrink_caches(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { sc.nr_reclaimed += reclaim_state->reclaimed_slab; @@ -1629,9 +1621,8 @@ scan: zone->prev_priority = priority; sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = priority; sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; - shrink_zone(zone, &sc); + shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); @@ -1886,6 +1877,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc; cpumask_t mask; int node_id; + int priority; if (time_before(jiffies, zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) @@ -1906,7 +1898,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = ZONE_RECLAIM_PRIORITY + 1; sc.nr_mapped = read_page_state(nr_mapped); sc.gfp_mask = gfp_mask; @@ -1932,11 +1923,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing priorities * until we have enough memory freed. */ + priority = ZONE_RECLAIM_PRIORITY; do { - sc.priority--; - shrink_zone(zone, &sc); - - } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); + shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && sc.nr_reclaimed < nr_pages); if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* -- cgit v1.1 From 179e96395b1f01e95ebe1ff5ef306b810dbbd147 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:18 -0800 Subject: [PATCH] vmscan: scan_control cleanup Initialise as much of scan_control as possible at the declaration site. This tidies things up a bit and assures us that all unmentioned fields are zeroed out. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 108 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f7c4f37..5feef4d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1431,13 +1431,14 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) int ret = 0; int total_scanned = 0, total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; unsigned long lru_pages = 0; int i; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + }; inc_page_state(allocstall); @@ -1455,7 +1456,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.swap_cluster_max = SWAP_CLUSTER_MAX; if (!priority) disable_swap_token(); shrink_caches(priority, zones, &sc); @@ -1478,7 +1478,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { + if (total_scanned > sc.swap_cluster_max + + sc.swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); sc.may_writepage = 1; } @@ -1532,14 +1533,16 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) int i; int total_scanned, total_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + }; loop_again: total_scanned = 0; total_reclaimed = 0; - sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + sc.may_writepage = !laptop_mode, sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1621,7 +1624,6 @@ scan: zone->prev_priority = priority; sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1869,46 +1871,21 @@ int zone_reclaim_interval __read_mostly = 30*HZ; /* * Try to free up some pages from this zone through reclaim. */ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages; + const int nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc; - cpumask_t mask; - int node_id; int priority; - - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) - return 0; - - if (!(gfp_mask & __GFP_WAIT) || - zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0 || - (p->flags & PF_MEMALLOC)) - return 0; - - node_id = zone->zone_pgdat->node_id; - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) - return 0; - - sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); - sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.nr_mapped = read_page_state(nr_mapped); - sc.gfp_mask = gfp_mask; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .nr_mapped = read_page_state(nr_mapped), + .swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + }; disable_swap_token(); - - nr_pages = 1 << order; - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -1949,5 +1926,44 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return sc.nr_reclaimed >= nr_pages; } + +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + cpumask_t mask; + int node_id; + + /* + * Do not reclaim if there was a recent unsuccessful attempt at zone + * reclaim. In that case we let allocations go off node for the + * zone_reclaim_interval. Otherwise we would scan for each off-node + * page allocation. + */ + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + /* + * Avoid concurrent zone reclaims, do not reclaim in a zone that does + * not have reclaimable pages and if we should not delay the allocation + * then do not scan. + */ + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0 || + (current->flags & PF_MEMALLOC)) + return 0; + + /* + * Only run zone reclaim on the local zone or on zones that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + return __zone_reclaim(zone, gfp_mask, order); +} #endif -- cgit v1.1 From 69e05944af39fc6c97b09380c8721e38433bd828 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:19 -0800 Subject: [PATCH] vmscan: use unsigned longs Turn basically everything in vmscan.c into `unsigned long'. This is to avoid the possibility that some piece of code in there might decide to operate upon more than 4G (or even 2G) of pages in one hit. This might be silly, but we'll need it one day. Cc: Christoph Lameter Cc: Nick Piggin Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 104 ++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 59 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5feef4d..62cd7cd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -177,10 +177,11 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - int ret = 0; + unsigned long ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; @@ -410,12 +411,13 @@ cannot_free: /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +static unsigned long shrink_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - int reclaimed = 0; + unsigned long reclaimed = 0; cond_resched(); @@ -599,11 +601,11 @@ static inline void move_to_lru(struct page *page) * * returns the number of pages put back. */ -int putback_lru_pages(struct list_head *l) +unsigned long putback_lru_pages(struct list_head *l) { struct page *page; struct page *page2; - int count = 0; + unsigned long count = 0; list_for_each_entry_safe(page, page2, l, lru) { move_to_lru(page); @@ -848,11 +850,11 @@ EXPORT_SYMBOL(migrate_page); * * Return: Number of pages not migrated when "to" ran empty. */ -int migrate_pages(struct list_head *from, struct list_head *to, +unsigned long migrate_pages(struct list_head *from, struct list_head *to, struct list_head *moved, struct list_head *failed) { - int retry; - int nr_failed = 0; + unsigned long retry; + unsigned long nr_failed = 0; int pass = 0; struct page *page; struct page *page2; @@ -1069,12 +1071,13 @@ int isolate_lru_page(struct page *page) * * returns how many pages were moved onto *@dst. */ -static int isolate_lru_pages(int nr_to_scan, struct list_head *src, - struct list_head *dst, int *scanned) +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) { - int nr_taken = 0; + unsigned long nr_taken = 0; struct page *page; - int scan = 0; + unsigned long scan = 0; while (scan++ < nr_to_scan && !list_empty(src)) { struct list_head *target; @@ -1106,20 +1109,22 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, /* * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed */ -static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc) +static void shrink_cache(unsigned long max_scan, struct zone *zone, + struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; + unsigned long nr_scanned = 0; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0) { + do { struct page *page; - int nr_taken; - int nr_scan; - int nr_freed; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; nr_taken = isolate_lru_pages(sc->swap_cluster_max, &zone->inactive_list, @@ -1131,7 +1136,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s if (nr_taken == 0) goto done; - max_scan -= nr_scan; + nr_scanned += nr_scan; nr_freed = shrink_list(&page_list, sc); local_irq_disable(); @@ -1161,7 +1166,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s spin_lock_irq(&zone->lru_lock); } } - } + } while (nr_scanned < max_scan); spin_unlock_irq(&zone->lru_lock); done: pagevec_release(&pvec); @@ -1185,11 +1190,12 @@ done: * But we had to alter page->flags anyway. */ static void -refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc) +refill_inactive_zone(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { - int pgmoved; + unsigned long pgmoved; int pgdeactivate = 0; - int pgscanned; + unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1323,8 +1329,8 @@ refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void -shrink_zone(int priority, struct zone *zone, struct scan_control *sc) +static void shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; @@ -1387,8 +1393,8 @@ shrink_zone(int priority, struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void -shrink_caches(int priority, struct zone **zones, struct scan_control *sc) +static void shrink_caches(int priority, struct zone **zones, + struct scan_control *sc) { int i; @@ -1425,11 +1431,12 @@ shrink_caches(int priority, struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; - int total_scanned = 0, total_reclaimed = 0; + unsigned long total_scanned = 0; + unsigned long total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; int i; @@ -1525,13 +1532,15 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) { - int to_free = nr_pages; + unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; - int total_scanned, total_reclaimed; + unsigned long total_scanned; + unsigned long total_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -1776,22 +1785,23 @@ void wakeup_kswapd(struct zone *zone, int order) * Try to free `nr_pages' of memory, system-wide. Returns the number of freed * pages. */ -int shrink_all_memory(int nr_pages) +unsigned long shrink_all_memory(unsigned long nr_pages) { pg_data_t *pgdat; - int nr_to_free = nr_pages; - int ret = 0; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; for_each_pgdat(pgdat) { - int freed; + unsigned long freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; - if (nr_to_free <= 0) + if ((long)nr_to_free <= 0) break; } current->reclaim_state = NULL; @@ -1805,8 +1815,7 @@ int shrink_all_memory(int nr_pages) away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, void *hcpu) { pg_data_t *pgdat; cpumask_t mask; @@ -1826,10 +1835,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb, static int __init kswapd_init(void) { pg_data_t *pgdat; + swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + for_each_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + pgdat->kswapd = find_task_by_pid(pid); + } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1873,7 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ; */ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - const int nr_pages = 1 << order; + const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; @@ -1881,7 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), .nr_mapped = read_page_state(nr_mapped), - .swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, }; @@ -1966,4 +1981,3 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return __zone_reclaim(zone, gfp_mask, order); } #endif - -- cgit v1.1 From 05ff51376f01fd8837946a4f8144a84f6cc71c19 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:20 -0800 Subject: [PATCH] vmscan return nr_reclaimed Change all the vmscan functions to retunr the number-of-reclaimed pages and remove scan_conrtol.nr_reclaimed. Saves ten-odd bytes of text and makes things clearer and more consistent. The patch also changes the behaviour of zone_reclaim() when it falls back to slab shrinking. Christoph says "Setting this to one means that we will rescan and shrink the slab for each allocation if we are out of zone memory and RECLAIM_SLAB is set. Plus if we do an order 0 allocation we do not go off node as intended. "We better set this to zero. This means the allocation will go offnode despite us having potentially freed lots of memory on the zone. Future allocations can then again be done from this zone." Cc: Nick Piggin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 77 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 62cd7cd..8f6ad13 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,9 +55,6 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - unsigned long nr_mapped; /* From page_state */ /* This context's GFP mask */ @@ -409,7 +406,7 @@ cannot_free: } /* - * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed + * shrink_list return the number of reclaimed pages */ static unsigned long shrink_list(struct list_head *page_list, struct scan_control *sc) @@ -417,7 +414,7 @@ static unsigned long shrink_list(struct list_head *page_list, LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - unsigned long reclaimed = 0; + unsigned long nr_reclaimed = 0; cond_resched(); @@ -557,7 +554,7 @@ static unsigned long shrink_list(struct list_head *page_list, free_it: unlock_page(page); - reclaimed++; + nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) __pagevec_release_nonlru(&freed_pvec); continue; @@ -575,8 +572,7 @@ keep: if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); - sc->nr_reclaimed += reclaimed; - return reclaimed; + return nr_reclaimed; } #ifdef CONFIG_MIGRATION @@ -1107,14 +1103,15 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /* - * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed + * shrink_cache() return the number of reclaimed pages */ -static void shrink_cache(unsigned long max_scan, struct zone *zone, - struct scan_control *sc) +static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone, + struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; pagevec_init(&pvec, 1); @@ -1138,7 +1135,7 @@ static void shrink_cache(unsigned long max_scan, struct zone *zone, nr_scanned += nr_scan; nr_freed = shrink_list(&page_list, sc); - + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); @@ -1170,6 +1167,7 @@ static void shrink_cache(unsigned long max_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); done: pagevec_release(&pvec); + return nr_reclaimed; } /* @@ -1329,12 +1327,13 @@ refill_inactive_zone(unsigned long nr_pages, struct zone *zone, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone(int priority, struct zone *zone, - struct scan_control *sc) +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; atomic_inc(&zone->reclaim_in_progress); @@ -1368,13 +1367,14 @@ static void shrink_zone(int priority, struct zone *zone, nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); nr_inactive -= nr_to_scan; - shrink_cache(nr_to_scan, zone, sc); + nr_reclaimed += shrink_cache(nr_to_scan, zone, sc); } } throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); + return nr_reclaimed; } /* @@ -1393,9 +1393,10 @@ static void shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void shrink_caches(int priority, struct zone **zones, - struct scan_control *sc) +static unsigned long shrink_caches(int priority, struct zone **zones, + struct scan_control *sc) { + unsigned long nr_reclaimed = 0; int i; for (i = 0; zones[i] != NULL; i++) { @@ -1414,8 +1415,9 @@ static void shrink_caches(int priority, struct zone **zones, if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - shrink_zone(priority, zone, sc); + nr_reclaimed += shrink_zone(priority, zone, sc); } + return nr_reclaimed; } /* @@ -1436,7 +1438,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) int priority; int ret = 0; unsigned long total_scanned = 0; - unsigned long total_reclaimed = 0; + unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; int i; @@ -1462,18 +1464,16 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; - sc.nr_reclaimed = 0; if (!priority) disable_swap_token(); - shrink_caches(priority, zones, &sc); + nr_reclaimed += shrink_caches(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; + nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } total_scanned += sc.nr_scanned; - total_reclaimed += sc.nr_reclaimed; - if (total_reclaimed >= sc.swap_cluster_max) { + if (nr_reclaimed >= sc.swap_cluster_max) { ret = 1; goto out; } @@ -1540,7 +1540,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, int priority; int i; unsigned long total_scanned; - unsigned long total_reclaimed; + unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -1550,7 +1550,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, loop_again: total_scanned = 0; - total_reclaimed = 0; + nr_reclaimed = 0; sc.may_writepage = !laptop_mode, sc.nr_mapped = read_page_state(nr_mapped); @@ -1632,13 +1632,11 @@ scan: if (zone->prev_priority > priority) zone->prev_priority = priority; sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - shrink_zone(priority, zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; + nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; @@ -1651,10 +1649,10 @@ scan: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) + total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > total_reclaimed) + if (nr_pages && to_free > nr_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ @@ -1671,7 +1669,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) + if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) break; } out: @@ -1685,7 +1683,7 @@ out: goto loop_again; } - return total_reclaimed; + return nr_reclaimed; } /* @@ -1891,6 +1889,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; + unsigned long nr_reclaimed = 0; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), @@ -1917,11 +1916,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - shrink_zone(priority, zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); priority--; - } while (priority >= 0 && sc.nr_reclaimed < nr_pages); + } while (priority >= 0 && nr_reclaimed < nr_pages); - if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* * shrink_slab does not currently allow us to determine * how many pages were freed in the zone. So we just @@ -1936,10 +1935,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - if (sc.nr_reclaimed == 0) + if (nr_reclaimed == 0) zone->last_unsuccessful_zone_reclaim = jiffies; - return sc.nr_reclaimed >= nr_pages; + return nr_reclaimed >= nr_pages; } int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) -- cgit v1.1 From 1742f19fa920cdd6905f0db5898524dde22ab2a4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 22 Mar 2006 00:08:21 -0800 Subject: [PATCH] vmscan: rename functions We have: try_to_free_pages ->shrink_caches(struct zone **zones, ..) ->shrink_zone(struct zone *, ...) ->shrink_cache(struct zone *, ...) ->shrink_list(struct list_head *, ...) ->refill_inactive_list((struct zone *, ...) which is fairly irrational. Rename things so that we have try_to_free_pages ->shrink_zones(struct zone **zones, ..) ->shrink_zone(struct zone *, ...) ->shrink_inactive_list(struct zone *, ...) ->shrink_page_list(struct list_head *, ...) ->shrink_active_list(struct zone *, ...) Cc: Nick Piggin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 8f6ad13..2d5d486 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -298,7 +298,8 @@ static void handle_write_error(struct address_space *mapping, } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping) { @@ -406,10 +407,10 @@ cannot_free: } /* - * shrink_list return the number of reclaimed pages + * shrink_page_list() returns the number of reclaimed pages */ -static unsigned long shrink_list(struct list_head *page_list, - struct scan_control *sc) +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; @@ -1103,10 +1104,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /* - * shrink_cache() return the number of reclaimed pages + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages */ -static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone, - struct scan_control *sc) +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; @@ -1134,7 +1136,7 @@ static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone, goto done; nr_scanned += nr_scan; - nr_freed = shrink_list(&page_list, sc); + nr_freed = shrink_page_list(&page_list, sc); nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { @@ -1187,9 +1189,8 @@ done: * The downside is that we have to touch page->_count against each page. * But we had to alter page->flags anyway. */ -static void -refill_inactive_zone(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc) +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { unsigned long pgmoved; int pgdeactivate = 0; @@ -1360,14 +1361,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); nr_active -= nr_to_scan; - refill_inactive_zone(nr_to_scan, zone, sc); + shrink_active_list(nr_to_scan, zone, sc); } if (nr_inactive) { nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); nr_inactive -= nr_to_scan; - nr_reclaimed += shrink_cache(nr_to_scan, zone, sc); + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); } } @@ -1393,7 +1395,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static unsigned long shrink_caches(int priority, struct zone **zones, +static unsigned long shrink_zones(int priority, struct zone **zones, struct scan_control *sc) { unsigned long nr_reclaimed = 0; @@ -1466,7 +1468,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_caches(priority, zones, &sc); + nr_reclaimed += shrink_zones(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; -- cgit v1.1 From 7fb2d46d396b2491818f8e43b01049b3234e6c07 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:08:22 -0800 Subject: [PATCH] zone_reclaim: additional comments and cleanup Add some comments to explain how zone reclaim works. And it fixes the following issues: - PF_SWAPWRITE needs to be set for RECLAIM_SWAP to be able to write out pages to swap. Currently RECLAIM_SWAP may not do that. - remove setting nr_reclaimed pages after slab reclaim since the slab shrinking code does not use that and the nr_reclaimed pages is just right for the intended follow up action. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2d5d486..c712b94 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1887,6 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ; */ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { + /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; @@ -1924,9 +1925,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* - * shrink_slab does not currently allow us to determine - * how many pages were freed in the zone. So we just - * shake the slab and then go offnode for a single allocation. + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we just shake the slab + * a bit and then go off node for this particular allocation + * despite possibly having freed enough memory to allocate in + * this zone. If we freed local memory then the next + * allocations will be local again. * * shrink_slab will free memory on all zones and may take * a long time. @@ -1937,8 +1941,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - if (nr_reclaimed == 0) + if (nr_reclaimed == 0) { + /* + * We were unable to reclaim enough pages to stay on node. We + * now allow off node accesses for a certain time period before + * trying again to reclaim pages from the local zone. + */ zone->last_unsuccessful_zone_reclaim = jiffies; + } return nr_reclaimed >= nr_pages; } -- cgit v1.1 From c9b02d970c385a253edb36c87643b0df706b50b4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 22 Mar 2006 00:08:23 -0800 Subject: [PATCH] mm: isolate_lru_pages() scan count fix In isolate_lru_pages(), *scanned reports one more scan because the scan counter is increased one more time on exit of the while-loop. Change the while-loop to for-loop to fix it. Signed-off-by: Nick Piggin Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c712b94..85e95f4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1074,9 +1074,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, { unsigned long nr_taken = 0; struct page *page; - unsigned long scan = 0; + unsigned long scan; - while (scan++ < nr_to_scan && !list_empty(src)) { + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); -- cgit v1.1 From fb8d14e172a29ba5ac69a73b61196be86fdfc3e1 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 22 Mar 2006 00:08:28 -0800 Subject: [PATCH] mm: shrink_inactive_lis() nr_scan accounting fix In shrink_inactive_list(), nr_scan is not accounted when nr_taken is 0. But 0 pages taken does not mean 0 pages scanned. Move the goto statement below the accounting code to fix it. Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 85e95f4..486184d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1132,9 +1132,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - nr_scanned += nr_scan; nr_freed = shrink_page_list(&page_list, sc); nr_reclaimed += nr_freed; @@ -1146,6 +1143,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_page_state_zone(zone, pgscan_direct, nr_scan); __mod_page_state_zone(zone, pgsteal, nr_freed); + if (nr_taken == 0) + goto done; + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. @@ -1166,8 +1166,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } } } while (nr_scanned < max_scan); - spin_unlock_irq(&zone->lru_lock); + spin_unlock(&zone->lru_lock); done: + local_irq_enable(); pagevec_release(&pvec); return nr_reclaimed; } -- cgit v1.1 From a6f563db09c54c80d80e9013182dc512a5e53d0f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 22 Mar 2006 00:08:29 -0800 Subject: [PATCH] remove VM_DONTCOPY bogosities Now that it's madvisable, remove two pieces of VM_DONTCOPY bogosity: 1. There was and is no logical reason why VM_DONTCOPY should be in the list of flags which forbid vma merging (and those drivers which set it are also setting VM_IO, which itself forbids the merge). 2. It's hard to understand the purpose of the VM_HUGETLB, VM_DONTCOPY block in vm_stat_account: but never mind, it's under CONFIG_HUGETLB, which (unlike CONFIG_HUGETLB_PAGE or CONFIG_HUGETLBFS) has never been defined. Signed-off-by: Hugh Dickins Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 47556d2..0eb9894 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) @@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, const unsigned long stack_flags = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); -#ifdef CONFIG_HUGETLB - if (flags & VM_HUGETLB) { - if (!(flags & VM_DONTCOPY)) - mm->shared_vm += pages; - return; - } -#endif /* CONFIG_HUGETLB */ - if (file) { mm->shared_vm += pages; if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) -- cgit v1.1 From 0f8053a509ceba4a077a50ea7b77039b5559b428 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:33 -0800 Subject: [PATCH] mm: make __put_page internal Remove __put_page from outside the core mm/. It is dangerous because it does not handle compound pages nicely, and misses 1->0 transitions. If a user later appears that really needs the extra speed we can reevaluate. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 ++ mm/internal.h | 11 +++++++++++ mm/vmscan.c | 2 ++ 3 files changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 44da3d4..e8f58f7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -30,6 +30,8 @@ #include #include #include "filemap.h" +#include "internal.h" + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ diff --git a/mm/internal.h b/mm/internal.h index 17256bb..e3042db 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,6 +8,10 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H + +#include static inline void set_page_refs(struct page *page, int order) { @@ -26,5 +30,12 @@ static inline void set_page_refs(struct page *page, int order) #endif /* CONFIG_MMU */ } +static inline void __put_page(struct page *page) +{ + atomic_dec(&page->_count); +} + extern void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order); + +#endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 486184d..3914a94 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -39,6 +39,8 @@ #include +#include "internal.h" + /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ -- cgit v1.1 From 84097518d1ecd2330f9488e4c2d09953a3340e74 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:34 -0800 Subject: [PATCH] mm: nommu use compound pages Now that compound page handling is properly fixed in the VM, move nommu over to using compound pages rather than rolling their own refcounting. nommu vm page refcounting is broken anyway, but there is no need to have divergent code in the core VM now, nor when it gets fixed. Signed-off-by: Nick Piggin Cc: David Howells (Needs testing, please). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 12 ------------ mm/nommu.c | 4 ++-- mm/page_alloc.c | 7 ------- mm/slab.c | 9 ++++++++- 4 files changed, 10 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index e3042db..7bb3397 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -15,19 +15,7 @@ static inline void set_page_refs(struct page *page, int order) { -#ifdef CONFIG_MMU set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ } static inline void __put_page(struct page *page) diff --git a/mm/nommu.c b/mm/nommu.c index 4951f47..db45efa 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) /* * kmalloc doesn't like __GFP_HIGHMEM for some reason */ - return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); } struct page * vmalloc_to_page(void *addr) @@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) * - note that this may not return a page-aligned address if the object * we're allocating is smaller than a page */ - base = kmalloc(len, GFP_KERNEL); + base = kmalloc(len, GFP_KERNEL|__GFP_COMP); if (!base) goto enomem; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7aa0181..e197818 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -422,11 +422,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) mutex_debug_check_no_locks_freed(page_address(page), PAGE_SIZE<lru.next; } @@ -600,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { + if (unlikely(PageCompound(page))) + page = (struct page *)page_private(page); return (struct slab *)page->lru.prev; } @@ -2412,8 +2416,11 @@ static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, struct page *page; /* Nasty!!!!!! I hope this is OK. */ - i = 1 << cachep->gfporder; page = virt_to_page(objp); + + i = 1; + if (likely(!PageCompound(page))) + i <<= cachep->gfporder; do { page_set_cache(page, cachep); page_set_slab(page, slabp); -- cgit v1.1 From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:40 -0800 Subject: [PATCH] remove set_page_count() outside mm/ set_page_count usage outside mm/ is limited to setting the refcount to 1. Remove set_page_count from outside mm/, and replace those users with init_page_count() and set_page_refcounted(). This allows more debug checking, and tighter control on how code is allowed to play around with page->_count. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- mm/internal.h | 13 ++++++++++++- mm/page_alloc.c | 14 ++++++-------- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 39d49ec..20117a4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include +#include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static unsigned long nr_huge_pages, free_huge_pages; @@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) return NULL; } spin_unlock(&hugetlb_lock); - set_page_count(page, 1); + set_page_refcounted(page); for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) clear_user_highpage(&page[i], addr); return page; @@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page) 1 << PG_private | 1<< PG_writeback); } page[1].lru.next = NULL; - set_page_count(page, 1); + set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } diff --git a/mm/internal.h b/mm/internal.h index 7bb3397..d20e3cc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,8 +13,19 @@ #include -static inline void set_page_refs(struct page *page, int order) +static inline void set_page_count(struct page *page, int v) { + atomic_set(&page->_count, v); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) +{ + BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e197818..7f65b5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -442,7 +442,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) if (order == 0) { __ClearPageReserved(page); set_page_count(page, 0); - set_page_refs(page, 0); + set_page_refcounted(page); __free_page(page); } else { int loop; @@ -457,7 +457,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) set_page_count(p, 0); } - set_page_refs(page, order); + set_page_refcounted(page); __free_pages(page, order); } } @@ -525,7 +525,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); return 0; } @@ -755,10 +755,8 @@ void split_page(struct page *page, unsigned int order) BUG_ON(PageCompound(page)); BUG_ON(!page_count(page)); - for (i = 1; i < (1 << order); i++) { - BUG_ON(page_count(page + i)); - set_page_count(page + i, 1); - } + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); } /* @@ -1771,7 +1769,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); -- cgit v1.1 From 17cf44064ae744f081309108fa67f0e942b10167 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:41 -0800 Subject: [PATCH] mm: cleanup prep_ stuff Move the prep_ stuff into prep_new_page. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f65b5a..bdff858 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -212,6 +212,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) } } +static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + /* * function for dealing with page's order in buddy system. * zone->lock is already acquired when we use these. @@ -496,7 +505,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -527,6 +536,13 @@ static int prep_new_page(struct page *page, int order) set_page_private(page, 0); set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + return 0; } @@ -732,15 +748,6 @@ void fastcall free_cold_page(struct page *page) free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) -{ - int i; - - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); - for(i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Wed, 22 Mar 2006 00:08:42 -0800 Subject: [PATCH] mm: prep_zero_page() in irq is a bug prep_zero_page() uses KM_USER0 and hence may not be used from IRQ context, at least for highmem pages. Cc: Nick Piggin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdff858..ed91684 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -217,6 +217,11 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) int i; BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. + */ + BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); for (i = 0; i < (1 << order); i++) clear_highpage(page + i); } -- cgit v1.1 From b7ab795b7bec9997d4fde39f249d52823d36d98d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Mar 2006 00:08:42 -0800 Subject: [PATCH] mm: more CONFIG_DEBUG_VM Put a few more checks under CONFIG_DEBUG_VM Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 +++--------- mm/rmap.c | 9 ++++----- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6af555c..71bc664 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -388,7 +388,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ { unsigned long pfn = pte_pfn(pte); - if (vma->vm_flags & VM_PFNMAP) { + if (unlikely(vma->vm_flags & VM_PFNMAP)) { unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; @@ -396,18 +396,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } - /* - * Add some anal sanity checks for now. Eventually, - * we should just do "return pfn_to_page(pfn)", but - * in the meantime we check that we get a valid pfn, - * and that the resulting page looks ok. - * - * Remove this test eventually! - */ +#ifdef CONFIG_DEBUG_VM if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } +#endif /* * NOTE! We still have PageReserved() pages in the page diff --git a/mm/rmap.c b/mm/rmap.c index 134aef9..1963e26 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,13 +56,11 @@ #include -//#define RMAP_DEBUG /* can be enabled only for debugging */ - struct kmem_cache *anon_vma_cachep; static inline void validate_anon_vma(struct vm_area_struct *find_vma) { -#ifdef RMAP_DEBUG +#ifdef CONFIG_DEBUG_VM struct anon_vma *anon_vma = find_vma->anon_vma; struct vm_area_struct *vma; unsigned int mapcount = 0; @@ -551,13 +549,14 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (page_mapcount(page) < 0) { +#ifdef CONFIG_DEBUG_VM + if (unlikely(page_mapcount(page) < 0)) { printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); printk (KERN_EMERG " page->flags = %lx\n", page->flags); printk (KERN_EMERG " page->count = %x\n", page_count(page)); printk (KERN_EMERG " page->mapping = %p\n", page->mapping); } - +#endif BUG_ON(page_mapcount(page) < 0); /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.1 From 6e5ef1a96e6e3b123da56292bc35017c8c401491 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:08:45 -0800 Subject: [PATCH] vmscan: emove obsolete checks from shrink_list() and fix unlikely in refill_inactive_zone() As suggested by Marcelo: 1. The optimization introduced recently for not calling page_referenced() during zone reclaim makes two additional checks in shrink_list unnecessary. 2. The if (unlikely(sc->may_swap)) in refill_inactive_zone is optimized for the zone_reclaim case. However, most peoples system only does swap. Undo that. Signed-off-by: Christoph Lameter Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3914a94..f713e9f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -460,12 +460,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; + if (PageAnon(page) && !PageSwapCache(page)) if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; - } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -477,12 +474,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - /* - * No unmapping if we do not swap - */ - if (!sc->may_swap) - goto keep_locked; - switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; @@ -1205,7 +1196,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct pagevec pvec; int reclaim_mapped = 0; - if (unlikely(sc->may_swap)) { + if (sc->may_swap) { long mapped_ratio; long distress; long swap_tendency; -- cgit v1.1 From d15c023b44e5d323f1f4130b85d29f08e43433b1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 22 Mar 2006 00:08:46 -0800 Subject: [PATCH] shmem: inline to avoid warning shmem.c was named and shamed in Jesper's "Building 100 kernels" warnings: shmem_parse_mpol is only used when CONFIG_TMPFS parses mount options; and only called from that one site, so mark it inline like its non-NUMA stub. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index f523a15..37eaf42e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -875,7 +875,7 @@ redirty: } #ifdef CONFIG_NUMA -static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) +static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) { char *nodelist = strchr(value, ':'); int err = 1; -- cgit v1.1 From a564da3964db3256069190c2ae95069143ac37fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Mar 2006 00:08:47 -0800 Subject: [PATCH] readahead: ->prev_page can overrun the ahead window If get_next_ra_size() does not grow fast enough, ->prev_page can overrun the ahead window. This means the caller will read the pages from ->ahead_start + ->ahead_size to ->prev_page synchronously. Signed-off-by: Oleg Nesterov Cc: Steven Pratt Cc: Ram Pai Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaa..57557e2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; } +static inline void reset_ahead_window(struct file_ra_state *ra) +{ + /* + * ... but preserve ahead_start + ahead_size value, + * see 'recheck:' label in page_cache_readahead(). + * Note: We never use ->ahead_size as rvalue without + * checking ->ahead_start != 0 first. + */ + ra->ahead_size += ra->ahead_start; + ra->ahead_start = 0; +} + static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; ra->flags = 0; ra->size = 0; - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); return; } @@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, * congestion. The ahead window will any way be closed * in case we failed due to excessive page cache hits. */ - ra->ahead_start = 0; - ra->ahead_size = 0; + reset_ahead_window(ra); } return ret; @@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * If we get here we are doing sequential IO and this was not the first * occurence (ie we have an existing window) */ - if (ra->ahead_start == 0) { /* no ahead window yet */ if (!make_ahead_window(mapping, filp, ra, 0)) - goto out; + goto recheck; } + /* * Already have an ahead window, check if we crossed into it. * If so, shift windows and issue a new ahead window. @@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); +recheck: + /* prev_page shouldn't overrun the ahead window */ + ra->prev_page = min(ra->prev_page, + ra->ahead_start + ra->ahead_size - 1); } out: -- cgit v1.1 From aed75ff3caafce404d9be7f0c088716375be5279 Mon Sep 17 00:00:00 2001 From: Steven Pratt Date: Wed, 22 Mar 2006 00:08:48 -0800 Subject: [PATCH] readahead: fix initial window size calculation The current current get_init_ra_size is not optimal across different IO sizes and max_readahead values. Here is a quick summary of sizes computed under current design and under the attached patch. All of these assume 1st IO at offset 0, or 1st detected sequential IO. 32k max, 4k request old new ----------------- 8k 8k 16k 16k 32k 32k 128k max, 4k request old new ----------------- 32k 16k 64k 32k 128k 64k 128k 128k 128k max, 32k request old new ----------------- 32k 64k <----- 64k 128k 128k 128k 512k max, 4k request old new ----------------- 4k 32k <---- 16k 64k 64k 128k 128k 256k 512k 512k Cc: Oleg Nesterov Cc: Steven Pratt Cc: Ram Pai Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 57557e2..301b36c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -83,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); - if (newsize <= max / 64) - newsize = newsize * newsize; + if (newsize <= max / 32) + newsize = newsize * 4; else if (newsize <= max / 4) - newsize = max / 4; + newsize = newsize * 2; else newsize = max; return newsize; -- cgit v1.1 From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Wed, 22 Mar 2006 00:08:50 -0800 Subject: [PATCH] Enable mprotect on huge pages 2.6.16-rc3 uses hugetlb on-demand paging, but it doesn_t support hugetlb mprotect. From: David Gibson Remove a test from the mprotect() path which checks that the mprotect()ed range on a hugepage VMA is hugepage aligned (yes, really, the sense of is_aligned_hugepage_range() is the opposite of what you'd guess :-/). In fact, we don't need this test. If the given addresses match the beginning/end of a hugepage VMA they must already be suitably aligned. If they don't, then mprotect_fixup() will attempt to split the VMA. The very first test in split_vma() will check for a badly aligned address on a hugepage VMA and return -EINVAL if necessary. From: "Chen, Kenneth W" On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE. The identify of hugetlb pte is lost when changing page protection via mprotect. A page fault occurs later will trigger a bug check in huge_pte_alloc(). The fix is to always make new pte a hugetlb pte and also to clean up legacy code where _PAGE_PRESENT is forced on in the pre-faulting day. Signed-off-by: Zhang Yanmin Cc: David Gibson Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: William Lee Irwin III Signed-off-by: Ken Chen Signed-off-by: Nishanth Aravamudan Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 29 +++++++++++++++++++++++++++++ mm/mprotect.c | 12 +++++------- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20117a4..783098f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i; } + +void hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + spin_lock(&mm->page_table_lock); + for (; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + if (!pte_none(*ptep)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(pte_modify(pte, newprot)); + set_huge_pte_at(mm, address, ptep, pte); + lazy_mmu_prot_update(pte); + } + } + spin_unlock(&mm->page_table_lock); + + flush_tlb_range(vma, start, end); +} + diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b857..4c14d42 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -166,7 +166,10 @@ success: */ vma->vm_flags = newflags; vma->vm_page_prot = newprot; - change_protection(vma, start, end, newprot); + if (is_vm_hugetlb_page(vma)) + hugetlb_change_protection(vma, start, end, newprot); + else + change_protection(vma, start, end, newprot); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - if (is_vm_hugetlb_page(vma)) { - error = -EACCES; - goto out; - } - newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ -- cgit v1.1 From 79ac6ba40eb8d70f0d204e98ae9b63280ad1018c Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:51 -0800 Subject: [PATCH] hugepage: Small fixes to hugepage clear/copy path Move the loops used in mm/hugetlb.c to clear and copy hugepages to their own functions for clarity. As we do so, we add some checks of need_resched - we are, after all copying megabytes of memory here. We also add might_sleep() accordingly. We generally dropped locks around the clear and copy, already but not everyone has PREEMPT enabled, so we should still be checking explicitly. For this to work, we need to remove the clear_huge_page() from alloc_huge_page(), which is called with the page_table_lock held in the COW path. We move the clear_huge_page() to just after the alloc_huge_page() in the hugepage no-page path. In the COW path, the new page is about to be copied over, so clearing it was just a waste of time anyway. So as a side effect we also fix the fact that we held the page_table_lock for far too long in this path by calling alloc_huge_page() under it. It causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 783098f..41b1038 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,29 @@ static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static void clear_huge_page(struct page *page, unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + cond_resched(); + clear_user_highpage(page + i, addr); + } +} + +static void copy_huge_page(struct page *dst, struct page *src, + unsigned long addr) +{ + int i; + + might_sleep(); + for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); + } +} + /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ @@ -98,7 +121,6 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { struct page *page; - int i; spin_lock(&hugetlb_lock); page = dequeue_huge_page(vma, addr); @@ -108,8 +130,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) } spin_unlock(&hugetlb_lock); set_page_refcounted(page); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) - clear_user_highpage(&page[i], addr); return page; } @@ -367,7 +387,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte) { struct page *old_page, *new_page; - int i, avoidcopy; + int avoidcopy; old_page = pte_page(pte); @@ -388,9 +408,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) - copy_user_highpage(new_page + i, old_page + i, - address + i*PAGE_SIZE); + copy_huge_page(new_page, old_page, address); spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & HPAGE_MASK); @@ -435,6 +453,7 @@ retry: ret = VM_FAULT_OOM; goto out; } + clear_huge_page(page, address); if (vma->vm_flags & VM_SHARED) { int err; -- cgit v1.1 From 3935baa9bcda3ccaee4f7849f5157d316e34412e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:53 -0800 Subject: [PATCH] hugepage: serialize hugepage allocation and instantiation Currently, no lock or mutex is held between allocating a hugepage and inserting it into the pagetables / page cache. When we do go to insert the page into pagetables or page cache, we recheck and may free the newly allocated hugepage. However, since the number of hugepages in the system is strictly limited, and it's usualy to want to use all of them, this can still lead to spurious allocation failures. For example, suppose two processes are both mapping (MAP_SHARED) the same hugepage file, large enough to consume the entire available hugepage pool. If they race instantiating the last page in the mapping, they will both attempt to allocate the last available hugepage. One will fail, of course, returning OOM from the fault and thus causing the process to be killed, despite the fact that the entire mapping can, in fact, be instantiated. The patch fixes this race by the simple method of adding a (sleeping) mutex to serialize the hugepage fault path between allocation and insertion into pagetables and/or page cache. It would be possible to avoid the serialization by catching the allocation failures, waiting on some condition, then rechecking to see if someone else has instantiated the page for us. Given the likely frequency of hugepage instantiations, it seems very doubtful it's worth the extra complexity. This patch causes no regression on the libhugetlbfs testsuite, and one test, which can trigger this race now passes where it previously failed. Actually, the test still sometimes fails, though less often and only as a shmat() failure, rather processes getting OOM killed by the VM. The dodgy heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage space aren't protected by the new mutex, and would be ugly to do so, so there's still a race there. Another patch to replace those tests with something saner for this reason as well as others coming... Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41b1038..d5987a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,10 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +/* + * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + */ +static DEFINE_SPINLOCK(hugetlb_lock); static void clear_huge_page(struct page *page, unsigned long addr) { @@ -50,11 +55,6 @@ static void copy_huge_page(struct page *dst, struct page *src, } } -/* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages - */ -static DEFINE_SPINLOCK(hugetlb_lock); - static void enqueue_huge_page(struct page *page) { int nid = page_to_nid(page); @@ -508,14 +508,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + static DEFINE_MUTEX(hugetlb_instantiation_mutex); ptep = huge_pte_alloc(mm, address); if (!ptep) return VM_FAULT_OOM; + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mutex_lock(&hugetlb_instantiation_mutex); entry = *ptep; - if (pte_none(entry)) - return hugetlb_no_page(mm, vma, address, ptep, write_access); + if (pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + mutex_unlock(&hugetlb_instantiation_mutex); + return ret; + } ret = VM_FAULT_MINOR; @@ -525,6 +535,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); + mutex_unlock(&hugetlb_instantiation_mutex); return ret; } -- cgit v1.1 From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:55 -0800 Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes These days, hugepages are demand-allocated at first fault time. There's a somewhat dubious (and racy) heuristic when making a new mmap() to check if there are enough available hugepages to fully satisfy that mapping. A particularly obvious case where the heuristic breaks down is where a process maps its hugepages not as a single chunk, but as a bunch of individually mmap()ed (or shmat()ed) blocks without touching and instantiating the pages in between allocations. In this case the size of each block is compared against the total number of available hugepages. It's thus easy for the process to become overcommitted, because each block mapping will succeed, although the total number of hugepages required by all blocks exceeds the number available. In particular, this defeats such a program which will detect a mapping failure and adjust its hugepage usage downward accordingly. The patch below addresses this problem, by strictly reserving a number of physical hugepages for hugepage inodes which have been mapped, but not instatiated. MAP_SHARED mappings are thus "safe" - they will fail on mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still trigger an OOM. (Actually SHARED mappings can technically still OOM, but only if the sysadmin explicitly reduces the hugepage pool between mapping and instantiation) This patch appears to address the problem at hand - it allows DB2 to start correctly, for instance, which previously suffered the failure described above. This patch causes no regressions on the libhugetblfs testsuite, and makes a test (designed to catch this problem) pass which previously failed (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 126 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d5987a8..27fad5d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -120,17 +120,136 @@ void free_huge_page(struct page *page) struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { + struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; + int use_reserve = 0; + unsigned long idx; spin_lock(&hugetlb_lock); - page = dequeue_huge_page(vma, addr); - if (!page) { - spin_unlock(&hugetlb_lock); - return NULL; + + if (vma->vm_flags & VM_MAYSHARE) { + + /* idx = radix tree index, i.e. offset into file in + * HPAGE_SIZE units */ + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* The hugetlbfs specific inode info stores the number + * of "guaranteed available" (huge) pages. That is, + * the first 'prereserved_hpages' pages of the inode + * are either already instantiated, or have been + * pre-reserved (by hugetlb_reserve_for_inode()). Here + * we're in the process of instantiating the page, so + * we use this to determine whether to draw from the + * pre-reserved pool or the truly free pool. */ + if (idx < HUGETLBFS_I(inode)->prereserved_hpages) + use_reserve = 1; + } + + if (!use_reserve) { + if (free_huge_pages <= reserved_huge_pages) + goto fail; + } else { + BUG_ON(reserved_huge_pages == 0); + reserved_huge_pages--; } + + page = dequeue_huge_page(vma, addr); + if (!page) + goto fail; + spin_unlock(&hugetlb_lock); set_page_refcounted(page); return page; + + fail: + WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ + spin_unlock(&hugetlb_lock); + return NULL; +} + +/* hugetlb_extend_reservation() + * + * Ensure that at least 'atleast' hugepages are, and will remain, + * available to instantiate the first 'atleast' pages of the given + * inode. If the inode doesn't already have this many pages reserved + * or instantiated, set aside some hugepages in the reserved pool to + * satisfy later faults (or fail now if there aren't enough, rather + * than getting the SIGBUS later). + */ +int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, + unsigned long atleast) +{ + struct inode *inode = &info->vfs_inode; + unsigned long change_in_reserve = 0; + int ret = 0; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages >= atleast) + goto out; + + /* Because we always call this on shared mappings, none of the + * pages beyond info->prereserved_hpages can have been + * instantiated, so we need to reserve all of them now. */ + change_in_reserve = atleast - info->prereserved_hpages; + + if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { + ret = -ENOMEM; + goto out; + } + + reserved_huge_pages += change_in_reserve; + info->prereserved_hpages = atleast; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); + + return ret; +} + +/* hugetlb_truncate_reservation() + * + * This returns pages reserved for the given inode to the general free + * hugepage pool. If the inode has any pages prereserved, but not + * instantiated, beyond offset (atmost << HPAGE_SIZE), then release + * them. + */ +void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, + unsigned long atmost) +{ + struct inode *inode = &info->vfs_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long idx; + unsigned long change_in_reserve = 0; + struct page *page; + + spin_lock(&hugetlb_lock); + read_lock_irq(&inode->i_mapping->tree_lock); + + if (info->prereserved_hpages <= atmost) + goto out; + + /* Count pages which were reserved, but not instantiated, and + * which we can now release. */ + for (idx = atmost; idx < info->prereserved_hpages; idx++) { + page = radix_tree_lookup(&mapping->page_tree, idx); + if (!page) + /* Pages which are already instantiated can't + * be unreserved (and in fact have already + * been removed from the reserved pool) */ + change_in_reserve++; + } + + BUG_ON(reserved_huge_pages < change_in_reserve); + reserved_huge_pages -= change_in_reserve; + info->prereserved_hpages = atmost; + + out: + read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock(&hugetlb_lock); } static int __init hugetlb_init(void) @@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, + reserved_huge_pages, HPAGE_SIZE/1024); } @@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) nid, free_huge_pages_node[nid]); } -int is_hugepage_mem_enough(size_t size) -{ - return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; -} - /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { -- cgit v1.1 From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:56 -0800 Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local Originally, mm/hugetlb.c just handled the hugepage physical allocation path and its {alloc,free}_huge_page() functions were used from the arch specific hugepage code. These days those functions are only used with mm/hugetlb.c itself. Therefore, this patch makes them static and removes their prototypes from hugetlb.h. This requires a small rearrangement of code in mm/hugetlb.c to avoid a forward declaration. This patch causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27fad5d..075877b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, return page; } +static void free_huge_page(struct page *page) +{ + BUG_ON(page_count(page)); + + INIT_LIST_HEAD(&page->lru); + + spin_lock(&hugetlb_lock); + enqueue_huge_page(page); + spin_unlock(&hugetlb_lock); +} + static int alloc_fresh_huge_page(void) { static int nid = 0; @@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void) return 0; } -void free_huge_page(struct page *page) -{ - BUG_ON(page_count(page)); - - INIT_LIST_HEAD(&page->lru); - - spin_lock(&hugetlb_lock); - enqueue_huge_page(page); - spin_unlock(&hugetlb_lock); -} - -struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; -- cgit v1.1 From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:57 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() free_pgtables() has special logic to call hugetlb_free_pgd_range() instead of the normal free_pgd_range() on hugepage VMAs. However, the test it uses to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized range at the start of the vma. is_hugepage_only_range() will return true if the given range has any intersection with a hugepage address region, and in this case the given region need not be hugepage aligned. So, for example, this test can return true if called on, say, a 4k VMA immediately preceding a (nicely aligned) hugepage VMA. At present we get away with this because the powerpc version of hugetlb_free_pgd_range() is just a call to free_pgd_range(). On ia64 (the only other arch with a non-trivial is_hugepage_only_range()) we get away with it for a different reason; the hugepage area is not contiguous with the rest of the user address space, and VMAs are not permitted in between, so the test can't return a false positive there. Nonetheless this should be fixed. We do that in the patch below by replacing the is_hugepage_only_range() test with an explicit test of the VMA using is_vm_hugetlb_page(). This in turn changes behaviour for platforms where is_hugepage_only_range() returns false always (everything except powerpc and ia64). We address this by ensuring that hugetlb_free_pgd_range() is defined to be identical to free_pgd_range() (instead of a no-op) on everything except ia64. Even so, it will prevent some otherwise possible coalescing of calls down to free_pgd_range(). Since this only happens for hugepage VMAs, removing this small optimization seems unlikely to cause any trouble. This patch causes no regressions on the libhugetlbfs testsuite - ppc64 POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP). Signed-off-by: David Gibson Cc: William Lee Irwin III Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 71bc664..f6e3be9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, anon_vma_unlink(vma); unlink_file_vma(vma); - if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_hugepage_only_range(vma->vm_mm, next->vm_start, - HPAGE_SIZE)) { + && !is_vm_hugetlb_page(vma)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); -- cgit v1.1 From 4866920b93fd7d5b520278c3c76e6f4d5a352d81 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 22 Mar 2006 00:08:58 -0800 Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() harder Turns out the hugepage logic in free_pgtables() was doubly broken. The loop coalescing multiple normal page VMAs into one call to free_pgd_range() had an off by one error, which could mean it would coalesce one hugepage VMA into the same bundle (checking 'vma' not 'next' in the loop). I transferred this bug into the new is_vm_hugetlb_page() based version. Here's the fix. This one didn't bite on powerpc previously for the same reason the is_hugepage_only_range() problem didn't: powerpc's hugetlb_free_pgd_range() is identical to free_pgd_range(). It didn't bite on ia64 because the hugepage region is distant enough from any other region that the separated PMD_SIZE distance test would always prevent coalescing the two together. No libhugetlbfs testsuite regressions (ppc64, POWER5). Signed-off-by: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index f6e3be9..80c3fb3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -285,7 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(vma)) { + && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); -- cgit v1.1 From d5d4b0aa4e1430d73050babba999365593bdb9d2 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 22 Mar 2006 00:09:03 -0800 Subject: [PATCH] optimize follow_hugetlb_page follow_hugetlb_page() walks a range of user virtual address and then fills in list of struct page * into an array that is passed from the argument list. It also gets a reference count via get_page(). For compound page, get_page() actually traverse back to head page via page_private() macro and then adds a reference count to the head page. Since we are doing a virt to pte look up, kernel already has a struct page pointer into the head page. So instead of traverse into the small unit page struct and then follow a link back to the head page, optimize that with incrementing the reference count directly on the head page. The benefit is that we don't take a cache miss on accessing page struct for the corresponding user address and more importantly, not to pollute the cache with a "not very useful" round trip of pointer chasing. This adds a moderate performance gain on an I/O intensive database transaction workload. Signed-off-by: Ken Chen Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 075877b..06699d8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -661,10 +661,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i) { - unsigned long vpfn, vaddr = *position; + unsigned long pfn_offset; + unsigned long vaddr = *position; int remainder = *length; - vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -692,19 +692,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (pages) { - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - get_page(page); - pages[i] = page; - } + pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + page = pte_page(*pte); +same_page: + get_page(page); + if (pages) + pages[i] = page + pfn_offset; if (vmas) vmas[i] = vma; vaddr += PAGE_SIZE; - ++vpfn; + ++pfn_offset; --remainder; ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. + */ + goto same_page; + } } spin_unlock(&mm->page_table_lock); *length = remainder; -- cgit v1.1 From 248a0301e703cbf781aa02a91bcfc6da75870dd7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Mar 2006 00:09:04 -0800 Subject: [PATCH] mm: make shrink_all_memory try harder Make shrink_all_memory() repeat the attempts to free more memory if there seems to be no pages to free. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f713e9f..548e023 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1784,11 +1785,13 @@ unsigned long shrink_all_memory(unsigned long nr_pages) pg_data_t *pgdat; unsigned long nr_to_free = nr_pages; unsigned long ret = 0; + unsigned retry = 2; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; +repeat: for_each_pgdat(pgdat) { unsigned long freed; @@ -1798,6 +1801,10 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if ((long)nr_to_free <= 0) break; } + if (retry-- && ret < nr_pages) { + blk_congestion_wait(WRITE, HZ/5); + goto repeat; + } current->reclaim_state = NULL; return ret; } -- cgit v1.1 From 35386e3b0f876bf194982f48f027af0c216499ce Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:05 -0800 Subject: [PATCH] slab: cache_reap(): further reduction in interrupt holdoff cache_reap takes the l3->list_lock (disabling interrupts) unconditionally and then does a few checks and maybe does some cleanup. This patch makes cache_reap() only take the lock if there is work to do and then the lock is taken and released for each cleaning action. The checking of when to do the next reaping is done without any locking and becomes racy. Should not matter since reaping can also be skipped if the slab mutex cannot be acquired. The same is true for the touched processing. If we get this wrong once in awhile then we will mistakenly clean or not clean the shared cache. This will impact performance slightly. Note that the additional drain_array() function introduced here will fall out in a subsequent patch since array cleaning will now be very similar from all callers. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 57 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ff0ab77..1845c01 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -292,13 +292,13 @@ struct kmem_list3 { struct list_head slabs_full; struct list_head slabs_free; unsigned long free_objects; - unsigned long next_reap; - int free_touched; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ spinlock_t list_lock; struct array_cache *shared; /* shared per node */ struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ }; /* @@ -3539,6 +3539,22 @@ static void drain_array_locked(struct kmem_cache *cachep, } } + +/* + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. + */ +static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3, + struct array_cache *ac) +{ + if (ac && ac->avail) { + spin_lock_irq(&l3->list_lock); + drain_array_locked(searchp, ac, 0, + numa_node_id()); + spin_unlock_irq(&l3->list_lock); + } +} + /** * cache_reap - Reclaim memory from caches. * @unused: unused parameter @@ -3572,33 +3588,48 @@ static void cache_reap(void *unused) searchp = list_entry(walk, struct kmem_cache, next); check_irq_on(); + /* + * We only take the l3 lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ l3 = searchp->nodelists[numa_node_id()]; + reap_alien(searchp, l3); - spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, cpu_cache_get(searchp), 0, - numa_node_id()); + drain_array(searchp, l3, cpu_cache_get(searchp)); + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. + */ if (time_after(l3->next_reap, jiffies)) - goto next_unlock; + goto next; l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - if (l3->shared) - drain_array_locked(searchp, l3->shared, 0, - numa_node_id()); + drain_array(searchp, l3, l3->shared); if (l3->free_touched) { l3->free_touched = 0; - goto next_unlock; + goto next; } tofree = (l3->free_limit + 5 * searchp->num - 1) / (5 * searchp->num); do { + /* + * Do not lock if there are no free blocks. + */ + if (list_empty(&l3->slabs_free)) + break; + + spin_lock_irq(&l3->list_lock); p = l3->slabs_free.next; - if (p == &(l3->slabs_free)) + if (p == &(l3->slabs_free)) { + spin_unlock_irq(&l3->list_lock); break; + } slabp = list_entry(p, struct slab, list); BUG_ON(slabp->inuse); @@ -3613,10 +3644,8 @@ static void cache_reap(void *unused) l3->free_objects -= searchp->num; spin_unlock_irq(&l3->list_lock); slab_destroy(searchp, slabp); - spin_lock_irq(&l3->list_lock); } while (--tofree > 0); -next_unlock: - spin_unlock_irq(&l3->list_lock); +next: cond_resched(); } check_irq_on(); -- cgit v1.1 From aab2207cf8d9c343b6b5f0e4d27e1732f8618d14 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:06 -0800 Subject: [PATCH] slab: make drain_array more universal by adding more parameters And a parameter to drain_array to control the freeing of all objects and then use drain_array() to replace instances of drain_array_locked with drain_array. Doing so will avoid taking locks in those locations if the arrays are empty. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1845c01..d73b38e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2126,6 +2126,10 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, int force, int node); +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, + int force, int node); + static void do_drain(void *arg) { struct kmem_cache *cachep = arg; @@ -2150,9 +2154,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) for_each_online_node(node) { l3 = cachep->nodelists[node]; if (l3) { - spin_lock_irq(&l3->list_lock); - drain_array_locked(cachep, l3->shared, 1, node); - spin_unlock_irq(&l3->list_lock); + drain_array(cachep, l3, l3->shared, 1, node); if (l3->alien) drain_alien_cache(cachep, l3->alien); } @@ -3545,12 +3547,11 @@ static void drain_array_locked(struct kmem_cache *cachep, * necessary. */ static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3, - struct array_cache *ac) + struct array_cache *ac, int force, int node) { if (ac && ac->avail) { spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, ac, 0, - numa_node_id()); + drain_array_locked(searchp, ac, force, node); spin_unlock_irq(&l3->list_lock); } } @@ -3571,6 +3572,7 @@ static void cache_reap(void *unused) { struct list_head *walk; struct kmem_list3 *l3; + int node = numa_node_id(); if (!mutex_trylock(&cache_chain_mutex)) { /* Give up. Setup the next iteration. */ @@ -3593,11 +3595,11 @@ static void cache_reap(void *unused) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - l3 = searchp->nodelists[numa_node_id()]; + l3 = searchp->nodelists[node]; reap_alien(searchp, l3); - drain_array(searchp, l3, cpu_cache_get(searchp)); + drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); /* * These are racy checks but it does not matter @@ -3608,7 +3610,7 @@ static void cache_reap(void *unused) l3->next_reap = jiffies + REAPTIMEOUT_LIST3; - drain_array(searchp, l3, l3->shared); + drain_array(searchp, l3, l3->shared, 0, node); if (l3->free_touched) { l3->free_touched = 0; -- cgit v1.1 From 1b55253a7f95adc82eb20937b57b3e3e32ba65df Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:07 -0800 Subject: [PATCH] slab: remove drain_array_locked Remove drain_array_locked and use that opportunity to limit the time the l3 lock is taken further. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index d73b38e..3274144 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2123,9 +2123,6 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array_locked(struct kmem_cache *cachep, - struct array_cache *ac, int force, int node); - static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node); @@ -3522,40 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep) cachep->name, -err); } -static void drain_array_locked(struct kmem_cache *cachep, - struct array_cache *ac, int force, int node) +/* + * Drain an array if it contains any elements taking the l3 lock only if + * necessary. + */ +void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, + struct array_cache *ac, int force, int node) { int tofree; - check_spinlock_acquired_node(cachep, node); + if (!ac || !ac->avail) + return; if (ac->touched && !force) { ac->touched = 0; } else if (ac->avail) { tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; + spin_lock_irq(&l3->list_lock); free_block(cachep, ac->entry, tofree, node); + spin_unlock_irq(&l3->list_lock); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } } - -/* - * Drain an array if it contains any elements taking the l3 lock only if - * necessary. - */ -static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3, - struct array_cache *ac, int force, int node) -{ - if (ac && ac->avail) { - spin_lock_irq(&l3->list_lock); - drain_array_locked(searchp, ac, force, node); - spin_unlock_irq(&l3->list_lock); - } -} - /** * cache_reap - Reclaim memory from caches. * @unused: unused parameter -- cgit v1.1 From b18e7e654d7af741d2bf34a90dc34128d0217fea Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:07 -0800 Subject: [PATCH] slab: fix drain_array() so that it works correctly with the shared_array The list_lock also protects the shared array and we call drain_array() with the shared array. Therefore we cannot go as far as I wanted to but have to take the lock in a way so that it also protects the array_cache in drain_pages. (Note: maybe we should make the array_cache locking more consistent? I.e. always take the array cache lock for shared arrays and disable interrupts for the per cpu arrays?) Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3274144..6b691ecb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3521,7 +3521,8 @@ static void enable_cpucache(struct kmem_cache *cachep) /* * Drain an array if it contains any elements taking the l3 lock only if - * necessary. + * necessary. Note that the l3 listlock also protects the array_cache + * if drain_array() is used on the shared array. */ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node) @@ -3532,16 +3533,18 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, return; if (ac->touched && !force) { ac->touched = 0; - } else if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; + } else { spin_lock_irq(&l3->list_lock); - free_block(cachep, ac->entry, tofree, node); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + free_block(cachep, ac->entry, tofree, node); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); + } spin_unlock_irq(&l3->list_lock); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); } } -- cgit v1.1 From 879336c3930ae9273ea1c45214cb8adae0ce494a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:08 -0800 Subject: [PATCH] drain_node_pages: interrupt latency reduction / optimization 1. Only disable interrupts if there is actually something to free 2. Only dirty the pcp cacheline if we actually freed something. 3. Disable interrupts for each single pcp and not for cleaning all the pcps in all zones of a node. drain_node_pages is called every 2 seconds from cache_reap. This fix should avoid most disabling of interrupts. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed91684..b7f14a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -603,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * Called from the slab reaper to drain pagesets on a particular node that * belong to the currently executing processor. + * Note that this function must be called with the thread pinned to + * a single processor. */ void drain_node_pages(int nodeid) { int i, z; unsigned long flags; - local_irq_save(flags); for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; @@ -619,11 +620,14 @@ void drain_node_pages(int nodeid) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - free_pages_bulk(zone, pcp->count, &pcp->list, 0); - pcp->count = 0; + if (pcp->count) { + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } } } - local_irq_restore(flags); } #endif -- cgit v1.1 From 9b65ef59d42a56fa1358958ede77aaa5bac385a8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 22 Mar 2006 00:09:09 -0800 Subject: [PATCH] fix swap cluster offset When we've allocated SWAPFILE_CLUSTER pages, ->cluster_next should be the first index of swap cluster. But current code probably sets it wrong offset. Signed-off-by: Akinobu Mita Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d..365ed6f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER-1; + si->cluster_next = offset-SWAPFILE_CLUSTER+1; goto cluster; } if (unlikely(--latency_ration < 0)) { -- cgit v1.1 From fdb7cc59084ba7eef935e4e40aaaf538ee34c625 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 22 Mar 2006 00:09:10 -0800 Subject: [PATCH] mm: hugetlb alloc_fresh_huge_page bogus node loop fix Fix bogus node loop in hugetlb.c alloc_fresh_huge_page(), which was assuming that nodes are numbered contiguously from 0 to num_online_nodes(). Once the hotplug folks get this far, that will be false. Signed-off-by: Paul Jackson Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06699d8..ebad6bb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -105,7 +105,9 @@ static int alloc_fresh_huge_page(void) struct page *page; page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); - nid = (nid + 1) % num_online_nodes(); + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); if (page) { page[1].lru.next = (void *)free_huge_page; /* dtor */ spin_lock(&hugetlb_lock); -- cgit v1.1 From 442295c94bf650221af3ef20fc68fa3e93876818 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 22 Mar 2006 00:09:11 -0800 Subject: [PATCH] mm: slab cache interleave rotor fix The alien cache rotor in mm/slab.c assumes that the first online node is node 0. Eventually for some archs, especially with hotplug, this will no longer be true. Fix the interleave rotor to handle the general case of node numbering. Signed-off-by: Paul Jackson Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6b691ecb..1c8f5ee 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -830,7 +830,7 @@ static void init_reap_node(int cpu) node = next_node(cpu_to_node(cpu), node_online_map); if (node == MAX_NUMNODES) - node = 0; + node = first_node(node_online_map); __get_cpu_var(reap_node) = node; } -- cgit v1.1 From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:12 -0800 Subject: [PATCH] page migration reorg Centralize the page migration functions in anticipation of additional tinkering. Creates a new file mm/migrate.c 1. Extract buffer_migrate_page() from fs/buffer.c 2. Extract central migration code from vmscan.c 3. Extract some components from mempolicy.c 4. Export pageout() and remove_from_swap() from vmscan.c 5. Make it possible to configure NUMA systems without page migration and non-NUMA systems with page migration. I had to so some #ifdeffing in mempolicy.c that may need a cleanup. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 6 + mm/Makefile | 2 + mm/mempolicy.c | 113 ++-------- mm/migrate.c | 655 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 + mm/vmscan.c | 491 +----------------------------------------- 6 files changed, 689 insertions(+), 579 deletions(-) create mode 100644 mm/migrate.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80a..bd80460 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS # support for page migration # config MIGRATION + bool "Page migration" def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM depends on SWAP + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa..f10c753 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o + diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96195dc..e93cc74 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include @@ -95,9 +96,6 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -/* The number of pages to migrate per call to migrate_pages() */ -#define MIGRATE_CHUNK_SIZE 256 - static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct vm_area_struct *first, *vma, *prev; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return ERR_PTR(-ENODEV); - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); + err = migrate_prep(); + if (err) + return ERR_PTR(err); } first = find_vma(mm, start); @@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (isolate_lru_page(page)) - list_add_tail(&page->lru, pagelist); - } -} - -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -static int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); - } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); } /* @@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) return err; return busy; + } +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} +#endif + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } + if (!list_empty(&pagelist)) putback_lru_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 0000000..09f6e4a --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,655 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +#include +#include +#include +#include +#include /* for try_to_release_page(), + buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#include "internal.h" + +/* The maximum number of pages to take off the LRU for migration */ +#define MIGRATE_CHUNK_SIZE 256 + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * Isolate one page from the LRU lists. If successful put it onto + * the indicated list with elevated page count. + * + * Result: + * -EBUSY: page not on LRU list + * 0: page removed from LRU list and added to the specified list. + */ +int isolate_lru_page(struct page *page, struct list_head *pagelist) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 0; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + list_add_tail(&page->lru, pagelist); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * migrate_prep() needs to be called after we have compiled the list of pages + * to be migrated using isolate_lru_page() but before we begin a series of calls + * to migrate_pages(). + */ +int migrate_prep(void) +{ + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return -ENODEV; + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + int retry; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. + */ + rc = -EAGAIN; + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto next; + + /* + * Only wait on writeback if we have already done a pass where + * we we may have triggered writeouts for lots of pages. + */ + if (pass > 0) { + wait_on_page_writeback(page); + } else { + if (PageWriteback(page)) + goto unlock_page; + } + + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. + */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { + rc = -ENOMEM; + goto unlock_page; + } + } + + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + + /* + * Pages are properly locked and writeback is complete. + * Try to migrate the page. + */ + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). + */ + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } + list_move(&page->lru, moved); + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + return nr_failed + retry; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Migrate the list 'pagelist' of pages to a certain destination. + * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) +{ + LIST_HEAD(newlist); + LIST_HEAD(moved); + LIST_HEAD(failed); + int err = 0; + unsigned long offset = 0; + int nr_pages; + struct page *page; + struct list_head *p; + +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); + + if (!page) { + err = -ENOMEM; + goto out; + } + list_add_tail(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE) + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3..d7af296 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include #include #include +#include #include diff --git a/mm/vmscan.c b/mm/vmscan.c index 548e023..fd572bb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -42,18 +42,6 @@ #include "internal.h" -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping, * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -570,481 +558,6 @@ keep: return nr_reclaimed; } -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. - */ -unsigned long putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - unsigned long count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro - * Hirokazu Takahashi - * Dave Hansen - * Christoph Lameter - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. - */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. - */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); - return 0; -} -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. - */ -unsigned long migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - unsigned long retry; - unsigned long nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - ret = 1; - get_page(page); - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; -} -#endif - /* * zone->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages -- cgit v1.1 From f577eb30afdc68233f25d4d82b04102129262365 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 Mar 2006 02:59:59 -0800 Subject: [PATCH] swsusp: low level interface Introduce the low level interface that can be used for handling the snapshot of the system memory by the in-kernel swap-writing/reading code of swsusp and the userland interface code (to be introduced shortly). Also change the way in which swsusp records the allocated swap pages and, consequently, simplifies the in-kernel swap-writing/reading code (this is necessary for the userland interface too). To this end, it introduces two helper functions in mm/swapfile.c, so that the swsusp code does not refer directly to the swap internals. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 365ed6f..4d11f9d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; struct swap_list_t swap_list = {-1, -1}; -struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -417,6 +417,59 @@ void free_swap_and_cache(swp_entry_t entry) } } +#ifdef CONFIG_SOFTWARE_SUSPEND +/* + * Find the swap type that corresponds to given device (if any) + * + * This is needed for software suspend and is done in such a way that inode + * aliasing is allowed. + */ +int swap_type_of(dev_t device) +{ + int i; + + if (!device) + return -EINVAL; + spin_lock(&swap_lock); + for (i = 0; i < nr_swapfiles; i++) { + struct inode *inode; + + if (!(swap_info[i].flags & SWP_WRITEOK)) + continue; + inode = swap_info->swap_file->f_dentry->d_inode; + if (S_ISBLK(inode->i_mode) && + device == MKDEV(imajor(inode), iminor(inode))) { + spin_unlock(&swap_lock); + return i; + } + } + spin_unlock(&swap_lock); + return -ENODEV; +} + +/* + * Return either the total number of swap pages of given type, or the number + * of free pages of that type (depending on @free) + * + * This is needed for software suspend + */ +unsigned int count_swap_pages(int type, int free) +{ + unsigned int n = 0; + + if (type < nr_swapfiles) { + spin_lock(&swap_lock); + if (swap_info[type].flags & SWP_WRITEOK) { + n = swap_info[type].pages; + if (free) + n -= swap_info[type].inuse_pages; + } + spin_unlock(&swap_lock); + } + return n; +} +#endif + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to -- cgit v1.1 From 6e1819d615f24ce0726a7d0bd3dd0152d7b21654 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 23 Mar 2006 03:00:03 -0800 Subject: [PATCH] swsusp: userland interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a user space interface for swsusp. The interface is based on a special character device, called the snapshot device, that allows user space processes to perform suspend and resume-related operations with the help of some ioctls and the read()/write() functions.  Additionally it allows these processes to allocate free swap pages from a selected swap partition, called the resume partition, so that they know which sectors of the resume partition are available to them. The interface uses the same low-level system memory snapshot-handling functions that are used by the built-it swap-writing/reading code of swsusp. The interface documentation is included in the patch. The patch assumes that the major and minor numbers of the snapshot device will be 10 (ie. misc device) and 231, the registration of which has already been requested. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 4d11f9d..39aa9d1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -428,14 +428,16 @@ int swap_type_of(dev_t device) { int i; - if (!device) - return -EINVAL; spin_lock(&swap_lock); for (i = 0; i < nr_swapfiles; i++) { struct inode *inode; if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { + spin_unlock(&swap_lock); + return i; + } inode = swap_info->swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { -- cgit v1.1 From d8733c2956968a01394a4d2a9e97a8b431a78776 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 23 Mar 2006 03:00:11 -0800 Subject: [PATCH] ext3_readdir: use generic readahead Linus points out that ext3_readdir's readahead only cuts in when ext3_readdir() is operating at the very start of the directory. So for large directories we end up performing no readahead at all and we suck. So take it all out and use the core VM's page_cache_readahead(). This means that ext3 directory reads will use all of readahead's dynamic sizing goop. Note that we're using the directory's filp->f_ra to hold the readahead state, but readahead is actually being performed against the underlying blockdev's address_space. Fortunately the readahead code is all set up to handle this. Tested with printk. It works. I was struggling to find a real workload which actually cared. (The patch also exports page_cache_readahead() to GPL modules) Cc: "Stephen C. Tweedie" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 301b36c..0f142a4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -555,6 +555,7 @@ recheck: out: return ra->prev_page + 1; } +EXPORT_SYMBOL_GPL(page_cache_readahead); /* * handle_ra_miss() is called when it is known that a page which should have -- cgit v1.1 From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 23 Mar 2006 20:00:26 +0100 Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23 Signed-off-by: Jens Axboe --- mm/highmem.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index ce2e7e8..d0ea1ee 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,6 +26,7 @@ #include #include #include +#include #include static mempool_t *page_pool, *isa_page_pool; @@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) pool = isa_page_pool; } + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + /* * slow path */ -- cgit v1.1 From f6ef943813ac3085ece7252ea101d663581219f6 Mon Sep 17 00:00:00 2001 From: Bart Samwel Date: Fri, 24 Mar 2006 03:15:48 -0800 Subject: [PATCH] Represent dirty_*_centisecs as jiffies internally Make that the internal values for: /proc/sys/vm/dirty_writeback_centisecs /proc/sys/vm/dirty_expire_centisecs are stored as jiffies instead of centiseconds. Let the sysctl interface do the conversions with full precision using clock_t_to_jiffies, instead of doing overflow-sensitive on-the-fly conversions every time the values are used. Cons: apparent precision loss if HZ is not a multiple of 100, because of conversion back and forth. This is a common problem for all sysctl values that use proc_dointvec_userhz_jiffies. (There is only one other in-tree use, in net/core/neighbour.c.) Signed-off-by: Bart Samwel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 945559f..e7910799 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -75,12 +75,12 @@ int vm_dirty_ratio = 40; * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_interval = 5 * HZ; /* * The longest number of centiseconds for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_interval = 30 * HZ; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -380,8 +380,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per dirty_writeback_centisecs. But if a writeback event - * takes longer than a dirty_writeback_centisecs interval, then leave a + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * older_than_this takes precedence over nr_to_write. So we'll only write back @@ -406,9 +406,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); get_writeback_state(&wbs); - oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; + oldest_jif = jiffies - dirty_expire_interval; start_jif = jiffies; - next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; + next_jif = start_jif + dirty_writeback_interval; nr_to_write = wbs.nr_dirty + wbs.nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { @@ -425,7 +425,7 @@ static void wb_kupdate(unsigned long arg) } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; - if (dirty_writeback_centisecs) + if (dirty_writeback_interval) mod_timer(&wb_timer, next_jif); } @@ -435,11 +435,11 @@ static void wb_kupdate(unsigned long arg) int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_centisecs) { + proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); + if (dirty_writeback_interval) { mod_timer(&wb_timer, - jiffies + (dirty_writeback_centisecs * HZ) / 100); - } else { + jiffies + dirty_writeback_interval); + } else { del_timer(&wb_timer); } return 0; @@ -544,7 +544,7 @@ void __init page_writeback_init(void) if (vm_dirty_ratio <= 0) vm_dirty_ratio = 1; } - mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); + mod_timer(&wb_timer, jiffies + dirty_writeback_interval); set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } -- cgit v1.1 From ed5b43f15a8e86e3ae939b98bc161ee973ecedf2 Mon Sep 17 00:00:00 2001 From: Bart Samwel Date: Fri, 24 Mar 2006 03:15:49 -0800 Subject: [PATCH] Represent laptop_mode as jiffies internally Make that the internal value for /proc/sys/vm/laptop_mode is stored as jiffies instead of seconds. Let the sysctl interface do the conversions, instead of doing on-the-fly conversions every time the value is used. Add a description of the fact that laptop_mode doubles as a flag and a timeout to the comment above the laptop_mode variable. Signed-off-by: Bart Samwel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e7910799..c1052ee 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -88,7 +88,8 @@ int dirty_expire_interval = 30 * HZ; int block_dump; /* - * Flag that puts the machine in "laptop mode". + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. */ int laptop_mode; @@ -468,7 +469,7 @@ static void laptop_timer_fn(unsigned long unused) */ void laptop_io_completion(void) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); + mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); } /* -- cgit v1.1 From 0b1303fcf23678ee1785841fb0c770a35cd0833c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 24 Mar 2006 03:15:59 -0800 Subject: [PATCH] cpusets: only wakeup kswapd for zones in the current cpuset If we get under some memory pressure in a cpuset (we only scan zones that are in the cpuset for memory) then kswapd is woken up for all zones. This patch only wakes up kswapd in zones that are part of the current cpuset. Signed-off-by: Christoph Lameter Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7f14a4..a5c3f8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -943,7 +943,8 @@ restart: goto got_pg; do { - wakeup_kswapd(*z, order); + if (cpuset_zone_allowed(*z, gfp_mask)) + wakeup_kswapd(*z, order); } while (*(++z)); /* -- cgit v1.1 From 44110fe385af23ca5eee8a6ad4ff55d50339097a Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 24 Mar 2006 03:16:04 -0800 Subject: [PATCH] cpuset memory spread page cache implementation and hooks Change the page cache allocation calls to support cpuset memory spreading. See the previous patch, cpuset_mem_spread, for an explanation of cpuset memory spreading. On systems without cpusets configured in the kernel, this is no change. On systems with cpusets configured in the kernel, but the "memory_spread" cpuset option not enabled for the current tasks cpuset, this adds a call to a cpuset routine and failed bit test of the processor state flag PF_SPREAD_PAGE. On tasks in cpusets with "memory_spread" enabled, this adds a call to a cpuset routine that computes which of the tasks mems_allowed nodes should be preferred for this allocation. If memory spreading applies to a particular allocation, then any other NUMA mempolicy does not apply. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index e8f58f7..d4ff48e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "filemap.h" #include "internal.h" @@ -427,6 +428,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, return ret; } +#ifdef CONFIG_NUMA +struct page *page_cache_alloc(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x), 0); + } + return alloc_pages(mapping_gfp_mask(x), 0); +} +EXPORT_SYMBOL(page_cache_alloc); + +struct page *page_cache_alloc_cold(struct address_space *x) +{ + if (cpuset_do_page_mem_spread()) { + int n = cpuset_mem_spread_node(); + return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); + } + return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); +} +EXPORT_SYMBOL(page_cache_alloc_cold); +#endif + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of -- cgit v1.1 From 101a50019ae5e370d73984ee05d56dd3b08f330a Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 24 Mar 2006 03:16:07 -0800 Subject: [PATCH] cpuset memory spread slab cache implementation Provide the slab cache infrastructure to support cpuset memory spreading. See the previous patches, cpuset_mem_spread, for an explanation of cpuset memory spreading. This patch provides a slab cache SLAB_MEM_SPREAD flag. If set in the kmem_cache_create() call defining a slab cache, then any task marked with the process state flag PF_MEMSPREAD will spread memory page allocations for that cache over all the allowed nodes, instead of preferring the local (faulting) node. On systems not configured with CONFIG_NUMA, this results in no change to the page allocation code path for slab caches. On systems with cpusets configured in the kernel, but the "memory_spread" cpuset option not enabled for the current tasks cpuset, this adds a call to a cpuset routine and failed bit test of the processor state flag PF_SPREAD_SLAB. For tasks so marked, a second inline test is done for the slab cache flag SLAB_MEM_SPREAD, and if that is set and if the allocation is not in_interrupt(), this adds a call to to a cpuset routine that computes which of the tasks mems_allowed nodes should be preferred for this allocation. ==> This patch adds another hook into the performance critical code path to allocating objects from the slab cache, in the ____cache_alloc() chunk, below. The next patch optimizes this hook, reducing the impact of the combined mempolicy plus memory spreading hooks on this critical code path to a single check against the tasks task_struct flags word. This patch provides the generic slab flags and logic needed to apply memory spreading to a particular slab. A subsequent patch will mark a few specific slab caches for this placement policy. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 1c8f5ee..de51665 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -173,12 +174,12 @@ SLAB_CACHE_DMA | \ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU) + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif /* @@ -2813,6 +2814,14 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) if (nid != numa_node_id()) return __cache_alloc_node(cachep, flags, nid); } + if (unlikely(cpuset_do_slab_mem_spread() && + (cachep->flags & SLAB_MEM_SPREAD) && + !in_interrupt())) { + int nid = cpuset_mem_spread_node(); + + if (nid != numa_node_id()) + return __cache_alloc_node(cachep, flags, nid); + } #endif check_irq_off(); -- cgit v1.1 From c61afb181c649754ea221f104e268cbacfc993e3 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 24 Mar 2006 03:16:08 -0800 Subject: [PATCH] cpuset memory spread slab cache optimizations The hooks in the slab cache allocator code path for support of NUMA mempolicies and cpuset memory spreading are in an important code path. Many systems will use neither feature. This patch optimizes those hooks down to a single check of some bits in the current tasks task_struct flags. For non NUMA systems, this hook and related code is already ifdef'd out. The optimization is done by using another task flag, set if the task is using a non-default NUMA mempolicy. Taking this flag bit along with the PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset memory spreading' patch set, one can check for the combination of any of these special case memory placement mechanisms with a single test of the current tasks task_struct flags. This patch also tightens up the code, to save a few bytes of kernel text space, and moves some of it out of line. Due to the nested inlines called from multiple places, we were ending up with three copies of this code, which once we get off the main code path (for local node allocation) seems a bit wasteful of instruction memory. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 32 ++++++++++++++++++++++++++++++++ mm/slab.c | 41 ++++++++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e93cc74..4f71cfd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } + +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ + +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} + +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); +} + /* Set the process memory policy */ long do_set_mempolicy(int mode, nodemask_t *nodes) { @@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; diff --git a/mm/slab.c b/mm/slab.c index de51665..f80b523 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, #ifdef CONFIG_NUMA static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t); static struct array_cache **alloc_alien_cache(int node, int limit) { @@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; #ifdef CONFIG_NUMA - if (unlikely(current->mempolicy && !in_interrupt())) { - int nid = slab_node(current->mempolicy); - - if (nid != numa_node_id()) - return __cache_alloc_node(cachep, flags, nid); - } - if (unlikely(cpuset_do_slab_mem_spread() && - (cachep->flags & SLAB_MEM_SPREAD) && - !in_interrupt())) { - int nid = cpuset_mem_spread_node(); - - if (nid != numa_node_id()) - return __cache_alloc_node(cachep, flags, nid); + if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB | + PF_MEMPOLICY))) { + objp = alternate_node_alloc(cachep, flags); + if (objp != NULL) + return objp; } #endif @@ -2856,6 +2849,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, #ifdef CONFIG_NUMA /* + * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY. + * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt()) + return NULL; + nid_alloc = nid_here = numa_node_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_mem_spread_node(); + else if (current->mempolicy) + nid_alloc = slab_node(current->mempolicy); + if (nid_alloc != nid_here) + return __cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* * A interface to enable slab creation on nodeid */ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, -- cgit v1.1 From b2455396be35383c4eebc6745cc718b1dd9e23df Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 24 Mar 2006 03:16:12 -0800 Subject: [PATCH] cpuset: memory_spread_slab drop useless PF_SPREAD_PAGE check The hook in the slab cache allocation path to handle cpuset memory spreading for tasks in cpusets with 'memory_spread_slab' enabled has a modest performance bug. The hook calls into the memory spreading handler alternate_node_alloc() if either of 'memory_spread_slab' or 'memory_spread_page' is enabled, even though the handler does nothing (albeit harmlessly) for the page case Fix - drop PF_SPREAD_PAGE from the set of flag bits that are used to trigger a call to alternate_node_alloc(). The page case is handled by separate hooks -- see the calls conditioned on cpuset_do_page_mem_spread() in mm/filemap.c Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f80b523..26138c9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2809,8 +2809,7 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; #ifdef CONFIG_NUMA - if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB | - PF_MEMPOLICY))) { + if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { objp = alternate_node_alloc(cachep, flags); if (objp != NULL) return objp; @@ -2849,7 +2848,7 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. -- cgit v1.1 From 469eb4d03878b676418f853011ebfb54ccf83a5e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:17:45 -0800 Subject: [PATCH] filemap_fdatawrite_range() api: clarify -end parameter I had trouble understanding working out whether filemap_fdatawrite_range()'s `end' parameter describes the last-byte-to-be-written or the last-plus-one. Clarify that in comments. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d4ff48e..c1b1708 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -175,7 +175,7 @@ static int sync_page(void *word) * dirty pages that lie within the byte offsets * @mapping: address space structure to write * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends + * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as @@ -368,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); +/* + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). + */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { -- cgit v1.1 From ebcf28e1c7a295f3321249dd235ad2e45938fdd9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:04 -0800 Subject: [PATCH] fadvise(): write commands Add two new linux-specific fadvise extensions(): LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file offsets `offset' and `offset+len'. Any pages which are currently under writeout are skipped, whether or not they are dirty. LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file offsets `offset' and `offset+len'. By combining these two operations the application may do several things: LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty pages at the disk. LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all of the currently dirty pages at the disk, wait until they have been written. It should be noted that none of these operations write out the file's metadata. So unless the application is strictly performing overwrites of already-instantiated disk blocks, there are no guarantees here that the data will be available after a crash. To complete this suite of operations I guess we should have a "sync file metadata only" operation. This gives applications access to all the building blocks needed for all sorts of sync operations. But sync-metadata doesn't fit well with the fadvise() interface. Probably it should be a new syscall: sys_fmetadatasync(). The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64(). It is made to represent that last affected byte in the file (ie: it is inclusive). Generally, all these byterange and pagerange functions are inclusive so we can easily represent EOF with -1. As Ulrich notes, these two functions are somewhat abusive of the fadvise() concept, which appears to be "set the future policy for this fd". But these commands are a perfect fit with the fadvise() impementation, and several of the existing fadvise() commands are synchronous and don't affect future policy either. I think we can live with the slight incongruity. Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 46 +++++++++++++++++++++++++++++++++++++++++----- mm/filemap.c | 10 +++++----- 2 files changed, 46 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index d257c89..907c392 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -22,13 +23,36 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. + * + * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file + * offsets `offset' and `offset+len' inclusive. Any pages which are currently + * under writeout are skipped, whether or not they are dirty. + * + * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file + * offsets `offset' and `offset+len'. + * + * By combining these two operations the application may do several things: + * + * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently + * dirty pages at the disk. + * + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push + * all of the currently dirty pages at the disk, wait until they have been + * written. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; struct backing_dev_info *bdi; - loff_t endbyte; + loff_t endbyte; /* inclusive */ pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; @@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) endbyte = offset + len; if (!len || endbyte < len) endbyte = -1; + else + endbyte--; /* inclusive */ bdi = mapping->backing_dev_info; @@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* First and last PARTIAL page! */ start_index = offset >> PAGE_CACHE_SHIFT; - end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; + end_index = endbyte >> PAGE_CACHE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; @@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) filemap_flush(mapping); /* First and last FULL page! */ - start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index > start_index) - invalidate_mapping_pages(mapping, start_index, end_index-1); + if (end_index >= start_index) + invalidate_mapping_pages(mapping, start_index, + end_index); + break; + case LINUX_FADV_ASYNC_WRITE: + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); + break; + case LINUX_FADV_WRITE_WAIT: + ret = wait_on_page_writeback_range(mapping, + offset >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); break; default: ret = -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index c1b1708..3ef2073 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -183,8 +183,8 @@ static int sync_page(void *word) * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. */ -static int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode) +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) { int ret; struct writeback_control wbc = { @@ -213,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end) +static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } @@ -233,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush); * Wait for writeback to complete against pages indexed by start->end * inclusive */ -static int wait_on_page_writeback_range(struct address_space *mapping, +int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; -- cgit v1.1 From fa5a734e406b53761fcc5ee22366006f71112c2d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:10 -0800 Subject: [PATCH] balance_dirty_pages_ratelimited: take nr_pages arg Modify balance_dirty_pages_ratelimited() so that it can take a number-of-pages-which-I-just-dirtied argument. For msync(). Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c1052ee..c67ddc4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -256,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping) } /** - * balance_dirty_pages_ratelimited - balance dirty memory state + * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied + * @nr_pages: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -268,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(int, ratelimits) = 0; - long ratelimit; + static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; + unsigned long ratelimit; + unsigned long *p; ratelimit = ratelimit_pages; if (dirty_exceeded) @@ -281,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * Check the rate limiting. Also, we do not want to throttle real-time * tasks in balance_dirty_pages(). Period. */ - if (get_cpu_var(ratelimits)++ >= ratelimit) { - __get_cpu_var(ratelimits) = 0; - put_cpu_var(ratelimits); + preempt_disable(); + p = &__get_cpu_var(ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + *p = 0; + preempt_enable(); balance_dirty_pages(mapping); return; } - put_cpu_var(ratelimits); + preempt_enable(); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(void) { -- cgit v1.1 From 4741c9fd36b3bcadd37238321c469049da94a4b9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:11 -0800 Subject: [PATCH] set_page_dirty() return value fixes We need set_page_dirty() to return true if it actually transitioned the page from a clean to dirty state. This wasn't right in a couple of places. Do a kernel-wide audit, fix things up. This leaves open the possibility of returning a negative errno from set_page_dirty() sometime in the future. But we don't do that at present. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c67ddc4..893d767 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -628,8 +628,6 @@ EXPORT_SYMBOL(write_one_page); */ int __set_page_dirty_nobuffers(struct page *page) { - int ret = 0; - if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; @@ -651,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page) I_DIRTY_PAGES); } } + return 1; } - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -682,8 +681,10 @@ int fastcall set_page_dirty(struct page *page) return (*spd)(page); return __set_page_dirty_buffers(page); } - if (!PageDirty(page)) - SetPageDirty(page); + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } return 0; } EXPORT_SYMBOL(set_page_dirty); -- cgit v1.1 From 9c50823eebf7c256b92b4e0f02b5fb30e97788c2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:12 -0800 Subject: [PATCH] msync(): perform dirty page levelling It seems sensible to perform dirty page throttling in msync: as the application dirties pages we can kick off pdflush early, or even force the msync() caller to perform writeout, or even throttle the msync() caller. The main effect of this is to start disk writeback earlier if we've just discovered that a large amount of pagecache has been dirtied. (Otherwise it wouldn't happen for up to five seconds, next time pdflush wakes up). It also will cause the page-dirtying process to get panalised for dirtying those pages rather than whacking someone else with the problem. We should do this for munmap() and possibly even exit(), too. We drop the mmap_sem while performing the dirty page balancing. It doesn't seem right to hold mmap_sem for that long. Note that this patch only affects MS_ASYNC. MS_SYNC will be syncing all the dirty pages anyway. We note that msync(MS_SYNC) does a full-file-sync inside mmap_sem, and always has. We can fix that up... The patch also tightens up the mmap_sem coverage in sys_msync(): no point in taking it while we perform the incoming arg checking. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 93 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index 3563a56..8a66f5d 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -12,17 +12,20 @@ #include #include #include +#include +#include #include #include #include -static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; spinlock_t *ptl; int progress = 0; + unsigned long ret = 0; again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -43,58 +46,64 @@ again: if (!page) continue; if (ptep_clear_flush_dirty(vma, addr, pte) || - page_test_and_clear_dirty(page)) - set_page_dirty(page); + page_test_and_clear_dirty(page)) + ret += set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); cond_resched(); if (addr != end) goto again; + return ret; } -static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; + unsigned long ret = 0; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - msync_pte_range(vma, pmd, addr, next); + ret += msync_pte_range(vma, pmd, addr, next); } while (pmd++, addr = next, addr != end); + return ret; } -static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end) +static inline unsigned long msync_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; + unsigned long ret = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - msync_pmd_range(vma, pud, addr, next); + ret += msync_pmd_range(vma, pud, addr, next); } while (pud++, addr = next, addr != end); + return ret; } -static void msync_page_range(struct vm_area_struct *vma, +static unsigned long msync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; + unsigned long ret = 0; /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). */ if (vma->vm_flags & VM_HUGETLB) - return; + return 0; BUG_ON(addr >= end); pgd = pgd_offset(vma->vm_mm, addr); @@ -103,8 +112,9 @@ static void msync_page_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - msync_pud_range(vma, pgd, addr, next); + ret += msync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); + return ret; } /* @@ -118,8 +128,9 @@ static void msync_page_range(struct vm_area_struct *vma, * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -static int msync_interval(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, int flags) +static int msync_interval(struct vm_area_struct *vma, unsigned long addr, + unsigned long end, int flags, + unsigned long *nr_pages_dirtied) { int ret = 0; struct file *file = vma->vm_file; @@ -128,7 +139,7 @@ static int msync_interval(struct vm_area_struct *vma, return -EBUSY; if (file && (vma->vm_flags & VM_SHARED)) { - msync_page_range(vma, addr, end); + *nr_pages_dirtied = msync_page_range(vma, addr, end); if (flags & MS_SYNC) { struct address_space *mapping = file->f_mapping; @@ -157,11 +168,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) unsigned long end; struct vm_area_struct *vma; int unmapped_error, error = -EINVAL; + int done = 0; - if (flags & MS_SYNC) - current->flags |= PF_SYNCWRITE; - - down_read(¤t->mm->mmap_sem); if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (start & ~PAGE_MASK) @@ -180,13 +188,19 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ + down_read(¤t->mm->mmap_sem); + if (flags & MS_SYNC) + current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); unmapped_error = 0; - for (;;) { + do { + unsigned long nr_pages_dirtied = 0; + struct file *file; + /* Still start < end. */ error = -ENOMEM; if (!vma) - goto out; + goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; @@ -195,22 +209,37 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) /* Here vma->vm_start <= start < vma->vm_end. */ if (end <= vma->vm_end) { if (start < end) { - error = msync_interval(vma, start, end, flags); + error = msync_interval(vma, start, end, flags, + &nr_pages_dirtied); if (error) - goto out; + goto out_unlock; } error = unmapped_error; - goto out; + done = 1; + } else { + /* Here vma->vm_start <= start < vma->vm_end < end. */ + error = msync_interval(vma, start, vma->vm_end, flags, + &nr_pages_dirtied); + if (error) + goto out_unlock; } - /* Here vma->vm_start <= start < vma->vm_end < end. */ - error = msync_interval(vma, start, vma->vm_end, flags); - if (error) - goto out; + file = vma->vm_file; start = vma->vm_end; - vma = vma->vm_next; - } -out: - up_read(¤t->mm->mmap_sem); + if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { + get_file(file); + up_read(¤t->mm->mmap_sem); + balance_dirty_pages_ratelimited_nr(file->f_mapping, + nr_pages_dirtied); + fput(file); + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + } else { + vma = vma->vm_next; + } + } while (!done); +out_unlock: current->flags &= ~PF_SYNCWRITE; + up_read(¤t->mm->mmap_sem); +out: return error; } -- cgit v1.1 From 707c21c848deeb0200ba3f07e4ba90e6dc419c2f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:13 -0800 Subject: [PATCH] msync(MS_SYNC): don't hold mmap_sem while syncing It seems bad to hold mmap_sem while performing synchronous disk I/O. Alter the msync(MS_SYNC) code so that the lock is released while we sync the file. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index 8a66f5d..ee9bd67 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -132,35 +132,14 @@ static int msync_interval(struct vm_area_struct *vma, unsigned long addr, unsigned long end, int flags, unsigned long *nr_pages_dirtied) { - int ret = 0; struct file *file = vma->vm_file; if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) return -EBUSY; - if (file && (vma->vm_flags & VM_SHARED)) { + if (file && (vma->vm_flags & VM_SHARED)) *nr_pages_dirtied = msync_page_range(vma, addr, end); - - if (flags & MS_SYNC) { - struct address_space *mapping = file->f_mapping; - int err; - - ret = filemap_fdatawrite(mapping); - if (file->f_op && file->f_op->fsync) { - /* - * We don't take i_mutex here because mmap_sem - * is already held. - */ - err = file->f_op->fsync(file,file->f_dentry,1); - if (err && !ret) - ret = err; - } - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; - } - } - return ret; + return 0; } asmlinkage long sys_msync(unsigned long start, size_t len, int flags) @@ -233,6 +212,30 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) fput(file); down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, start); + } else if ((flags & MS_SYNC) && file && + (vma->vm_flags & VM_SHARED)) { + struct address_space *mapping; + int err; + + get_file(file); + up_read(¤t->mm->mmap_sem); + mapping = file->f_mapping; + error = filemap_fdatawrite(mapping); + if (file->f_op && file->f_op->fsync) { + mutex_lock(&mapping->host->i_mutex); + err = file->f_op->fsync(file,file->f_dentry,1); + mutex_unlock(&mapping->host->i_mutex); + if (err && !error) + error = err; + } + err = filemap_fdatawait(mapping); + if (err && !error) + error = err; + fput(file); + down_read(¤t->mm->mmap_sem); + if (error) + goto out_unlock; + vma = find_vma(current->mm, start); } else { vma = vma->vm_next; } -- cgit v1.1 From 676758bdb7bfca8413a85203921746f446e237be Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:14 -0800 Subject: [PATCH] msync: fix return value msync() does a strange thing. Essentially: vma = find_vma(); for ( ; ; ) { if (!vma) return -ENOMEM; ... vma = vma->vm_next; } so an msync() request which starts within or before a valid VMA and which ends within or beyond the final VMA will incorrectly return -ENOMEM. Fix. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index ee9bd67..d6a50f3 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -146,7 +146,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; struct vm_area_struct *vma; - int unmapped_error, error = -EINVAL; + int unmapped_error = 0; + int error = -EINVAL; int done = 0; if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) @@ -171,15 +172,14 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) if (flags & MS_SYNC) current->flags |= PF_SYNCWRITE; vma = find_vma(current->mm, start); - unmapped_error = 0; + if (!vma) { + error = -ENOMEM; + goto out_unlock; + } do { unsigned long nr_pages_dirtied = 0; struct file *file; - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; @@ -239,7 +239,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) } else { vma = vma->vm_next; } - } while (!done); + } while (vma && !done); out_unlock: current->flags &= ~PF_SYNCWRITE; up_read(¤t->mm->mmap_sem); -- cgit v1.1 From 8f2e9f157a94f444dad974b088b853ac40785b02 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Mar 2006 03:18:15 -0800 Subject: [PATCH] msync(): use do_fsync() No need to duplicate all that code. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index d6a50f3..2672b8d 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include @@ -214,23 +215,9 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) vma = find_vma(current->mm, start); } else if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { - struct address_space *mapping; - int err; - get_file(file); up_read(¤t->mm->mmap_sem); - mapping = file->f_mapping; - error = filemap_fdatawrite(mapping); - if (file->f_op && file->f_op->fsync) { - mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file,file->f_dentry,1); - mutex_unlock(&mapping->host->i_mutex); - if (err && !error) - error = err; - } - err = filemap_fdatawait(mapping); - if (err && !error) - error = err; + error = do_fsync(file, 0); fput(file); down_read(¤t->mm->mmap_sem); if (error) -- cgit v1.1 From 96840aa00a031069a136ec4c55d0bdd09ac6d3a7 Mon Sep 17 00:00:00 2001 From: Davi Arnaut Date: Fri, 24 Mar 2006 03:18:42 -0800 Subject: [PATCH] strndup_user() This patch series creates a strndup_user() function to easy copying C strings from userspace. Also we avoid common pitfalls like userspace modifying the final \0 after the strlen_user(). Signed-off-by: Davi Arnaut Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 5f4bb59..49e29f7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,6 +1,8 @@ #include #include #include +#include +#include /** * kzalloc - allocate memory. The memory is set to zero. @@ -37,3 +39,38 @@ char *kstrdup(const char *s, gfp_t gfp) return buf; } EXPORT_SYMBOL(kstrdup); + +/* + * strndup_user - duplicate an existing string from user space + * + * @s: The string to duplicate + * @n: Maximum number of bytes to copy, including the trailing NUL. + */ +char *strndup_user(const char __user *s, long n) +{ + char *p; + long length; + + length = strnlen_user(s, n); + + if (!length) + return ERR_PTR(-EFAULT); + + if (length > n) + return ERR_PTR(-EINVAL); + + p = kmalloc(length, GFP_KERNEL); + + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, s, length)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + p[length - 1] = '\0'; + + return p; +} +EXPORT_SYMBOL(strndup_user); -- cgit v1.1 From 16538c40776b8be6b0f23966e08fdc7b8fff823f Mon Sep 17 00:00:00 2001 From: Amos Waterland Date: Fri, 24 Mar 2006 18:30:53 +0100 Subject: The comment describing how MS_ASYNC works in msync.c is confusing because of a typo. This patch just changes "my" to "by", which I believe was the original intent. Signed-off-by: Adrian Bunk --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index 2672b8d..bc6c953 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -126,7 +126,7 @@ static unsigned long msync_page_range(struct vm_area_struct *vma, * write out the dirty pages and wait on the writeout and check the result. * Or the application may run fadvise(FADV_DONTNEED) against the fd to start * async writeout immediately. - * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ static int msync_interval(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.1 From 871751e25d956ad24f129ca972b7851feaa61d53 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 Mar 2006 03:06:39 -0800 Subject: [PATCH] slab: implement /proc/slab_allocators Implement /proc/slab_allocators. It produces output like: idr_layer_cache: 80 idr_pre_get+0x33/0x4e buffer_head: 2555 alloc_buffer_head+0x20/0x75 mm_struct: 9 mm_alloc+0x1e/0x42 mm_struct: 20 dup_mm+0x36/0x370 vm_area_struct: 384 dup_mm+0x18f/0x370 vm_area_struct: 151 do_mmap_pgoff+0x2e0/0x7c3 vm_area_struct: 1 split_vma+0x5a/0x10e vm_area_struct: 11 do_brk+0x206/0x2e2 vm_area_struct: 2 copy_vma+0xda/0x142 vm_area_struct: 9 setup_arg_pages+0x99/0x214 fs_cache: 8 copy_fs_struct+0x21/0x133 fs_cache: 29 copy_process+0xf38/0x10e3 files_cache: 30 alloc_files+0x1b/0xcf signal_cache: 81 copy_process+0xbaa/0x10e3 sighand_cache: 77 copy_process+0xe65/0x10e3 sighand_cache: 1 de_thread+0x4d/0x5f8 anon_vma: 241 anon_vma_prepare+0xd9/0xf3 size-2048: 1 add_sect_attrs+0x5f/0x145 size-2048: 2 journal_init_revoke+0x99/0x302 size-2048: 2 journal_init_revoke+0x137/0x302 size-2048: 2 journal_init_inode+0xf9/0x1c4 Cc: Manfred Spraul Cc: Alexander Nyberg Cc: Pekka Enberg Cc: Christoph Lameter Cc: Ravikiran Thirumalai Signed-off-by: Al Viro DESC slab-leaks3-locking-fix EDESC From: Andrew Morton Update for slab-remove-cachep-spinlock.patch Cc: Al Viro Cc: Manfred Spraul Cc: Alexander Nyberg Cc: Pekka Enberg Cc: Christoph Lameter Cc: Ravikiran Thirumalai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- mm/util.c | 4 +- 2 files changed, 176 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 26138c9..a504716 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -204,7 +204,8 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* Max number of objs-per-slab for caches which use off-slab slabs. * Needed to avoid a possible looping condition in cache_grow(). @@ -2399,7 +2400,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, /* Verify that the slab belongs to the intended node */ WARN_ON(slabp->nodeid != nodeid); - if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { + if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); BUG(); @@ -2605,6 +2606,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, */ cachep->dtor(objp + obj_offset(cachep), cachep, 0); } +#ifdef CONFIG_DEBUG_SLAB_LEAK + slab_bufctl(slabp)[objnr] = BUFCTL_FREE; +#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2788,6 +2792,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } +#ifdef CONFIG_DEBUG_SLAB_LEAK + { + struct slab *slabp; + unsigned objnr; + + slabp = page_get_slab(virt_to_page(objp)); + objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; + slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; + } +#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) { unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; @@ -3220,22 +3234,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return __cache_alloc(cachep, flags, caller); } -#ifndef CONFIG_DEBUG_SLAB void *__kmalloc(size_t size, gfp_t flags) { +#ifndef CONFIG_DEBUG_SLAB return __do_kmalloc(size, flags, NULL); +#else + return __do_kmalloc(size, flags, __builtin_return_address(0)); +#endif } EXPORT_SYMBOL(__kmalloc); -#else - +#ifdef CONFIG_DEBUG_SLAB void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) { return __do_kmalloc(size, flags, caller); } EXPORT_SYMBOL(__kmalloc_track_caller); - #endif #ifdef CONFIG_SMP @@ -3899,6 +3914,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, res = count; return res; } + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void *leaks_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + mutex_lock(&cache_chain_mutex); + p = cache_chain.next; + while (n--) { + p = p->next; + if (p == &cache_chain) + return NULL; + } + return list_entry(p, struct kmem_cache, next); +} + +static inline int add_caller(unsigned long *n, unsigned long v) +{ + unsigned long *p; + int l; + if (!v) + return 1; + l = n[1]; + p = n + 2; + while (l) { + int i = l/2; + unsigned long *q = p + 2 * i; + if (*q == v) { + q[1]++; + return 1; + } + if (*q > v) { + l = i; + } else { + p = q + 2; + l -= i + 1; + } + } + if (++n[1] == n[0]) + return 0; + memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); + p[0] = v; + p[1] = 1; + return 1; +} + +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +{ + void *p; + int i; + if (n[0] == n[1]) + return; + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { + if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + return; + } +} + +static void show_symbol(struct seq_file *m, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char *modname; + const char *name; + unsigned long offset, size; + char namebuf[KSYM_NAME_LEN+1]; + + name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + + if (name) { + seq_printf(m, "%s+%#lx/%#lx", name, offset, size); + if (modname) + seq_printf(m, " [%s]", modname); + return; + } +#endif + seq_printf(m, "%p", (void *)address); +} + +static int leaks_show(struct seq_file *m, void *p) +{ + struct kmem_cache *cachep = p; + struct list_head *q; + struct slab *slabp; + struct kmem_list3 *l3; + const char *name; + unsigned long *n = m->private; + int node; + int i; + + if (!(cachep->flags & SLAB_STORE_USER)) + return 0; + if (!(cachep->flags & SLAB_RED_ZONE)) + return 0; + + /* OK, we can do it */ + + n[1] = 0; + + for_each_online_node(node) { + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + check_irq_on(); + spin_lock_irq(&l3->list_lock); + + list_for_each(q, &l3->slabs_full) { + slabp = list_entry(q, struct slab, list); + handle_slab(n, cachep, slabp); + } + list_for_each(q, &l3->slabs_partial) { + slabp = list_entry(q, struct slab, list); + handle_slab(n, cachep, slabp); + } + spin_unlock_irq(&l3->list_lock); + } + name = cachep->name; + if (n[0] == n[1]) { + /* Increase the buffer size */ + mutex_unlock(&cache_chain_mutex); + m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + if (!m->private) { + /* Too bad, we are really out */ + m->private = n; + mutex_lock(&cache_chain_mutex); + return -ENOMEM; + } + *(unsigned long *)m->private = n[0] * 2; + kfree(n); + mutex_lock(&cache_chain_mutex); + /* Now make sure this entry will be retried */ + m->count = m->size; + return 0; + } + for (i = 0; i < n[1]; i++) { + seq_printf(m, "%s: %lu ", name, n[2*i+3]); + show_symbol(m, n[2*i+2]); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations slabstats_op = { + .start = leaks_start, + .next = s_next, + .stop = s_stop, + .show = leaks_show, +}; +#endif #endif /** diff --git a/mm/util.c b/mm/util.c index 49e29f7..b68d3d7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -11,7 +11,7 @@ */ void *kzalloc(size_t size, gfp_t flags) { - void *ret = kmalloc(size, flags); + void *ret = ____kmalloc(size, flags); if (ret) memset(ret, 0, size); return ret; @@ -33,7 +33,7 @@ char *kstrdup(const char *s, gfp_t gfp) return NULL; len = strlen(s) + 1; - buf = kmalloc(len, gfp); + buf = ____kmalloc(len, gfp); if (buf) memcpy(buf, s, len); return buf; -- cgit v1.1 From a8c0f9a41f88da703ade33f9c1626a55c786e8bb Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 25 Mar 2006 03:06:42 -0800 Subject: [PATCH] slab: introduce kmem_cache_zalloc allocator Introduce a memory-zeroing variant of kmem_cache_alloc. The allocator already exits in XFS and there are potential users for it so this patch makes the allocator available for the general public. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 17 +++++++++++++++++ mm/slob.c | 10 ++++++++++ 2 files changed, 27 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a504716..6f5aeeb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3108,6 +3108,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** + * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * @cache: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache and set the allocated memory to zero. + * The flags are only relevant if the cache has no available objects. + */ +void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) +{ + void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); + if (ret) + memset(ret, 0, obj_size(cache)); + return ret; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + +/** * kmem_ptr_validate - check if an untrusted pointer might * be a slab entry. * @cachep: the cache we're checking against diff --git a/mm/slob.c b/mm/slob.c index a1f42bd..9bcc7e2 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) +{ + void *ret = kmem_cache_alloc(c, flags); + if (ret) + memset(ret, 0, c->size); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + void kmem_cache_free(struct kmem_cache *c, void *b) { if (c->dtor) -- cgit v1.1 From 40c07ae8daa659b8feb149c84731629386873c16 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 25 Mar 2006 03:06:43 -0800 Subject: [PATCH] slab: optimize constant-size kzalloc calls As suggested by Eric Dumazet, optimize kzalloc() calls that pass a compile-time constant size. Please note that the patch increases kernel text slightly (~200 bytes for defconfig on x86). Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index b68d3d7..73684792 100644 --- a/mm/util.c +++ b/mm/util.c @@ -5,18 +5,18 @@ #include /** - * kzalloc - allocate memory. The memory is set to zero. + * __kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. */ -void *kzalloc(size_t size, gfp_t flags) +void *__kzalloc(size_t size, gfp_t flags) { void *ret = ____kmalloc(size, flags); if (ret) memset(ret, 0, size); return ret; } -EXPORT_SYMBOL(kzalloc); +EXPORT_SYMBOL(__kzalloc); /* * kstrdup - allocate space for and copy an existing string -- cgit v1.1 From c5e3b83e97be4e09961c0af101644643e5d03d17 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 25 Mar 2006 03:06:43 -0800 Subject: [PATCH] mm: use kmem_cache_zalloc Convert mm/ to use the new kmem_cache_zalloc allocator. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 ++---- mm/slab.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 0eb9894..4f5b570 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1040,12 +1040,11 @@ munmap_back: * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { error = -ENOMEM; goto unacct_error; } - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_start = addr; @@ -1896,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_start = addr; diff --git a/mm/slab.c b/mm/slab.c index 6f5aeeb..6a3760e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1990,10 +1990,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, align = ralign; /* Get cache's description obj. */ - cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); + cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto oops; - memset(cachep, 0, sizeof(struct kmem_cache)); #if DEBUG cachep->obj_size = size; -- cgit v1.1 From 3ded175a4b7a4548f3358dcf5f3ad65f63cdb4ed Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 25 Mar 2006 03:06:44 -0800 Subject: [PATCH] slab: add transfer_objects() function slabr_objects() can be used to transfer objects between various object caches of the slab allocator. It is currently only used during __cache_alloc() to retrieve elements from the shared array. We will be using it soon to transfer elements from the alien caches to the remote shared array. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6a3760e..dee857a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -898,6 +898,30 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } +/* + * Transfer objects in one arraycache to another. + * Locking must be handled by the caller. + * + * Return the number of entries transferred. + */ +static int transfer_objects(struct array_cache *to, + struct array_cache *from, unsigned int max) +{ + /* Figure out how many entries to transfer */ + int nr = min(min(from->avail, max), to->limit - to->avail); + + if (!nr) + return 0; + + memcpy(to->entry + to->avail, from->entry + from->avail -nr, + sizeof(void *) *nr); + + from->avail -= nr; + to->avail += nr; + to->touched = 1; + return nr; +} + #ifdef CONFIG_NUMA static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); @@ -2680,20 +2704,10 @@ retry: BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); - if (l3->shared) { - struct array_cache *shared_array = l3->shared; - if (shared_array->avail) { - if (batchcount > shared_array->avail) - batchcount = shared_array->avail; - shared_array->avail -= batchcount; - ac->avail = batchcount; - memcpy(ac->entry, - &(shared_array->entry[shared_array->avail]), - sizeof(void *) * batchcount); - shared_array->touched = 1; - goto alloc_done; - } - } + /* See if we can refill from the shared array */ + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + goto alloc_done; + while (batchcount > 0) { struct list_head *entry; struct slab *slabp; -- cgit v1.1 From e00946fe2351307eb3eda7a3343530f6d2d1af2e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 25 Mar 2006 03:06:45 -0800 Subject: [PATCH] slab: Bypass free lists for __drain_alien_cache() __drain_alien_cache() currently drains objects by freeing them to the (remote) freelists of the original node. However, each node also has a shared list containing objects to be used on any processor of that node. We can avoid a number of remote node accesses by copying the pointers to the free objects directly into the remote shared array. And while we are at it: Skip alien draining if the alien cache spinlock is already taken. Kiran reported that this is a performance benefit. Signed-off-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index dee857a..351aa6c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -971,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (ac->avail) { spin_lock(&rl3->list_lock); + /* + * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects + * into the free lists and getting them back later. + */ + transfer_objects(rl3->shared, ac, ac->limit); + free_block(cachep, ac->entry, ac->avail, node); ac->avail = 0; spin_unlock(&rl3->list_lock); @@ -986,8 +993,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) if (l3->alien) { struct array_cache *ac = l3->alien[node]; - if (ac && ac->avail) { - spin_lock_irq(&ac->lock); + + if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { __drain_alien_cache(cachep, ac, node); spin_unlock_irq(&ac->lock); } -- cgit v1.1 From cafeb02e098ecd58fb0bd797b2c9fbba3edf54f8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 25 Mar 2006 03:06:46 -0800 Subject: [PATCH] alloc_kmemlist: Some cleanup in preparation for a real memory leak fix Inspired by Jesper Juhl's patch from today 1. Get rid of err We do not set it to anything else but zero. 2. Drop the CONFIG_NUMA stuff. There are definitions for alloc_alien_cache and free_alien_cache() that do the right thing for the non NUMA case. 3. Better naming of variables. 4. Remove redundant cachep->nodelists[node] expressions. Signed-off-by: Christoph Lameter Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 351aa6c..ef9f60f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3424,37 +3424,38 @@ static int alloc_kmemlist(struct kmem_cache *cachep) { int node; struct kmem_list3 *l3; - int err = 0; + struct array_cache *new_shared; + struct array_cache **new_alien; for_each_online_node(node) { - struct array_cache *nc = NULL, *new; - struct array_cache **new_alien = NULL; -#ifdef CONFIG_NUMA + new_alien = alloc_alien_cache(node, cachep->limit); if (!new_alien) goto fail; -#endif - new = alloc_arraycache(node, cachep->shared*cachep->batchcount, + + new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new) + if (!new_shared) goto fail; + l3 = cachep->nodelists[node]; if (l3) { + struct array_cache *shared = l3->shared; + spin_lock_irq(&l3->list_lock); - nc = cachep->nodelists[node]->shared; - if (nc) - free_block(cachep, nc->entry, nc->avail, node); + if (shared) + free_block(cachep, shared->entry, shared->avail, node); - l3->shared = new; - if (!cachep->nodelists[node]->alien) { + l3->shared = new_shared; + if (!l3->alien) { l3->alien = new_alien; new_alien = NULL; } l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&l3->list_lock); - kfree(nc); + kfree(shared); free_alien_cache(new_alien); continue; } @@ -3465,16 +3466,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - l3->shared = new; + l3->shared = new_shared; l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; cachep->nodelists[node] = l3; } - return err; + return 0; fail: - err = -ENOMEM; - return err; + return -ENOMEM; } struct ccupdate_struct { -- cgit v1.1 From 0718dc2a82c865ca75975acabaf984057f9fd488 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 25 Mar 2006 03:06:47 -0800 Subject: [PATCH] slab: fix memory leak in alloc_kmemlist We have had this memory leak for a while now. The situation is complicated by the use of alloc_kmemlist() as a function to resize various caches by do_tune_cpucache(). What we do here is first of all make sure that we deallocate properly in the loop over all the nodes. If we are just resizing caches then we can simply return with -ENOMEM if an allocation fails. If the cache is new then we need to rollback and remove all earlier allocations. We detect that a cache is new by checking if the link to the global cache chain has been setup. This is a bit hackish .... (also fix up too overlong lines that I added in the last patch...) Signed-off-by: Christoph Lameter Cc: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ef9f60f..6818374 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3418,7 +3418,7 @@ const char *kmem_cache_name(struct kmem_cache *cachep) EXPORT_SYMBOL_GPL(kmem_cache_name); /* - * This initializes kmem_list3 for all nodes. + * This initializes kmem_list3 or resizes varioius caches for all nodes. */ static int alloc_kmemlist(struct kmem_cache *cachep) { @@ -3433,10 +3433,13 @@ static int alloc_kmemlist(struct kmem_cache *cachep) if (!new_alien) goto fail; - new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, + new_shared = alloc_arraycache(node, + cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new_shared) + if (!new_shared) { + free_alien_cache(new_alien); goto fail; + } l3 = cachep->nodelists[node]; if (l3) { @@ -3445,7 +3448,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep) spin_lock_irq(&l3->list_lock); if (shared) - free_block(cachep, shared->entry, shared->avail, node); + free_block(cachep, shared->entry, + shared->avail, node); l3->shared = new_shared; if (!l3->alien) { @@ -3460,8 +3464,11 @@ static int alloc_kmemlist(struct kmem_cache *cachep) continue; } l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); - if (!l3) + if (!l3) { + free_alien_cache(new_alien); + kfree(new_shared); goto fail; + } kmem_list3_init(l3); l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + @@ -3473,7 +3480,23 @@ static int alloc_kmemlist(struct kmem_cache *cachep) cachep->nodelists[node] = l3; } return 0; + fail: + if (!cachep->next.next) { + /* Cache is not active yet. Roll back what we did */ + node--; + while (node >= 0) { + if (cachep->nodelists[node]) { + l3 = cachep->nodelists[node]; + + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + cachep->nodelists[node] = NULL; + } + node--; + } + } return -ENOMEM; } -- cgit v1.1 From d784124cfe9377c1a24d8efba31401f81c7c11f9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 25 Mar 2006 03:06:48 -0800 Subject: [PATCH] mm: make page migration dependent on swap and NUMA The page migration code could function without NUMA but we currently have no users for the non-NUMA case. Signed-off-by: Christoph Lameter Cc: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index bd80460..332f5c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS # config MIGRATION bool "Page migration" - def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM - depends on SWAP + def_bool y if NUMA + depends on SWAP && NUMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful for -- cgit v1.1 From f5335c0f1bcba16907972b66b905f62402433e23 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sat, 25 Mar 2006 03:06:49 -0800 Subject: [PATCH] quieten zone_pcp_init In zone_pcp_init we print out all zones even if they are empty: On node 0 totalpages: 245760 DMA zone: 245760 pages, LIFO batch:31 DMA32 zone: 0 pages, LIFO batch:0 Normal zone: 0 pages, LIFO batch:0 HighMem zone: 0 pages, LIFO batch:0 To conserve dmesg space why not print only the non zero zones. Signed-off-by: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5c3f8b..637f57f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2029,8 +2029,9 @@ static __meminit void zone_pcp_init(struct zone *zone) setup_pageset(zone_pcp(zone,cpu), batch); #endif } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + if (zone->present_pages) + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); } static __meminit void init_currently_empty_zone(struct zone *zone, -- cgit v1.1 From 05eeae208d08a05a6980cf2ff61f02843c0955fd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 25 Mar 2006 03:07:48 -0800 Subject: [PATCH] find_task_by_pid() needs tasklist_lock A couple of places are forgetting to take it. The kswapd case is probably unimportant. keventd_create_kthread() was racy. The whole thing is a bit flakey: you start a kernel thread, get its pid from kernel_thread() then look up its task_struct. a) It assumes that pid recycling takes a "long" time. b) We get a task_struct but no reference was taken on it. The owner of the kswapd and kthread task_struct*'s must assume that the new thread won't exit unexpectedly. Because if it does, they're left holding dead memory and any attempt to control or stop that task will crash. Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index fd572bb..78865c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1356,7 +1356,9 @@ static int __init kswapd_init(void) pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); BUG_ON(pid < 0); + read_lock(&tasklist_lock); pgdat->kswapd = find_task_by_pid(pid); + read_unlock(&tasklist_lock); } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); -- cgit v1.1 From 6e692ed37a507e18d8afe8e5faebd8c4722c5f12 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Sat, 25 Mar 2006 03:08:02 -0800 Subject: [PATCH] fix alloc_large_system_hash() roundup The "rounded up to nearest power of 2 in size" algorithm in alloc_large_system_hash is not correct. As coded, it takes an otherwise acceptable power-of-2 value and doubles it. For example, we see the error if we boot with thash_entries=2097152 which produces a hash table with 4194304 entries. Signed-off-by: John Hawkes Cc: Roland Dreier Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 637f57f..338a02b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2702,8 +2702,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); } - /* rounded up to nearest power of 2 in size */ - numentries = 1UL << (long_log2(numentries) + 1); + numentries = roundup_pow_of_two(numentries); /* limit allocation size to 1/16 total memory by default */ if (max == 0) { -- cgit v1.1 From 315ab19a6d12d6af7b6957090822f3057ab7e80f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Mar 2006 16:20:22 +0100 Subject: [PATCH] mm: restore vm_normal_page check Hugh is rightly concerned that the CONFIG_DEBUG_VM coverage has gone too far in vm_normal_page, considering that we expect production kernels to be shipped with the option turned off, and that the code has been under some large changes recently. Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 80c3fb3..e347e10 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } -#ifdef CONFIG_DEBUG_VM + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + */ if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); return NULL; } -#endif /* * NOTE! We still have PageReserved() pages in the page -- cgit v1.1 From 267b48014a5c0c2ae90b04dad5d95ceb903365a6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 25 Mar 2006 16:31:10 +0100 Subject: [PATCH] x86_64: Try to allocate node memmap near the end of node This fixes problems with very large nodes (over 128GB) filling up all of the first 4GB with their mem_map and not leaving enough space for the swiotlb. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/bootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 35c3229..b55bd39 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, * * NOTE: This function is _not_ reentrant. */ -static void * __init +void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -- cgit v1.1 From 5bcb28b139cffc736177ceb775d1c8b5c5a411e2 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 26 Mar 2006 18:30:52 +0200 Subject: BUG_ON() Conversion in mm/memory.c this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- mm/memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e347e10..d90ff9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2352,10 +2352,8 @@ int make_pages_present(unsigned long addr, unsigned long end) if (!vma) return -1; write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); + BUG_ON(addr >= end); + BUG_ON(end > vma->vm_end); len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); -- cgit v1.1 From f02e1fafb534459522a8c46bc46b32820684623e Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 26 Mar 2006 18:31:56 +0200 Subject: BUG_ON() Conversion in mm/mempool.c this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- mm/mempool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index f71893e..9ef13dd 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize); */ void mempool_destroy(mempool_t *pool) { - if (pool->curr_nr != pool->min_nr) - BUG(); /* There were outstanding elements */ + /* Check for outstanding elements */ + BUG_ON(pool->curr_nr != pool->min_nr); free_pool(pool); } EXPORT_SYMBOL(mempool_destroy); -- cgit v1.1 From 03beb07664d768db97bf454ae5c9581cd4737bb4 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 26 Mar 2006 01:36:57 -0800 Subject: [PATCH] Add API for flushing Anon pages Currently, get_user_pages() returns fully coherent pages to the kernel for anything other than anonymous pages. This is a problem for things like fuse and the SCSI generic ioctl SG_IO which can potentially wish to do DMA to anonymous pages passed in by users. The fix is to add a new memory management API: flush_anon_page() which is used in get_user_pages() to make anonymous pages coherent. Signed-off-by: James Bottomley Cc: Russell King Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e347e10..6768604 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1071,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (pages) { pages[i] = page; + + flush_anon_page(page, start); flush_dcache_page(page); } if (vmas) -- cgit v1.1 From 6e0678f394c7bd21bfa5d252b071a09e10e7a749 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:44 -0800 Subject: [PATCH] mempool: add page allocator This will be used by the next patch in the series to replace duplicate mempool-backed page allocators in 2 places in the kernel. It is also likely that there will be more users in the future. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index f71893e..45c0112 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -289,3 +289,21 @@ void mempool_free_slab(void *element, void *pool_data) kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); + +/* + * A simple mempool-backed page allocator that allocates pages + * of the order specified by pool_data. + */ +void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) +{ + int order = (int)(long)pool_data; + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL(mempool_alloc_pages); + +void mempool_free_pages(void *element, void *pool_data) +{ + int order = (int)(long)pool_data; + __free_pages(element, order); +} +EXPORT_SYMBOL(mempool_free_pages); -- cgit v1.1 From a19b27ce3847c3a5d4ea6b6c91b6f7154759af23 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:45 -0800 Subject: [PATCH] mempool: use common mempool page allocator Convert two mempool users that currently use their own mempool-backed page allocators to use the generic mempool page allocator. Also included are 2 trivial whitespace fixes. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index d0ea1ee..55885f6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,14 +31,9 @@ static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) { - return alloc_page(gfp_mask | GFP_DMA); -} - -static void page_pool_free(void *page, void *data) -{ - __free_page(page); + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); } /* @@ -51,11 +46,6 @@ static void page_pool_free(void *page, void *data) */ #ifdef CONFIG_HIGHMEM -static void *page_pool_alloc(gfp_t gfp_mask, void *data) -{ - return alloc_page(gfp_mask); -} - static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); @@ -229,7 +219,7 @@ static __init int init_emergency_pool(void) if (!i.totalhigh) return 0; - page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); + page_pool = mempool_create_page_pool(POOL_SIZE, 0); if (!page_pool) BUG(); printk("highmem bounce pool size: %d pages\n", POOL_SIZE); @@ -272,7 +262,8 @@ int init_emergency_isa_pool(void) if (isa_page_pool) return 0; - isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); if (!isa_page_pool) BUG(); @@ -337,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) bio_put(bio); } -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) { if (bio->bi_size) return 1; @@ -384,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int } static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, - mempool_t *pool) + mempool_t *pool) { struct page *page; struct bio *bio = NULL; -- cgit v1.1 From 53184082b070dfb077218828fdf839826102ed96 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:46 -0800 Subject: [PATCH] mempool: add kmalloc allocator Add another allocator to the common mempool code: a kmalloc/kfree allocator This will be used by the next patch in the series to replace duplicate mempool-backed kmalloc allocators in several places in the kernel. It is also very likely that there will be more users in the future. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 45c0112..a1397c6 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -291,6 +291,23 @@ void mempool_free_slab(void *element, void *pool_data) EXPORT_SYMBOL(mempool_free_slab); /* + * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory + * specfied by pool_data + */ +void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + return kmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kmalloc); + +void mempool_kfree(void *element, void *pool_data) +{ + kfree(element); +} +EXPORT_SYMBOL(mempool_kfree); + +/* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. */ -- cgit v1.1 From f183323d3822dee4d7b3147a59b6e8987fe201e0 Mon Sep 17 00:00:00 2001 From: Matthew Dobson Date: Sun, 26 Mar 2006 01:37:48 -0800 Subject: [PATCH] mempool: add kzalloc allocator Add another allocator to the common mempool code: a kzalloc/kfree allocator This will be used by the next patch in the series to replace a mempool-backed kzalloc allocator. It is also very likely that there will be more users in the future. Signed-off-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index a1397c6..7bf064e 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -296,11 +296,18 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t) pool_data; + size_t size = (size_t)(long)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); +void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + return kzalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kzalloc); + void mempool_kfree(void *element, void *pool_data) { kfree(element); -- cgit v1.1 From a117e66ed45ac0569c039ea60bd7a9a61e031858 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:25 -0800 Subject: [PATCH] unify pfn_to_page: generic functions There are 3 memory models, FLATMEM, DISCONTIGMEM, SPARSEMEM. Each arch has its own page_to_pfn(), pfn_to_page() for each models. But most of them can use the same arithmetic. This patch adds asm-generic/memory_model.h, which includes generic page_to_pfn(), pfn_to_page() definitions for each memory model. When CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y, out-of-line functions are used instead of macro. This is enabled by some archs and reduces text size. Signed-off-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Ralf Baechle Cc: Kyle McMartin Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 338a02b..349b328 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2745,3 +2745,45 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +/* + * pfn <-> page translation. out-of-line version. + * (see asm-generic/memory_model.h) + */ +#if defined(CONFIG_FLATMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return mem_map + (pfn - ARCH_PFN_OFFSET); +} +unsigned long page_to_pfn(struct page *page) +{ + return (page - mem_map) + ARCH_PFN_OFFSET; +} +#elif defined(CONFIG_DISCONTIGMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + int nid = arch_pfn_to_nid(pfn); + return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); +} +unsigned long page_to_pfn(struct page *page) +{ + struct zone *zone = page_zone(page); + return (page - zone->zone_mem_map) + zone->zone_start_pfn; + +} +#elif defined(CONFIG_SPARSEMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; +} + +unsigned long page_to_pfn(struct page *page) +{ + long section_id = page_to_section(page); + return page - __section_mem_map_addr(__nr_to_section(section_id)); +} +#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ -- cgit v1.1 From a0140c1d85637ee5f4ea7c78f066e3611a6a79dc Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:55 -0800 Subject: [PATCH] remove zone_mem_map This patch removes zone_mem_map. pfn_to_page uses pgdat, page_to_pfn uses zone. page_to_pfn can use pgdat instead of zone, which is only one user of zone_mem_map. By modifing it, we can remove zone_mem_map. Signed-off-by: KAMEZAWA Hiroyuki Cc: Dave Hansen Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 349b328..8dc8f27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2042,7 +2042,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone, zone_wait_table_init(zone, size); pgdat->nr_zones = zone_idx(zone) + 1; - zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); @@ -2768,9 +2767,8 @@ struct page *pfn_to_page(unsigned long pfn) } unsigned long page_to_pfn(struct page *page) { - struct zone *zone = page_zone(page); - return (page - zone->zone_mem_map) + zone->zone_start_pfn; - + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; } #elif defined(CONFIG_SPARSEMEM) struct page *pfn_to_page(unsigned long pfn) -- cgit v1.1 From 679bc9fbb508a0aac9539b2de747eb5849feb428 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:58 -0800 Subject: [PATCH] for_each_online_pgdat: for_each_bootmem Add a list_head to bootmem_data_t and make bootmems use it. bootmem list is sorted by node_boot_start. Only nodes against which init_bootmem() is called are linked to the list. (i386 allocates bootmem only from one node(0) not from all online nodes.) A summary: 1. for_each_online_pgdat() traverses all *online* nodes. 2. alloc_bootmem() allocates memory only from initialized-for-bootmem nodes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index b55bd39..d3e3bd2 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so * dma_get_required_mask(), which uses * it, can be an inline function */ +static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* * If we have booted due to a crash, max_pfn will be a very low value. We need @@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) return mapsize; } +/* + * link bdata in order + */ +static void link_bootmem(bootmem_data_t *bdata) +{ + bootmem_data_t *ent; + if (list_empty(&bdata_list)) { + list_add(&bdata->list, &bdata_list); + return; + } + /* insert in order */ + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_boot_start < ent->node_boot_start) { + list_add_tail(&bdata->list, &ent->list); + return; + } + } + list_add_tail(&bdata->list, &bdata_list); + return; +} + /* * Called once to set up the allocator itself. @@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - mapsize = ALIGN(mapsize, sizeof(long)); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; + link_bootmem(bdata); /* * Initially all pages are reserved - setup_arch() has to @@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void) void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, 0))) + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) return(ptr); /* @@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, LOW32LIMIT))) return(ptr); -- cgit v1.1 From ec936fc563715a9e2b2e363eb060655b49529325 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:59 -0800 Subject: [PATCH] for_each_online_pgdat: renaming for_each_pgdat Replace for_each_pgdat() with for_each_online_pgdat(). Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- mm/vmscan.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8dc8f27..ccc3713 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1201,7 +1201,7 @@ unsigned int nr_free_highpages (void) pg_data_t *pgdat; unsigned int pages = 0; - for_each_pgdat(pgdat) + for_each_online_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; return pages; @@ -1343,7 +1343,7 @@ void get_zone_counts(unsigned long *active, *active = 0; *inactive = 0; *free = 0; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long l, m, n; __get_zone_counts(&l, &m, &n, pgdat); *active += l; @@ -2482,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void) struct pglist_data *pgdat; int j, idx; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long present_pages = zone->present_pages; diff --git a/mm/vmscan.c b/mm/vmscan.c index 78865c8..acdf001 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1305,7 +1305,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; repeat: - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { unsigned long freed; freed = balance_pgdat(pgdat, nr_to_free, 0); @@ -1335,7 +1335,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, cpumask_t mask; if (action == CPU_ONLINE) { - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { mask = node_to_cpumask(pgdat->node_id); if (any_online_cpu(mask) != NR_CPUS) /* One of our CPUs online: restore mask */ @@ -1351,7 +1351,7 @@ static int __init kswapd_init(void) pg_data_t *pgdat; swap_setup(); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { pid_t pid; pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); -- cgit v1.1 From ae0f15fb91274e67d78836d38c99ec363df33073 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:01 -0800 Subject: [PATCH] for_each_online_pgdat: remove pgdat_list By using for_each_online_pgdat(), pgdat_list is not necessary now. This patch removes it. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ccc3713..dc523a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,7 +49,6 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; @@ -2169,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos) { pg_data_t *pgdat; loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) --node; return pgdat; @@ -2181,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) pg_data_t *pgdat = (pg_data_t *)arg; (*pos)++; - return pgdat->pgdat_next; + return next_online_pgdat(pgdat); } static void frag_stop(struct seq_file *m, void *arg) -- cgit v1.1 From 95144c788dc01b6a0ff2c9c2222e37ffdab358b8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:02 -0800 Subject: [PATCH] uninline zone helpers Helper functions for for_each_online_pgdat/for_each_zone look too big to be inlined. Speed of these helper macro itself is not very important. (inner loops are tend to do more work than this) This patch make helper function to be out-of-lined. inline out-of-line .text 005c0680 005bf6a0 005c0680 - 005bf6a0 = FE0 = 4Kbytes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/mmzone.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 mm/mmzone.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index f10c753..0b8f73f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o $(mmu-y) + prio_tree.o util.o mmzone.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 0000000..b022370 --- /dev/null +++ b/mm/mmzone.c @@ -0,0 +1,50 @@ +/* + * linux/mm/mmzone.c + * + * management codes for pgdats and zones. + */ + + +#include +#include +#include +#include + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +EXPORT_SYMBOL(first_online_pgdat); + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} +EXPORT_SYMBOL(next_online_pgdat); + + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} +EXPORT_SYMBOL(next_zone); + -- cgit v1.1 From 0a945022778f100115d0cb6234eb28fc1b15ccaf Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 28 Mar 2006 01:56:37 -0800 Subject: [PATCH] for_each_possible_cpu: fixes for generic part replaces for_each_cpu with for_each_possible_cpu(). Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- mm/swap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6818374..4cbf8bb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3311,7 +3311,7 @@ void *__alloc_percpu(size_t size) * and we have no way of figuring out how to fix the array * that we have allocated then.... */ - for_each_cpu(i) { + for_each_possible_cpu(i) { int node = cpu_to_node(i); if (node_online(node)) @@ -3398,7 +3398,7 @@ void free_percpu(const void *objp) /* * We allocate for all cpus so we cannot use for online cpu here. */ - for_each_cpu(i) + for_each_possible_cpu(i) kfree(p->ptrs[i]); kfree(p); } diff --git a/mm/swap.c b/mm/swap.c index 91b7e20..88895c2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -512,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc) spin_lock(&fbc->lock); ret = fbc->count; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { long *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } -- cgit v1.1 From 7f927fcc2fd1575d01efb4b76665975007945690 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 28 Mar 2006 01:56:53 -0800 Subject: [PATCH] Typo fixes Fix a lot of typos. Eyeballed by jmc@ in OpenBSD. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4f71cfd..dec8249 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -912,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* * Check if this process has the right to modify the specified * process. The right exists if the process has administrative - * capabilities, superuser priviledges or the same + * capabilities, superuser privileges or the same * userid as the target process. */ if ((current->euid != task->suid) && (current->euid != task->uid) && -- cgit v1.1 From 93fac7041f082297b93655a0e49f659cd7520e40 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 31 Mar 2006 02:29:56 -0800 Subject: [PATCH] mm: schedule find_trylock_page() removal find_trylock_page() is an odd interface in that it doesn't take a reference like the others. Now that XFS no longer uses it, and its last remaining caller actually wants an elevated refcount, opencode that callsite and schedule find_trylock_page() for removal. Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 39aa9d1..e5fd538 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -397,18 +397,24 @@ void free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)) == 1) { + page = find_get_page(&swapper_space, entry.val); + if (page && unlikely(TestSetPageLocked(page))) { + page_cache_release(page); + page = NULL; + } + } spin_unlock(&swap_lock); } if (page) { int one_user; BUG_ON(PagePrivate(page)); - page_cache_get(page); one_user = (page_count(page) == 2); /* Only cache user (+us), or swap space full? Free it! */ - if (!PageWriteback(page) && (one_user || vm_swap_full())) { + /* Also recheck PageSwapCache after page is locked (above) */ + if (PageSwapCache(page) && !PageWriteback(page) && + (one_user || vm_swap_full())) { delete_from_swap_cache(page); SetPageDirty(page); } -- cgit v1.1 From d6692183ac1d8f4a4e4015f9ce9acc2514618e0b Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 31 Mar 2006 02:29:57 -0800 Subject: [PATCH] fix extra page ref count in follow_hugetlb_page git-commit: d5d4b0aa4e1430d73050babba999365593bdb9d2 "[PATCH] optimize follow_hugetlb_page" breaks mlock on hugepage areas. I mis-interpret pages argument and made get_page() unconditional. It should only get a ref count when "pages" argument is non-null. Credit goes to Adam Litke who spotted the bug. Signed-off-by: Ken Chen Acked-by: Adam Litke Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ebad6bb..d87885e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -697,9 +697,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; page = pte_page(*pte); same_page: - get_page(page); - if (pages) + if (pages) { + get_page(page); pages[i] = page + pfn_offset; + } if (vmas) vmas[i] = vma; -- cgit v1.1 From 78c997a4be7d1ed3ff4c27f23d30a0185d39bcbf Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 31 Mar 2006 02:30:01 -0800 Subject: [PATCH] hugetlb: don't allow free hugetlb count fall below reserved count With strict page reservation, I think kernel should enforce number of free hugetlb page don't fall below reserved count. Currently it is possible in the sysctl path. Add proper check in sysctl to disallow that. Signed-off-by: Ken Chen Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d87885e..832f676 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -334,6 +334,7 @@ static unsigned long set_max_huge_pages(unsigned long count) return nr_huge_pages; spin_lock(&hugetlb_lock); + count = max(count, reserved_huge_pages); try_to_free_low(count); while (count < nr_huge_pages) { struct page *page = dequeue_huge_page(NULL, 0); -- cgit v1.1 From 9b41046cd0ee0a57f849d6e1363f7933e363cca9 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 31 Mar 2006 02:30:33 -0800 Subject: [PATCH] Don't pass boot parameters to argv_init[] The boot cmdline is parsed in parse_early_param() and parse_args(,unknown_bootoption). And __setup() is used in obsolete_checksetup(). start_kernel() -> parse_args() -> unknown_bootoption() -> obsolete_checksetup() If __setup()'s callback (->setup_func()) returns 1 in obsolete_checksetup(), obsolete_checksetup() thinks a parameter was handled. If ->setup_func() returns 0, obsolete_checksetup() tries other ->setup_func(). If all ->setup_func() that matched a parameter returns 0, a parameter is seted to argv_init[]. Then, when runing /sbin/init or init=app, argv_init[] is passed to the app. If the app doesn't ignore those arguments, it will warning and exit. This patch fixes a wrong usage of it, however fixes obvious one only. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8d8f525..0ec7bc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -87,7 +87,7 @@ int randomize_va_space __read_mostly = 1; static int __init disable_randmaps(char *s) { randomize_va_space = 0; - return 0; + return 1; } __setup("norandmaps", disable_randmaps); -- cgit v1.1 From f79e2abb9bd452d97295f34376dedbec9686b986 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 31 Mar 2006 02:30:42 -0800 Subject: [PATCH] sys_sync_file_range() Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The sycall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync()). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin Cc: Michael Kerrisk Cc: Ulrich Drepper Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 907c392..0a03357 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -35,17 +35,6 @@ * * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. * - * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently - * dirty pages at the disk. - * - * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push - * all of the currently dirty pages at the disk, wait until they have been - * written. - * - * It should be noted that none of these operations write out the file's - * metadata. So unless the application is strictly performing overwrites of - * already-instantiated disk blocks, there are no guarantees here that the data - * will be available after a crash. */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { @@ -129,15 +118,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) invalidate_mapping_pages(mapping, start_index, end_index); break; - case LINUX_FADV_ASYNC_WRITE: - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); - break; - case LINUX_FADV_WRITE_WAIT: - ret = wait_on_page_writeback_range(mapping, - offset >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); - break; default: ret = -EINVAL; } -- cgit v1.1