From 1e275d406bf6b88e4de6925cf594b64bb2ec49bc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 24 Feb 2006 13:04:12 -0800
Subject: [PATCH] page migration: Fix MPOL_INTERLEAVE behavior for migration
 via mbind()

migrate_pages_to() allocates a list of new pages on the intended target
node or with the intended policy and then uses the list of new pages as
targets for the migration of a list of pages out of place.

When the pages are allocated it is not clear which of the out of place
pages will be moved to which new page, so we cannot specify an address as
needed by alloc_page_vma().  This causes a problem for MPOL_INTERLEAVE,
which will currently allocate the pages on the first node of the set.  If
mbind() is used with a vma that has the MPOL_INTERLEAVE policy then the
interleaving of pages may be destroyed.

This patch fixes that by generating a fake address for each
alloc_page_vma() call, which results in a distribution of pages as
prescribed by MPOL_INTERLEAVE.

Lee also noted that the sequence of nodes for the new pages seems to be
inverted.  So we also invert the way the lists of pages for migration are
built.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Looks-ok-to: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 880831b..67af4ce 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -552,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (isolate_lru_page(page))
-			list_add(&page->lru, pagelist);
+			list_add_tail(&page->lru, pagelist);
 	}
 }
 
@@ -569,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
 	int err = 0;
+	unsigned long offset = 0;
 	int nr_pages;
 	struct page *page;
 	struct list_head *p;
@@ -576,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist,
 redo:
 	nr_pages = 0;
 	list_for_each(p, pagelist) {
-		if (vma)
-			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
 		else
 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
@@ -585,7 +599,7 @@ redo:
 			err = -ENOMEM;
 			goto out;
 		}
-		list_add(&page->lru, &newlist);
+		list_add_tail(&page->lru, &newlist);
 		nr_pages++;
 		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
-- 
cgit v1.1


From d4f7796e9b387e471ab0e8ed4e0c2bd616b3c193 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 24 Feb 2006 13:04:22 -0800
Subject: [PATCH] vmscan: fix zone_reclaim

- PF_SWAPWRITE needs to be set for RECLAIM_SWAP to be able to write
  out pages to swap. Currently RECLAIM_SWAP may not do that.

- remove setting nr_reclaimed pages after slab reclaim since the slab shrinking
  code does not use that and the nr_reclaimed pages is just right for the
  intended follow up action.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1838c15..b0af759 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1908,7 +1908,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 
 	cond_resched();
-	p->flags |= PF_MEMALLOC;
+	/*
+	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
+	 * and we also need to be able to write out pages for RECLAIM_WRITE
+	 * and RECLAIM_SWAP.
+	 */
+	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
@@ -1932,11 +1937,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * a long time.
 		 */
 		shrink_slab(sc.nr_scanned, gfp_mask, order);
-		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */
 	}
 
 	p->reclaim_state = NULL;
-	current->flags &= ~PF_MEMALLOC;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
-- 
cgit v1.1


From 511030bcd24119fa3759ef3f914d354e107ef839 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Tue, 28 Feb 2006 16:58:57 -0800
Subject: [PATCH] Fix sys_migrate_pages: Move all pages when invoked from root

Currently sys_migrate_pages only moves pages belonging to a process.  This
is okay when invoked from a regular user.  But if invoked from root it
should move all pages as documented in the migrate_pages manpage.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 67af4ce..5643cfe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -954,7 +954,8 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		goto out;
 	}
 
-	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
-- 
cgit v1.1


From e8788c0cce63e0cc8689a123d1ce0af1e28cd583 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Tue, 28 Feb 2006 16:59:16 -0800
Subject: [PATCH] remove_from_swap: fix locking

remove_from_swap() currently attempts to use page_lock_anon_vma to obtain
an anon_vma lock.  That is not working since the page may have been
remapped via swap ptes in order to move the page.

However, do_migrate_pages() obtains the mmap_sem lock and therefore there
is a guarantee that the anonymous vma will not vanish from under us.  There
is therefore no need to use page_lock_anon_vma.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/rmap.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index df2c41c..d8ce5ff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
  * through real pte's pointing to valid pages and then releasing
  * the page from the swap cache.
  *
- * Must hold page lock on page.
+ * Must hold page lock on page and mmap_sem of one vma that contains
+ * the page.
  */
 void remove_from_swap(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned long mapping;
 
-	if (!PageAnon(page) || !PageSwapCache(page))
+	if (!PageSwapCache(page))
 		return;
 
-	anon_vma = page_lock_anon_vma(page);
-	if (!anon_vma)
+	mapping = (unsigned long)page->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 		return;
 
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 		remove_vma_swap(vma, page);
 
 	spin_unlock(&anon_vma->lock);
-
 	delete_from_swap_cache(page);
 }
 EXPORT_SYMBOL(remove_from_swap);
-- 
cgit v1.1


From f61388822a6040ff462c5f7260daa0f1017f2db0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 28 Feb 2006 16:59:18 -0800
Subject: [PATCH] nommu: implement vmalloc_node()

Fix oprofile linkage.   Pointed out by "Luke Yang" <luke.adi@gmail.com>.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/nommu.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 99d2102..4951f47 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,7 +53,6 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
@@ -205,6 +204,13 @@ void *vmalloc(unsigned long size)
 {
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
+EXPORT_SYMBOL(vmalloc);
+
+void *vmalloc_node(unsigned long size, int node)
+{
+	return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
 
 /*
  *	vmalloc_32  -  allocate virtually continguos memory (32bit addressable)
-- 
cgit v1.1


From d6713e046336ffa98060418c4d2c65243639e107 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 28 Feb 2006 16:59:19 -0800
Subject: [PATCH] out_of_memory(): use of uninitialised

Under some circumstances `points' can get printed before it's initialised.
Spotted by Carlos Martin <carlos@cmartin.tk>.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8123fad..c86c737 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -302,7 +302,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t *p;
-	unsigned long points;
+	unsigned long points = 0;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
-- 
cgit v1.1


From 140ffcec4def3ee3af7565b2cf1d3b2580f7e180 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 2 Mar 2006 02:54:28 -0800
Subject: [PATCH] out_of_memory() locking fix

I seem to have lost this read_unlock().

While we're there, let's turn that interruptible sleep into an
uninterruptible one, so we don't get a busywait if signal_pending().
(Again.  We seem to have a habit of doing this).

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c86c737..78747af 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -355,6 +355,7 @@ retry:
 	}
 
 out:
+	read_unlock(&tasklist_lock);
 	cpuset_unlock();
 	if (mm)
 		mmput(mm);
@@ -364,5 +365,5 @@ out:
 	 * retry to allocate memory unless "p" is current
 	 */
 	if (!test_thread_flag(TIF_MEMDIE))
-		schedule_timeout_interruptible(1);
+		schedule_timeout_uninterruptible(1);
 }
-- 
cgit v1.1


From a57ebfdb2cf9fa60dfa2f403f70ef6c432ca2a62 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Thu, 2 Mar 2006 02:54:37 -0800
Subject: [PATCH] numa_maps: Fix potential crash on non IA64 platforms

numa_maps should not scan over huge vmas in order not to cause problems for
non IA64 platforms that may have pte entries pointing to huge pages in a
variety of ways in their page tables.  Add a simple check to ignore vmas
containing huge pages.

Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5643cfe..1a21008 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1793,7 +1793,8 @@ int show_numa_map(struct seq_file *m, void *v)
 	if (!md)
 		return 0;
 
-	check_pgd_range(vma, vma->vm_start, vma->vm_end,
+	if (!is_vm_hugetlb_page(vma))
+		check_pgd_range(vma, vma->vm_start, vma->vm_end,
 		    &node_online_map, MPOL_MF_STATS, md);
 
 	if (md->pages) {
-- 
cgit v1.1


From 264132bc62fe071d0ff378c1103bae9d33212f10 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 6 Mar 2006 12:10:07 -0800
Subject: Fix "check_slabp" printout size calculation

We want to use the "struct slab" size, not the size of the pointer to
same.  As it is, we'd not print out the last <n> entry pointers in the
slab (where <n> is ~10, depending on whether it's a 32-bit or 64-bit
kernel).

Gaah, that slab code was written by somebody who likes unreadable crud.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index add05d8..2b0b151 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2554,7 +2554,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
 		       cachep->name, cachep->num, slabp, slabp->inuse);
 		for (i = 0;
-		     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 		     i++) {
 			if ((i % 16) == 0)
 				printk("\n%03x:", i);
-- 
cgit v1.1


From 9888e6fa7b68d9c8cc2c162a90979825ab45150a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 6 Mar 2006 17:44:43 -0800
Subject: slab: clarify and fix calculate_slab_order()

If we triggered the 'offslab_limit' test, we would return with
cachep->gfporder incremented once too many times.

This clarifies the logic somewhat, and fixes that bug.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 2b0b151..f2e92dc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1628,36 +1628,36 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
+	int gfporder;
 
-	for (;; cachep->gfporder++) {
+	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
-		if (cachep->gfporder > MAX_GFP_ORDER) {
-			cachep->num = 0;
-			break;
-		}
-
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &remainder, &num);
+		cache_estimate(gfporder, size, align, flags, &remainder, &num);
 		if (!num)
 			continue;
+
 		/* More than offslab_limit objects will cause problems */
-		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
 			break;
 
+		/* Found something acceptable - save it away */
 		cachep->num = num;
+		cachep->gfporder = gfporder;
 		left_over = remainder;
 
 		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (cachep->gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_break_gfp_order)
 			break;
 
-		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-			/* Acceptable internal fragmentation */
+		/*
+		 * Acceptable internal fragmentation?
+		 */
+		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
-- 
cgit v1.1


From 397874dfe9862b494e1fdcd2baef4ac432d224c8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Mon, 6 Mar 2006 15:42:53 -0800
Subject: [PATCH] numa_maps update

Change the format of numa_maps to be more compact and contain additional
information that is useful for managing and troubleshooting memory on a
NUMA system.  Numa_maps can now also support huge pages.

Fixes:

1. More compact format. Only display fields if they contain additional
	information.

2. Always display information for all vmas. The old numa_maps did not display
	vmas with no mapped entries. This was a bit confusing because page
	migration removes ptes for file backed vmas, so after page migration
	some of the vmas vanished from the output.

3. Rename maxref to maxmap. This is the maximum mapcount of all the pages
	in a vma and may be used as an indicator as to how many processes
	may be using a certain vma.

4. Include the ability to scan over huge page vmas.

New items shown:

dirty
	Number of pages in a vma that have either the dirty bit set in the
	page_struct or in the pte.

file=<filename>
	The file backing the pages if any

stack
	Stack area

heap
	Heap area

huge
	Huge page area. The number of pages shown is the number of huge
	pages, not the number of regular sized pages.

swapcache
	Number of pages with swap references. Must be >0 in order to
	be shown.

active
	Number of active pages. Only displayed if different from the number
	of pages mapped.

writeback
	Number of pages under writeback. Only displayed if >0.

Sample output of a process using huge pages:

00000000 default
2000000000000000 default file=/lib/ld-2.3.90.so mapped=13 mapmax=30 N0=13
2000000000044000 default file=/lib/ld-2.3.90.so anon=2 dirty=2 swapcache=2 N2=2
2000000000064000 default file=/lib/librt-2.3.90.so mapped=2 active=1 N1=1 N3=1
2000000000074000 default file=/lib/librt-2.3.90.so
2000000000080000 default file=/lib/librt-2.3.90.so anon=1 swapcache=1 N2=1
2000000000084000 default
2000000000088000 default file=/lib/libc-2.3.90.so mapped=52 mapmax=32 active=48 N0=52
20000000002bc000 default file=/lib/libc-2.3.90.so
20000000002c8000 default file=/lib/libc-2.3.90.so anon=3 dirty=2 swapcache=3 active=2 N1=1 N2=2
20000000002d4000 default anon=1 swapcache=1 N1=1
20000000002d8000 default file=/lib/libpthread-2.3.90.so mapped=8 mapmax=3 active=7 N2=2 N3=6
20000000002fc000 default file=/lib/libpthread-2.3.90.so
2000000000308000 default file=/lib/libpthread-2.3.90.so anon=1 dirty=1 swapcache=1 N1=1
200000000030c000 default anon=1 dirty=1 swapcache=1 N1=1
2000000000320000 default anon=1 dirty=1 N1=1
200000000071c000 default
2000000000720000 default anon=2 dirty=2 swapcache=1 N1=1 N2=1
2000000000f1c000 default
2000000000f20000 default anon=2 dirty=2 swapcache=1 active=1 N2=1 N3=1
200000000171c000 default
2000000001720000 default anon=1 dirty=1 swapcache=1 N1=1
2000000001b20000 default
2000000001b38000 default file=/lib/libgcc_s.so.1 mapped=2 N1=2
2000000001b48000 default file=/lib/libgcc_s.so.1
2000000001b54000 default file=/lib/libgcc_s.so.1 anon=1 dirty=1 active=0 N1=1
2000000001b58000 default file=/lib/libunwind.so.7.0.0 mapped=2 active=1 N1=2
2000000001b74000 default file=/lib/libunwind.so.7.0.0
2000000001b80000 default file=/lib/libunwind.so.7.0.0
2000000001b84000 default
4000000000000000 default file=/media/huge/test9 mapped=1 N1=1
6000000000000000 default file=/media/huge/test9 anon=1 dirty=1 active=0 N1=1
6000000000004000 default heap
607fffff7fffc000 default anon=1 dirty=1 swapcache=1 N2=1
607fffffff06c000 default stack anon=1 dirty=1 active=0 N1=1
8000000060000000 default file=/mnt/huge/test0 huge dirty=3 N1=3
8000000090000000 default file=/mnt/huge/test1 huge dirty=3 N0=1 N2=2
80000000c0000000 default file=/mnt/huge/test2 huge dirty=3 N1=1 N3=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 120 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 95 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1a21008..d80fa7d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -197,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
@@ -239,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		if (flags & MPOL_MF_STATS)
-			gather_stats(page, private);
+			gather_stats(page, private, pte_dirty(*pte));
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(page, private, flags);
 		else
@@ -1753,67 +1753,137 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
-	unsigned long mapped;
+	unsigned long active;
+	unsigned long writeback;
 	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
 	unsigned long node[MAX_NUMNODES];
 };
 
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
 	struct numa_maps *md = private;
 	int count = page_mapcount(page);
 
-	if (count)
-		md->mapped++;
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
 
-	if (count > md->mapcount_max)
-		md->mapcount_max = count;
+	if (PageSwapCache(page))
+		md->swapcache++;
 
-	md->pages++;
+	if (PageActive(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
 
 	if (PageAnon(page))
 		md->anon++;
 
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
 	md->node[page_to_nid(page)]++;
 	cond_resched();
 }
 
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	unsigned long addr;
+	struct page *page;
+
+	for (addr = start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+		pte_t pte;
+
+		if (!ptep)
+			continue;
+
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		page = pte_page(pte);
+		if (!page)
+			continue;
+
+		gather_stats(page, md, pte_dirty(*ptep));
+	}
+}
+
 int show_numa_map(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
 	int n;
 	char buffer[50];
 
-	if (!vma->vm_mm)
+	if (!mm)
 		return 0;
 
 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
 		return 0;
 
-	if (!is_vm_hugetlb_page(vma))
+	mpol_to_str(buffer, sizeof(buffer),
+			get_vma_policy(task, vma, vma->vm_start));
+
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
+
+	if (is_vm_hugetlb_page(vma)) {
+		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+		seq_printf(m, " huge");
+	} else {
 		check_pgd_range(vma, vma->vm_start, vma->vm_end,
-		    &node_online_map, MPOL_MF_STATS, md);
+				&node_online_map, MPOL_MF_STATS, md);
+	}
+
+	if (!md->pages)
+		goto out;
 
-	if (md->pages) {
-		mpol_to_str(buffer, sizeof(buffer),
-			    get_vma_policy(task, vma, vma->vm_start));
+	if (md->anon)
+		seq_printf(m," anon=%lu",md->anon);
 
-		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-			   vma->vm_start, buffer, md->pages,
-			   md->mapped, md->mapcount_max);
+	if (md->dirty)
+		seq_printf(m," dirty=%lu",md->dirty);
 
-		if (md->anon)
-			seq_printf(m," anon=%lu",md->anon);
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
 
-		for_each_online_node(n)
-			if (md->node[n])
-				seq_printf(m, " N%d=%lu", n, md->node[n]);
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
 
-		seq_putc(m, '\n');
-	}
+	if (md->swapcache)
+		seq_printf(m," swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m," active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m," writeback=%lu", md->writeback);
+
+	for_each_online_node(n)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
 	kfree(md);
 
 	if (m->count < m->size)
-- 
cgit v1.1


From f78bb8ad482267b92c122f0e37a7dce69c880247 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Wed, 8 Mar 2006 10:33:05 -0800
Subject: slab: fix calculate_slab_order() for SLAB_RECLAIM_ACCOUNT

Instead of having a hard-to-read and confusing conditional in the
caller, just make the slab order calculation handle this special case,
since it's simple and obvious there.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f2e92dc..6ad6bd5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1648,6 +1648,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 		left_over = remainder;
 
 		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		if (flags & SLAB_RECLAIM_ACCOUNT)
+			break;
+
+		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
@@ -1869,17 +1877,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	size = ALIGN(size, align);
 
-	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		cachep->gfporder = 0;
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &left_over, &cachep->num);
-	} else
-		left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
-- 
cgit v1.1


From 7f709ed0e3ccd3e88e0632b69f00174e83f8d98b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 7 Mar 2006 21:55:22 -0800
Subject: [PATCH] numa_maps-update fix

Fix the mm/mempolicy.c build for !CONFIG_HUGETLB_PAGE.

Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d80fa7d..954981b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1789,6 +1789,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
 	cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 static void check_huge_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
 		struct numa_maps *md)
@@ -1814,6 +1815,13 @@ static void check_huge_range(struct vm_area_struct *vma,
 		gather_stats(page, md, pte_dirty(*ptep));
 	}
 }
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
 
 int show_numa_map(struct seq_file *m, void *v)
 {
-- 
cgit v1.1


From e2bab3d92486fb781f4d06f56339264ed1492392 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 7 Mar 2006 21:55:31 -0800
Subject: [PATCH] percpu_counter_sum()

Implement percpu_counter_sum().  This is a more accurate but slower version of
percpu_counter_read_positive().

We need this for Alex's speedup-ext3_statfs patch and for the nr_file
accounting fix.  Otherwise these things would be too inaccurate on large CPU
counts.

Cc: Ravikiran G Thirumalai <kiran@scalex86.org>
Cc: Alex Tomas <alex@clusterfs.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index cce3dda..e9ec06d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
+		*pcount = 0;
 		spin_unlock(&fbc->lock);
-		count = 0;
+	} else {
+		*pcount = count;
 	}
-	*pcount = count;
 	put_cpu();
 }
 EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result.  This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	long ret;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	ret = fbc->count;
+	for_each_cpu(cpu) {
+		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	spin_unlock(&fbc->lock);
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
 #endif
 
 /*
-- 
cgit v1.1


From 07ed76b2a085a31f427c2a912a562627947dc7de Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Tue, 7 Mar 2006 21:55:46 -0800
Subject: [PATCH] slab: allocate larger cache_cache if order 0 fails

kmem_cache_init() incorrectly assumes that the cache_cache object will fit
in an order 0 allocation.  On very large systems, this is not true.  Change
the code to try larger order allocations if order 0 fails.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 6ad6bd5..61800b8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1124,6 +1124,7 @@ void __init kmem_cache_init(void)
 	struct cache_sizes *sizes;
 	struct cache_names *names;
 	int i;
+	int order;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1168,15 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-		       &left_over, &cache_cache.num);
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
 	if (!cache_cache.num)
 		BUG();
-
+	cache_cache.gfporder = order;
 	cache_cache.colour = left_over / cache_cache.colour_off;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
-- 
cgit v1.1


From 85a6cd03a97f04ffff7bfedfa3172894ca9a617b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Thu, 9 Mar 2006 17:33:34 -0800
Subject: [PATCH] page_add_file_rmap(): remove BUG_ON()s

Remove two early-development BUG_ONs from page_add_file_rmap.

The pfn_valid test (originally useful for checking that nobody passed an
artificial struct page) comes too late, since we already have the struct
page.

The PageAnon test (useful when anon was first distinguished from file rmap)
prevents ->nopage implementations from reusing ->mapping, which would
otherwise be available.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/rmap.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index d8ce5ff..67f0e20 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -537,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-	BUG_ON(PageAnon(page));
-	BUG_ON(!pfn_valid(page_to_pfn(page)));
-
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_page_state(nr_mapped);
 }
-- 
cgit v1.1


From a6bf527091b1dd40f1b6a496812ce7520621c282 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Thu, 9 Mar 2006 17:33:47 -0800
Subject: [PATCH] vmscan: no zone_reclaim if PF_MALLOC is set

If the process has already set PF_MEMALLOC and is already using
current->reclaim_state then do not try to reclaim memory from the zone.
PF_MEMALLOC is set by kswapd and/or synchronous global reclaim, which will
not take it lightly if we zap the reclaim_state.

Signed-off-by: Christoph Lameter <clameter@sig.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0af759..7ccf763 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1883,7 +1883,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	if (!(gfp_mask & __GFP_WAIT) ||
 		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0)
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(p->flags & PF_MEMALLOC))
 			return 0;
 
 	node_id = zone->zone_pgdat->node_id;
-- 
cgit v1.1


From f2937be5895dbae23ff66767a2fc17793e63159c Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Thu, 9 Mar 2006 17:33:51 -0800
Subject: [PATCH] memory hotadd: pgdat->node_present_pages fix

When pages are onlined, not only zone->present_pages but also
pgdat->node_present_pages should be refreshed.

This parameter is used to show information at
/sys/devices/system/node/nodeX/meminfo via si_meminfo_node().

Otherwise it shows a strange value for MemUsed, which is calculated as
(node_present_pages - all zones free pages).

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory_hotplug.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77..1fe76d9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 		onlined_pages++;
 	}
 	zone->present_pages += onlined_pages;
+	zone->zone_pgdat->node_present_pages += onlined_pages;
 
 	setup_per_zone_pages_min();
 
-- 
cgit v1.1


From 8fce4d8e3b9e3cf47cc8afeb6077e22ab795d989 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Thu, 9 Mar 2006 17:33:54 -0800
Subject: [PATCH] slab: Node rotor for freeing alien caches and remote per cpu
 pages.

The cache reaper currently tries to free all alien caches and all remote
per cpu pages in each pass of cache_reap.  For machines with a large number
of nodes (such as Altix) this may lead to sporadic delays of around 10ms.
Interrupts are disabled while reclaiming, creating unacceptable delays.

This patch changes that behavior by adding a per cpu reap_node variable.
Instead of attempting to free all caches, we free only one alien cache and
the per cpu pages from one remote node.  That reduces the time spent in
cache_reap.  However, doing so will lengthen the time it takes to
completely drain all remote per cpu pagesets and all alien caches.  The
time needed will grow with the number of nodes in the system.  All caches
are drained when they overflow their respective capacity.  So the drawback
here is only that a bit of memory may be wasted for awhile longer.

Details:

1. Rename drain_remote_pages to drain_node_pages to allow the specification
   of the node to drain of pcp pages.

2. Add additional functions init_reap_node, next_reap_node for NUMA
   that manage a per cpu reap_node counter.

3. Add a reap_alien function that reaps only from the current reap_node.

For us this seems to be a critical issue.  Holdoffs of an average of ~7ms
cause some HPC benchmarks to slow down significantly.  F.e.  NAS parallel
slows down dramatically.  NAS parallel has a 12-16 seconds runtime w/o rotor
compared to 5.8 secs with the rotor patches.  It gets down to 5.05 secs with
the additional interrupt holdoff reductions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 17 +++++++--------
 mm/slab.c       | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d..234bd48 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-	struct zone *zone;
-	int i;
+	int i, z;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	for_each_zone(zone) {
+	for (z = 0; z < MAX_NR_ZONES; z++) {
+		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
 
-		/* Do not drain local pagesets */
-		if (zone->zone_pgdat->node_id == numa_node_id())
-			continue;
-
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
diff --git a/mm/slab.c b/mm/slab.c
index 61800b8..d0bd7f0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
 	dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node == MAX_NUMNODES)
+		node = 0;
+
+	__get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Also drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= MAX_NUMNODES))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
  * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->func == NULL) {
+		init_reap_node(cpu);
 		INIT_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+
+	if (l3->alien) {
+		struct array_cache *ac = l3->alien[node];
+		if (ac && ac->avail) {
+			spin_lock_irq(&ac->lock);
+			__drain_alien_cache(cachep, ac, node);
+			spin_unlock_irq(&ac->lock);
+		}
+	}
+}
+
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 #else
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -3497,8 +3557,7 @@ static void cache_reap(void *unused)
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3->alien);
+		reap_alien(searchp, l3);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3548,7 +3607,7 @@ static void cache_reap(void *unused)
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
-	drain_remote_pages();
+	next_reap_node();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
-- 
cgit v1.1


From 4983da07f1e2e8dc81cb9d640fbf35b899cdbdf2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Tue, 14 Mar 2006 19:50:19 -0800
Subject: [PATCH] page migration: fail if page is in a vma flagged VM_LOCKED

page migration currently simply retries a couple of times if try_to_unmap()
fails without inspecting the return code.

However, SWAP_FAIL indicates that the page is in a vma that has the
VM_LOCKED flag set (if ignore_refs == 1).  We can check for that return
code and avoid retrying the migration.

migrate_page_remove_references() now needs to return a reason why the
failure occurred.  So switch migrate_page_remove_references to use -Exx
style error codes.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ccf763..4fe7e3a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -700,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
 	 * the page.
 	 */
 	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return 1;
+		return -EAGAIN;
 
 	/*
 	 * Establish swap ptes for anonymous pages or destroy pte
@@ -721,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
 	 * If the page was not migrated then the PageSwapCache bit
 	 * is still set and the operation may continue.
 	 */
-	try_to_unmap(page, 1);
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> Permanent failure */
+		return -EPERM;
 
 	/*
 	 * Give up if we were unable to remove all mappings.
 	 */
 	if (page_mapcount(page))
-		return 1;
+		return -EAGAIN;
 
 	write_lock_irq(&mapping->tree_lock);
 
@@ -738,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
 			*radix_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
-		return 1;
+		return -EAGAIN;
 	}
 
 	/*
@@ -813,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
  */
 int migrate_page(struct page *newpage, struct page *page)
 {
+	int rc;
+
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	if (migrate_page_remove_references(newpage, page, 2))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
 
 	migrate_page_copy(newpage, page);
 
-- 
cgit v1.1


From 74c002410548c7cb1744b45d17a5fa21da515b63 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Tue, 14 Mar 2006 19:50:21 -0800
Subject: [PATCH] Consistent capabilites associated with MPOL_MOVE_ALL

It seems that setting scheduling policy and priorities is also the kind of
thing that might be performed in apps that also use the NUMA API, so it
would seem consistent to use CAP_SYS_NICE for NUMA also.

So use CAP_SYS_NICE for controlling migration permissions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 954981b..2a82060 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -748,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len,
 				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 	    || mode > MPOL_MAX)
 		return -EINVAL;
-	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	if (start & ~PAGE_MASK)
@@ -942,20 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
-	    !capable(CAP_SYS_ADMIN)) {
+	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
-	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	err = do_migrate_pages(mm, &old, &new,
-		capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
-- 
cgit v1.1


From 90036ee5938d89638e80f4d0d0700d0f2dbd4a6a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Thu, 16 Mar 2006 23:03:59 -0800
Subject: [PATCH] page migration: Fail with error if swap not setup

Currently the migration of anonymous pages will silently fail if no swap is
setup.  This patch makes page migration functions check for available swap
and fail with -ENODEV if no swap space is available.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2a82060..b21869a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -330,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
-	/* Clear the LRU lists so pages can be isolated */
-	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		/* Must have swap device for migration */
+		if (nr_swap_pages <= 0)
+			return ERR_PTR(-ENODEV);
+
+		/*
+		 * Clear the LRU lists so pages can be isolated.
+		 * Note that pages may be moved off the LRU after we have
+		 * drained them. Those pages will fail to migrate like other
+		 * pages that may be busy.
+		 */
 		lru_add_drain_all();
+	}
 
 	first = find_vma(mm, start);
 	if (!first)
-- 
cgit v1.1


From 5b40dc780ed996162f3af8712eb03beb24dcdbef Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Thu, 16 Mar 2006 23:04:07 -0800
Subject: [PATCH] fix race in pagevec_strip?

We can call try_to_release_page() with PagePrivate off and a valid
page->mapping.  This may cause all sorts of trouble for the filesystem
*_releasepage() handlers.  XFS bombs out in that case.

Lock the page before checking for page private.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index e9ec06d..b524ea9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -393,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec)
 		struct page *page = pvec->pages[i];
 
 		if (PagePrivate(page) && !TestSetPageLocked(page)) {
-			try_to_release_page(page, 0);
+			if (PagePrivate(page))
+				try_to_release_page(page, 0);
 			unlock_page(page);
 		}
 	}
-- 
cgit v1.1


From 6f5e6b9e69bf043074a0edabe3d271899c34eb79 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Thu, 16 Mar 2006 23:04:09 -0800
Subject: [PATCH] fix free swap cache latency

Lee Revell reported 28ms latency when a process with lots of swapped memory
exits.

2.6.15 introduced a latency regression when unmapping: in accounting the
zap_work latency breaker, pte_none counted 1, pte_present PAGE_SIZE, but a
swap entry counted nothing at all.  We think of pages present as the slow
case, but Lee's trace shows that free_swap_and_cache's radix tree lookup
can make a lot of work - and we could have been doing it many thousands of
times without a latency break.

Move the zap_work update up to account swap entries like pages present.
This does account non-linear pte_file entries, and unmap_mapping_range
skipping over swap entries, by the same amount even though they're quick:
but neither of those cases deserves complicating the code (and they're
treated no worse than they were in 2.6.14).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 9abc600..85e80a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -623,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			(*zap_work)--;
 			continue;
 		}
+
+		(*zap_work) -= PAGE_SIZE;
+
 		if (pte_present(ptent)) {
 			struct page *page;
 
-			(*zap_work) -= PAGE_SIZE;
-
 			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
-- 
cgit v1.1


From b40607fc02f8248828d52d88f91b7d68df1933b0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:07:39 -0800
Subject: [PATCH] __get_page_state() cpumask cleanup and fix

__get_page_state() has an open-coded for_each_cpu_mask() loop in it.

Tidy that up, then notice that the code was buggy:

	while (cpu < NR_CPUS) {
		unsigned long *in, *out, off;

		if (!cpu_isset(cpu, *cpumask))
			continue;

an obvious infinite loop.  I guess we just never call it with a holey cpu
mask.

Even after my cpumask size-reduction work, this patch increases code size :(

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd48..6177586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1214,24 +1214,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 
 static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
-	int cpu = 0;
+	unsigned cpu;
 
 	memset(ret, 0, nr * sizeof(unsigned long));
 	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
-	cpu = first_cpu(*cpumask);
-	while (cpu < NR_CPUS) {
-		unsigned long *in, *out, off;
-
-		if (!cpu_isset(cpu, *cpumask))
-			continue;
+	for_each_cpu_mask(cpu, *cpumask) {
+		unsigned long *in;
+		unsigned long *out;
+		unsigned off;
+		unsigned next_cpu;
 
 		in = (unsigned long *)&per_cpu(page_states, cpu);
 
-		cpu = next_cpu(cpu, *cpumask);
-
-		if (likely(cpu < NR_CPUS))
-			prefetch(&per_cpu(page_states, cpu));
+		next_cpu = next_cpu(cpu, *cpumask);
+		if (likely(next_cpu < NR_CPUS))
+			prefetch(&per_cpu(page_states, next_cpu));
 
 		out = (unsigned long *)ret;
 		for (off = 0; off < nr; off++)
-- 
cgit v1.1


From 46453a6e194a8c55fe6cf3dc8e1c4f24e2abc013 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:07:58 -0800
Subject: [PATCH] mm: never ClearPageLRU released pages

If vmscan finds a zero refcount page on the lru list, never ClearPageLRU
it.  This means the release code need not hold ->lru_lock to stabilise
PageLRU, so that lock may be skipped entirely when releasing !PageLRU pages
(because we know PageLRU won't have been temporarily cleared by vmscan,
which was previously guaranteed by holding the lock to synchronise against
vmscan).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c   | 52 +++++++++++++++++++++++++++-------------------------
 mm/vmscan.c | 18 +++++++++++-------
 2 files changed, 38 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index b524ea9..3045a0f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
  */
 void fastcall __page_cache_release(struct page *page)
 {
-	unsigned long flags;
-	struct zone *zone = page_zone(page);
+	if (PageLRU(page)) {
+		unsigned long flags;
+		struct zone *zone = page_zone(page);
 
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	if (TestClearPageLRU(page))
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (!TestClearPageLRU(page))
+			BUG();
 		del_page_from_lru(zone, page);
-	if (page_count(page) != 0)
-		page = NULL;
-	spin_unlock_irqrestore(&zone->lru_lock, flags);
-	if (page)
-		free_hot_page(page);
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+	free_hot_page(page);
 }
-
 EXPORT_SYMBOL(__page_cache_release);
 
 /*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
 	pagevec_init(&pages_to_free, cold);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
-		struct zone *pagezone;
 
 		if (unlikely(PageCompound(page))) {
 			if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
 		if (!put_page_testzero(page))
 			continue;
 
-		pagezone = page_zone(page);
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
-		}
-		if (TestClearPageLRU(page))
+		if (PageLRU(page)) {
+			struct zone *pagezone = page_zone(page);
+			if (pagezone != zone) {
+				if (zone)
+					spin_unlock_irq(&zone->lru_lock);
+				zone = pagezone;
+				spin_lock_irq(&zone->lru_lock);
+			}
+			if (!TestClearPageLRU(page))
+				BUG();
 			del_page_from_lru(zone, page);
-		if (page_count(page) == 0) {
-			if (!pagevec_add(&pages_to_free, page)) {
+		}
+
+		if (!pagevec_add(&pages_to_free, page)) {
+			if (zone) {
 				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_free(&pages_to_free);
-				pagevec_reinit(&pages_to_free);
-				zone = NULL;	/* No lock is held */
+				zone = NULL;
 			}
-		}
+			__pagevec_free(&pages_to_free);
+			pagevec_reinit(&pages_to_free);
+  		}
 	}
 	if (zone)
 		spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3a..acb7611 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1085,21 +1085,25 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		if (!TestClearPageLRU(page))
-			BUG();
 		list_del(&page->lru);
-		if (get_page_testone(page)) {
+		if (unlikely(get_page_testone(page))) {
 			/*
 			 * It is being freed elsewhere
 			 */
 			__put_page(page);
-			SetPageLRU(page);
 			list_add(&page->lru, src);
 			continue;
-		} else {
-			list_add(&page->lru, dst);
-			nr_taken++;
 		}
+
+		/*
+		 * Be careful not to clear PageLRU until after we're sure
+		 * the page is not being freed elsewhere -- the page release
+		 * code relies on it.
+		 */
+		if (!TestClearPageLRU(page))
+			BUG();
+		list_add(&page->lru, dst);
+		nr_taken++;
 	}
 
 	*scanned = scan;
-- 
cgit v1.1


From 8d438f96d2b8eade6cbcd8adfc22dae6f5cbd6c0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:07:59 -0800
Subject: [PATCH] mm: PageLRU no testset

PG_lru is protected by zone->lru_lock. It does not need TestSet/TestClear
operations.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c   | 16 ++++++++--------
 mm/vmscan.c | 20 +++++++++++---------
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 3045a0f..985324e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -214,8 +214,8 @@ void fastcall __page_cache_release(struct page *page)
 		struct zone *zone = page_zone(page);
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
-		if (!TestClearPageLRU(page))
-			BUG();
+		BUG_ON(!PageLRU(page));
+		ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
@@ -265,8 +265,8 @@ void release_pages(struct page **pages, int nr, int cold)
 				zone = pagezone;
 				spin_lock_irq(&zone->lru_lock);
 			}
-			if (!TestClearPageLRU(page))
-				BUG();
+			BUG_ON(!PageLRU(page));
+			ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
 
@@ -345,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		add_page_to_inactive_list(zone, page);
 	}
 	if (zone)
@@ -372,8 +372,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		if (TestSetPageActive(page))
 			BUG();
 		add_page_to_active_list(zone, page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index acb7611..40fb378 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1042,9 +1042,10 @@ int isolate_lru_page(struct page *page)
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
 		spin_lock_irq(&zone->lru_lock);
-		if (TestClearPageLRU(page)) {
+		if (PageLRU(page)) {
 			ret = 1;
 			get_page(page);
+			ClearPageLRU(page);
 			if (PageActive(page))
 				del_page_from_active_list(zone, page);
 			else
@@ -1085,6 +1086,8 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
+		BUG_ON(!PageLRU(page));
+
 		list_del(&page->lru);
 		if (unlikely(get_page_testone(page))) {
 			/*
@@ -1100,8 +1103,7 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		 * the page is not being freed elsewhere -- the page release
 		 * code relies on it.
 		 */
-		if (!TestClearPageLRU(page))
-			BUG();
+		ClearPageLRU(page);
 		list_add(&page->lru, dst);
 		nr_taken++;
 	}
@@ -1156,8 +1158,8 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			if (TestSetPageLRU(page))
-				BUG();
+			BUG_ON(PageLRU(page));
+			SetPageLRU(page);
 			list_del(&page->lru);
 			if (PageActive(page))
 				add_page_to_active_list(zone, page);
@@ -1276,8 +1278,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		if (!TestClearPageActive(page))
 			BUG();
 		list_move(&page->lru, &zone->inactive_list);
@@ -1305,8 +1307,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
-- 
cgit v1.1


From 4c84cacfa424264f7ad5287298d3ea4a3e935278 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:00 -0800
Subject: [PATCH] mm: PageActive no testset

PG_active is protected by zone->lru_lock, it does not need TestSet/TestClear
operations.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c   | 4 ++--
 mm/vmscan.c | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 985324e..cf88226 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -374,8 +374,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 		}
 		BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (TestSetPageActive(page))
-			BUG();
+		BUG_ON(PageActive(page));
+		SetPageActive(page);
 		add_page_to_active_list(zone, page);
 	}
 	if (zone)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 40fb378..8e477b1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1280,8 +1280,9 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
 		BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (!TestClearPageActive(page))
-			BUG();
+		BUG_ON(!PageActive(page));
+		ClearPageActive(page);
+
 		list_move(&page->lru, &zone->inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-- 
cgit v1.1


From 674539115cc88473f623581e1d53c0e2ecef2179 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:00 -0800
Subject: [PATCH] mm: less atomic ops

In the page release paths, we can be sure that nobody will mess with our
page->flags because the refcount has dropped to 0.  So no need for atomic
operations here.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index cf88226..91b7e20 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -215,7 +215,7 @@ void fastcall __page_cache_release(struct page *page)
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		BUG_ON(!PageLRU(page));
-		ClearPageLRU(page);
+		__ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
@@ -266,7 +266,7 @@ void release_pages(struct page **pages, int nr, int cold)
 				spin_lock_irq(&zone->lru_lock);
 			}
 			BUG_ON(!PageLRU(page));
-			ClearPageLRU(page);
+			__ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
 
-- 
cgit v1.1


From 5e9dace8d386def04219134d7160e8a778824764 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:01 -0800
Subject: [PATCH] mm: page_alloc less atomics

More atomic operation removal from page allocator

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6177586..1029198 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -190,7 +190,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
-		SetPageCompound(p);
+		__SetPageCompound(p);
 		set_page_private(p, (unsigned long)page);
 	}
 }
@@ -209,7 +209,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 		if (unlikely(!PageCompound(p) |
 				(page_private(p) != (unsigned long)page)))
 			bad_page(page);
-		ClearPageCompound(p);
+		__ClearPageCompound(p);
 	}
 }
 
-- 
cgit v1.1


From f205b2fe62d321403525065a4cb31b6bff1bbe53 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:02 -0800
Subject: [PATCH] mm: slab less atomics

Atomic operation removal from slab

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f0..5988adf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1402,7 +1402,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		atomic_add(i, &slab_reclaim_pages);
 	add_page_state(nr_slab, i);
 	while (i--) {
-		SetPageSlab(page);
+		__SetPageSlab(page);
 		page++;
 	}
 	return addr;
@@ -1418,8 +1418,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 	const unsigned long nr_freed = i;
 
 	while (i--) {
-		if (!TestClearPageSlab(page))
-			BUG();
+		BUG_ON(!PageSlab(page));
+		__ClearPageSlab(page);
 		page++;
 	}
 	sub_page_state(nr_slab, nr_freed);
-- 
cgit v1.1


From 7c8ee9a86340db686cd4314e9944dc9b6111bda9 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:03 -0800
Subject: [PATCH] mm: simplify vmscan vs release refcounting

The VM has an interesting race where a page refcount can drop to zero, but it
is still on the LRU lists for a short time.  This was solved by testing a 0->1
refcount transition when picking up pages from the LRU, and dropping the
refcount in that case.

Instead, use atomic_add_unless to ensure we never pick up a 0 refcount page
from the LRU, thus a 0 refcount page will never have its refcount elevated
until it is allocated again.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e477b1..e21bab4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1083,29 +1083,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	int scan = 0;
 
 	while (scan++ < nr_to_scan && !list_empty(src)) {
+		struct list_head *target;
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
 		BUG_ON(!PageLRU(page));
 
 		list_del(&page->lru);
-		if (unlikely(get_page_testone(page))) {
+		target = src;
+		if (likely(get_page_unless_zero(page))) {
 			/*
-			 * It is being freed elsewhere
+			 * Be careful not to clear PageLRU until after we're
+			 * sure the page is not being freed elsewhere -- the
+			 * page release code relies on it.
 			 */
-			__put_page(page);
-			list_add(&page->lru, src);
-			continue;
-		}
+			ClearPageLRU(page);
+			target = dst;
+			nr_taken++;
+		} /* else it is being freed elsewhere */
 
-		/*
-		 * Be careful not to clear PageLRU until after we're sure
-		 * the page is not being freed elsewhere -- the page release
-		 * code relies on it.
-		 */
-		ClearPageLRU(page);
-		list_add(&page->lru, dst);
-		nr_taken++;
+		list_add(&page->lru, target);
 	}
 
 	*scanned = scan;
-- 
cgit v1.1


From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:05 -0800
Subject: [PATCH] mm: split highorder pages

Have an explicit mm call to split higher order pages into individual pages.
This should help to avoid bugs and be more explicit about the code's intention.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c     |  4 +---
 mm/page_alloc.c | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 85e80a5..6af555c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1221,9 +1221,7 @@ out:
  * The page has to be a nice clean _individual_ kernel allocation.
  * If you allocate a compound page, you need to have marked it as
  * such (__GFP_COMP), or manually just split the page up yourself
- * (which is mainly an issue of doing "set_page_count(page, 1)" for
- * each sub-page, and then freeing them one by one when you free
- * them rather than freeing it as a compound page).
+ * (see split_page()).
  *
  * NOTE! Traditionally this was done with "remap_pfn_range()" which
  * took an arbitrary page protection parameter. This doesn't allow
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1029198..fc65e87 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -752,6 +752,28 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
+#ifdef CONFIG_MMU
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n]
+ * Each sub-page must be freed individually.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
+{
+	int i;
+
+	BUG_ON(PageCompound(page));
+	BUG_ON(!page_count(page));
+	for (i = 1; i < (1 << order); i++) {
+		BUG_ON(page_count(page + i));
+		set_page_count(page + i, 1);
+	}
+}
+#endif
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
-- 
cgit v1.1


From 545b1ea9bfa5a8ca9af33d63144bd4f2faaea8dd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:07 -0800
Subject: [PATCH] mm: cleanup bootmem

The bootmem code added to page_alloc.c duplicated some page freeing code
that it really doesn't need to because it is not so performance critical.

While we're here, make prefetching work properly by actually prefetching
the page we're about to use before prefetching ahead to the next one (i.e.
get the most important transaction started first).  Also prefetch just a
single page ahead rather than leaving a gap of 16.

Jack Steiner reported no problems with SGI's ia64 simulator.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fc65e87..7aa0181 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 int percpu_pagelist_fraction;
 
-static void fastcall free_hot_cold_page(struct page *page, int cold);
 static void __free_pages_ok(struct page *page, unsigned int order);
 
 /*
@@ -448,28 +447,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 	if (order == 0) {
 		__ClearPageReserved(page);
 		set_page_count(page, 0);
-
-		free_hot_cold_page(page, 0);
+		set_page_refs(page, 0);
+		__free_page(page);
 	} else {
-		LIST_HEAD(list);
 		int loop;
 
+		prefetchw(page);
 		for (loop = 0; loop < BITS_PER_LONG; loop++) {
 			struct page *p = &page[loop];
 
-			if (loop + 16 < BITS_PER_LONG)
-				prefetchw(p + 16);
+			if (loop + 1 < BITS_PER_LONG)
+				prefetchw(p + 1);
 			__ClearPageReserved(p);
 			set_page_count(p, 0);
 		}
 
-		arch_free_page(page, order);
-
-		mod_page_state(pgfree, 1 << order);
-
-		list_add(&page->lru, &list);
-		kernel_map_pages(page, 1 << order, 0);
-		free_pages_bulk(page_zone(page), 1, &list, order);
+		set_page_refs(page, order);
+		__free_pages(page, order);
 	}
 }
 
-- 
cgit v1.1


From a482289d46587ffcda4c85aab109fb74910d7a48 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:08 -0800
Subject: [PATCH] hugepage allocator cleanup

Insert "fresh" huge pages into the hugepage allocator by the same means as
they are freed back into it.  This reduces code size and allows
enqueue_huge_page to be inlined into the hugepage free fastpath.

Eliminate occurrences of hugepages on the free list with non-zero refcount.
This can allow stricter refcount checks in the future.  Also required for
lockless pagecache.

Signed-off-by: Nick Piggin <npiggin@suse.de>

"This patch also eliminates a leak "cleaned up" by re-clobbering the
refcount on every allocation from the hugepage freelists.  With respect to
the lockless pagecache, the crucial aspect is to eliminate unconditional
set_page_count() to 0 on pages with potentially nonzero refcounts, though
closer inspection suggests the assignments removed are entirely spurious."

Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5087077..39d49ec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-static struct page *alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
@@ -72,12 +72,15 @@ static struct page *alloc_fresh_huge_page(void)
 					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % num_online_nodes();
 	if (page) {
+		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+		return 1;
 	}
-	return page;
+	return 0;
 }
 
 void free_huge_page(struct page *page)
@@ -85,7 +88,6 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].lru.next = NULL;			/* reset dtor */
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -105,7 +107,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -114,7 +115,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
-	struct page *page;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
@@ -123,12 +123,8 @@ static int __init hugetlb_init(void)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			break;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,8 +150,8 @@ static void update_and_free_page(struct page *page)
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
-		set_page_count(&page[i], 0);
 	}
+	page[1].lru.next = NULL;
 	set_page_count(page, 1);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
@@ -188,12 +184,8 @@ static inline void try_to_free_low(unsigned long count)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	while (count > nr_huge_pages) {
-		struct page *page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			return nr_huge_pages;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
-- 
cgit v1.1


From 8fea4e96a8f29ccc34c244f54574680ce9b43631 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 22 Mar 2006 00:08:10 -0800
Subject: [PATCH] slab: object to index mapping cleanup

Clean up the object to index mapping that has been spread around mm/slab.c.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 5988adf..3d18b71 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -609,6 +609,18 @@ static inline struct slab *virt_to_slab(const void *obj)
 	return page_get_slab(page);
 }
 
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
+				 unsigned int idx)
+{
+	return slab->s_mem + cache->buffer_size * idx;
+}
+
+static inline unsigned int obj_to_index(struct kmem_cache *cache,
+					struct slab *slab, void *obj)
+{
+	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+}
+
 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
 struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
@@ -1568,18 +1580,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 		 * exist:
 		 */
 		struct slab *slabp = virt_to_slab(objp);
-		int objnr;
+		unsigned int objnr;
 
-		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+		objnr = obj_to_index(cachep, slabp, objp);
 		if (objnr) {
-			objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
+			objp = index_to_obj(cachep, slabp, objnr - 1);
 			realobj = (char *)objp + obj_offset(cachep);
 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
 			       realobj, size);
 			print_objinfo(cachep, objp, 2);
 		}
 		if (objnr + 1 < cachep->num) {
-			objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
+			objp = index_to_obj(cachep, slabp, objnr + 1);
 			realobj = (char *)objp + obj_offset(cachep);
 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
 			       realobj, size);
@@ -1598,7 +1610,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
 	int i;
 	for (i = 0; i < cachep->num; i++) {
-		void *objp = slabp->s_mem + cachep->buffer_size * i;
+		void *objp = index_to_obj(cachep, slabp, i);
 
 		if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1631,7 +1643,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 	if (cachep->dtor) {
 		int i;
 		for (i = 0; i < cachep->num; i++) {
-			void *objp = slabp->s_mem + cachep->buffer_size * i;
+			void *objp = index_to_obj(cachep, slabp, i);
 			(cachep->dtor) (objp, cachep, 0);
 		}
 	}
@@ -2307,7 +2319,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 	int i;
 
 	for (i = 0; i < cachep->num; i++) {
-		void *objp = slabp->s_mem + cachep->buffer_size * i;
+		void *objp = index_to_obj(cachep, slabp, i);
 #if DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
@@ -2363,7 +2375,7 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 
 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
 {
-	void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
+	void *objp = index_to_obj(cachep, slabp, slabp->free);
 	kmem_bufctl_t next;
 
 	slabp->inuse++;
@@ -2380,7 +2392,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
 			  int nodeid)
 {
-	unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
+	unsigned int objnr = obj_to_index(cachep, slabp, objp);
 
 #if DEBUG
 	/* Verify that the slab belongs to the intended node */
@@ -2565,10 +2577,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	if (cachep->flags & SLAB_STORE_USER)
 		*dbg_userword(cachep, objp) = caller;
 
-	objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+	objnr = obj_to_index(cachep, slabp, objp);
 
 	BUG_ON(objnr >= cachep->num);
-	BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
+	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
 
 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
 		/* Need to call the slab's constructor so the
-- 
cgit v1.1


From f30cf7d13eee420f5249b4d7709b46570098ab92 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 22 Mar 2006 00:08:11 -0800
Subject: [PATCH] slab: extract setup_cpu_cache

Extract setup_cpu_cache() function from kmem_cache_create() to make the
latter a little less complex.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 109 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 55 insertions(+), 54 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3d18b71..4d5c4b9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1748,6 +1748,60 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
+static void setup_cpu_cache(struct kmem_cache *cachep)
+{
+	if (g_cpucache_up == FULL) {
+		enable_cpucache(cachep);
+		return;
+	}
+	if (g_cpucache_up == NONE) {
+		/*
+		 * Note: the first kmem_cache_create must create the cache
+		 * that's used by kmalloc(24), otherwise the creation of
+		 * further caches will BUG().
+		 */
+		cachep->array[smp_processor_id()] = &initarray_generic.cache;
+
+		/*
+		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
+		 * the first cache, then we need to set up all its list3s,
+		 * otherwise the creation of further caches will BUG().
+		 */
+		set_up_list3s(cachep, SIZE_AC);
+		if (INDEX_AC == INDEX_L3)
+			g_cpucache_up = PARTIAL_L3;
+		else
+			g_cpucache_up = PARTIAL_AC;
+	} else {
+		cachep->array[smp_processor_id()] =
+			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
+		if (g_cpucache_up == PARTIAL_AC) {
+			set_up_list3s(cachep, SIZE_L3);
+			g_cpucache_up = PARTIAL_L3;
+		} else {
+			int node;
+			for_each_online_node(node) {
+				cachep->nodelists[node] =
+				    kmalloc_node(sizeof(struct kmem_list3),
+						GFP_KERNEL, node);
+				BUG_ON(!cachep->nodelists[node]);
+				kmem_list3_init(cachep->nodelists[node]);
+			}
+		}
+	}
+	cachep->nodelists[numa_node_id()]->next_reap =
+			jiffies + REAPTIMEOUT_LIST3 +
+			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+	cpu_cache_get(cachep)->avail = 0;
+	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+	cpu_cache_get(cachep)->batchcount = 1;
+	cpu_cache_get(cachep)->touched = 0;
+	cachep->batchcount = 1;
+	cachep->limit = BOOT_CPUCACHE_ENTRIES;
+}
+
 /**
  * kmem_cache_create - Create a cache.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -2000,60 +2054,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->name = name;
 
 
-	if (g_cpucache_up == FULL) {
-		enable_cpucache(cachep);
-	} else {
-		if (g_cpucache_up == NONE) {
-			/* Note: the first kmem_cache_create must create
-			 * the cache that's used by kmalloc(24), otherwise
-			 * the creation of further caches will BUG().
-			 */
-			cachep->array[smp_processor_id()] =
-			    &initarray_generic.cache;
-
-			/* If the cache that's used by
-			 * kmalloc(sizeof(kmem_list3)) is the first cache,
-			 * then we need to set up all its list3s, otherwise
-			 * the creation of further caches will BUG().
-			 */
-			set_up_list3s(cachep, SIZE_AC);
-			if (INDEX_AC == INDEX_L3)
-				g_cpucache_up = PARTIAL_L3;
-			else
-				g_cpucache_up = PARTIAL_AC;
-		} else {
-			cachep->array[smp_processor_id()] =
-			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-
-			if (g_cpucache_up == PARTIAL_AC) {
-				set_up_list3s(cachep, SIZE_L3);
-				g_cpucache_up = PARTIAL_L3;
-			} else {
-				int node;
-				for_each_online_node(node) {
-
-					cachep->nodelists[node] =
-					    kmalloc_node(sizeof
-							 (struct kmem_list3),
-							 GFP_KERNEL, node);
-					BUG_ON(!cachep->nodelists[node]);
-					kmem_list3_init(cachep->
-							nodelists[node]);
-				}
-			}
-		}
-		cachep->nodelists[numa_node_id()]->next_reap =
-		    jiffies + REAPTIMEOUT_LIST3 +
-		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-		BUG_ON(!cpu_cache_get(cachep));
-		cpu_cache_get(cachep)->avail = 0;
-		cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-		cpu_cache_get(cachep)->batchcount = 1;
-		cpu_cache_get(cachep)->touched = 0;
-		cachep->batchcount = 1;
-		cachep->limit = BOOT_CPUCACHE_ENTRIES;
-	}
+	setup_cpu_cache(cachep);
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-- 
cgit v1.1


From a737b3e2fcf96f576fa3e2e382236d9ee94f383f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:11 -0800
Subject: [PATCH] slab cleanup

slab.c has become a bit revolting again.  Try to repair it.

- Coding style fixes

- Don't do assignments-in-if-statements.

- Don't typecast assignments to/from void*

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 596 ++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 304 insertions(+), 292 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 4d5c4b9..7b6f9f1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
  * The head array is strictly LIFO and should improve the cache hit rates.
  * On SMP, it additionally reduces the spinlock operations.
  *
- * The c_cpuarray may not be read with enabled local interrupts - 
+ * The c_cpuarray may not be read with enabled local interrupts -
  * it's changed with a smp_call_function().
  *
  * SMP synchronization:
@@ -266,16 +266,17 @@ struct array_cache {
 	unsigned int batchcount;
 	unsigned int touched;
 	spinlock_t lock;
-	void *entry[0];		/*
-				 * Must have this definition in here for the proper
-				 * alignment of array_cache. Also simplifies accessing
-				 * the entries.
-				 * [0] is for gcc 2.95. It should really be [].
-				 */
+	void *entry[0];	/*
+			 * Must have this definition in here for the proper
+			 * alignment of array_cache. Also simplifies accessing
+			 * the entries.
+			 * [0] is for gcc 2.95. It should really be [].
+			 */
 };
 
-/* bootstrap: The caches do not work without cpuarrays anymore,
- * but the cpuarrays are allocated from the generic caches...
+/*
+ * bootstrap: The caches do not work without cpuarrays anymore, but the
+ * cpuarrays are allocated from the generic caches...
  */
 #define BOOT_CPUCACHE_ENTRIES	1
 struct arraycache_init {
@@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 #define	SIZE_L3 (1 + MAX_NUMNODES)
 
 /*
- * This function must be completely optimized away if
- * a constant is passed to it. Mostly the same as
- * what is in linux/slab.h except it returns an
- * index.
+ * This function must be completely optimized away if a constant is passed to
+ * it.  Mostly the same as what is in linux/slab.h except it returns an index.
  */
 static __always_inline int index_of(const size_t size)
 {
@@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 	parent->free_touched = 0;
 }
 
-#define MAKE_LIST(cachep, listp, slab, nodeid)	\
-	do {	\
-		INIT_LIST_HEAD(listp);		\
-		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+#define MAKE_LIST(cachep, listp, slab, nodeid)				\
+	do {								\
+		INIT_LIST_HEAD(listp);					\
+		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
 	} while (0)
 
-#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
-	do {					\
+#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
+	do {								\
 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
@@ -379,8 +378,8 @@ struct kmem_cache {
 	unsigned int buffer_size;
 /* 2) touched by every alloc & free from the backend */
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
-	unsigned int flags;	/* constant flags */
-	unsigned int num;	/* # of objs per slab */
+	unsigned int flags;		/* constant flags */
+	unsigned int num;		/* # of objs per slab */
 	spinlock_t spinlock;
 
 /* 3) cache_grow/shrink */
@@ -390,11 +389,11 @@ struct kmem_cache {
 	/* force GFP flags, e.g. GFP_DMA */
 	gfp_t gfpflags;
 
-	size_t colour;		/* cache colouring range */
+	size_t colour;			/* cache colouring range */
 	unsigned int colour_off;	/* colour offset */
 	struct kmem_cache *slabp_cache;
 	unsigned int slab_size;
-	unsigned int dflags;	/* dynamic flags */
+	unsigned int dflags;		/* dynamic flags */
 
 	/* constructor func */
 	void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -438,8 +437,9 @@ struct kmem_cache {
 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
 
 #define BATCHREFILL_LIMIT	16
-/* Optimization question: fewer reaps means less 
- * probability for unnessary cpucache drain/refill cycles.
+/*
+ * Optimization question: fewer reaps means less probability for unnessary
+ * cpucache drain/refill cycles.
  *
  * OTOH the cpuarrays can contain lots of objects,
  * which could lock up otherwise freeable slabs.
@@ -453,17 +453,19 @@ struct kmem_cache {
 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
 #define	STATS_INC_GROWN(x)	((x)->grown++)
 #define	STATS_INC_REAPED(x)	((x)->reaped++)
-#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
-					(x)->high_mark = (x)->num_active; \
-				} while (0)
+#define	STATS_SET_HIGH(x)						\
+	do {								\
+		if ((x)->num_active > (x)->high_mark)			\
+			(x)->high_mark = (x)->num_active;		\
+	} while (0)
 #define	STATS_INC_ERR(x)	((x)->errors++)
 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
-#define	STATS_SET_FREEABLE(x, i) \
-				do { if ((x)->max_freeable < i) \
-					(x)->max_freeable = i; \
-				} while (0)
-
+#define	STATS_SET_FREEABLE(x, i)					\
+	do {								\
+		if ((x)->max_freeable < i)				\
+			(x)->max_freeable = i;				\
+	} while (0)
 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
@@ -478,9 +480,7 @@ struct kmem_cache {
 #define	STATS_INC_ERR(x)	do { } while (0)
 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
 #define	STATS_INC_NODEFREES(x)	do { } while (0)
-#define	STATS_SET_FREEABLE(x, i) \
-				do { } while (0)
-
+#define	STATS_SET_FREEABLE(x, i) do { } while (0)
 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
 #define STATS_INC_FREEHIT(x)	do { } while (0)
@@ -488,7 +488,8 @@ struct kmem_cache {
 #endif
 
 #if DEBUG
-/* Magic nums for obj red zoning.
+/*
+ * Magic nums for obj red zoning.
  * Placed in the first word before and the first word after an obj.
  */
 #define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
@@ -499,7 +500,8 @@ struct kmem_cache {
 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
 #define	POISON_END	0xa5	/* end-byte of poisoning */
 
-/* memory layout of objects:
+/*
+ * memory layout of objects:
  * 0		: objp
  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
  * 		the end of an object is aligned with the end of the real
@@ -508,7 +510,8 @@ struct kmem_cache {
  * 		redzone word.
  * cachep->obj_offset: The real object.
  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ *					[BYTES_PER_WORD long]
  */
 static int obj_offset(struct kmem_cache *cachep)
 {
@@ -552,8 +555,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #endif
 
 /*
- * Maximum size of an obj (in 2^order pages)
- * and absolute limit for the gfp order.
+ * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
+ * order.
  */
 #if defined(CONFIG_LARGE_ALLOCS)
 #define	MAX_OBJ_ORDER	13	/* up to 32Mb */
@@ -573,9 +576,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #define	BREAK_GFP_ORDER_LO	0
 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 
-/* Functions for storing/retrieving the cachep and or slab from the
- * global 'mem_map'. These are used to find the slab an obj belongs to.
- * With kfree(), these are used to find the cache which an obj belongs to.
+/*
+ * Functions for storing/retrieving the cachep and or slab from the page
+ * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
+ * these are used to find the cache which an obj belongs to.
  */
 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 {
@@ -621,7 +625,9 @@ static inline unsigned int obj_to_index(struct kmem_cache *cache,
 	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
 }
 
-/* These are the default caches for kmalloc. Custom caches can have other sizes. */
+/*
+ * These are the default caches for kmalloc. Custom caches can have other sizes.
+ */
 struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
 #include <linux/kmalloc_sizes.h>
@@ -667,8 +673,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
 /*
- * vm_enough_memory() looks at this to determine how many
- * slab-allocated pages are possibly freeable under pressure
+ * vm_enough_memory() looks at this to determine how many slab-allocated pages
+ * are possibly freeable under pressure
  *
  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
  */
@@ -687,7 +693,8 @@ static enum {
 
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
-static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len,
+			int node);
 static void enable_cpucache(struct kmem_cache *cachep);
 static void cache_reap(void *unused);
 static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -697,7 +704,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
-static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
+static inline struct kmem_cache *__find_general_cachep(size_t size,
+							gfp_t gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
@@ -732,8 +740,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 }
 
-/* Calculate the number of objects and left-over bytes for a given
-   buffer size. */
+/*
+ * Calculate the number of objects and left-over bytes for a given buffer size.
+ */
 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 			   size_t align, int flags, size_t *left_over,
 			   unsigned int *num)
@@ -794,7 +803,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 
 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 
-static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
+static void __slab_error(const char *function, struct kmem_cache *cachep,
+			char *msg)
 {
 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 	       function, cachep->name, msg);
@@ -918,10 +928,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
 
 	if (!ac_ptr)
 		return;
-
 	for_each_node(i)
 	    kfree(ac_ptr[i]);
-
 	kfree(ac_ptr);
 }
 
@@ -955,7 +963,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
 	}
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
+static void drain_alien_cache(struct kmem_cache *cachep,
+				struct array_cache **alien)
 {
 	int i = 0;
 	struct array_cache *ac;
@@ -998,20 +1007,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_UP_PREPARE:
 		mutex_lock(&cache_chain_mutex);
-		/* we need to do this right in the beginning since
+		/*
+		 * We need to do this right in the beginning since
 		 * alloc_arraycache's are going to use this list.
 		 * kmalloc_node allows us to add the slab to the right
 		 * kmem_list3 and not this cpu's kmem_list3
 		 */
 
 		list_for_each_entry(cachep, &cache_chain, next) {
-			/* setup the size64 kmemlist for cpu before we can
+			/*
+			 * Set up the size64 kmemlist for cpu before we can
 			 * begin anything. Make sure some other cpu on this
 			 * node has not already allocated this
 			 */
 			if (!cachep->nodelists[node]) {
-				if (!(l3 = kmalloc_node(memsize,
-							GFP_KERNEL, node)))
+				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+				if (!l3)
 					goto bad;
 				kmem_list3_init(l3);
 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1027,13 +1038,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
 			cachep->nodelists[node]->free_limit =
-			    (1 + nr_cpus_node(node)) *
-			    cachep->batchcount + cachep->num;
+				(1 + nr_cpus_node(node)) *
+				cachep->batchcount + cachep->num;
 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
 		}
 
-		/* Now we can go ahead with allocating the shared array's
-		   & array cache's */
+		/*
+		 * Now we can go ahead with allocating the shared arrays and
+		 * array caches
+		 */
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
@@ -1053,7 +1066,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			if (!alien)
 				goto bad;
 			cachep->array[cpu] = nc;
-
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
 
@@ -1073,7 +1085,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			}
 #endif
 			spin_unlock_irq(&l3->list_lock);
-
 			kfree(shared);
 			free_alien_cache(alien);
 		}
@@ -1095,7 +1106,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		/* fall thru */
 	case CPU_UP_CANCELED:
 		mutex_lock(&cache_chain_mutex);
-
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
@@ -1162,7 +1172,7 @@ free_array_cache:
 #endif
 	}
 	return NOTIFY_OK;
-      bad:
+bad:
 	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
@@ -1172,7 +1182,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
+static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+			int nodeid)
 {
 	struct kmem_list3 *ptr;
 
@@ -1187,8 +1198,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
 	local_irq_enable();
 }
 
-/* Initialisation.
- * Called after the gfp() functions have been enabled, and before smp_init().
+/*
+ * Initialisation.  Called after the page allocator has been initialised and
+ * before smp_init().
  */
 void __init kmem_cache_init(void)
 {
@@ -1213,9 +1225,9 @@ void __init kmem_cache_init(void)
 
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
-	 * 1) initialize the cache_cache cache: it contains the struct kmem_cache
-	 *    structures of all caches, except cache_cache itself: cache_cache
-	 *    is statically allocated.
+	 * 1) initialize the cache_cache cache: it contains the struct
+	 *    kmem_cache structures of all caches, except cache_cache itself:
+	 *    cache_cache is statically allocated.
 	 *    Initially an __init data area is used for the head array and the
 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
 	 *    array at the end of the bootstrap.
@@ -1238,7 +1250,8 @@ void __init kmem_cache_init(void)
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
 
-	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
+	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+					cache_line_size());
 
 	for (order = 0; order < MAX_ORDER; order++) {
 		cache_estimate(order, cache_cache.buffer_size,
@@ -1257,24 +1270,26 @@ void __init kmem_cache_init(void)
 	sizes = malloc_sizes;
 	names = cache_names;
 
-	/* Initialize the caches that provide memory for the array cache
-	 * and the kmem_list3 structures first.
-	 * Without this, further allocations will bug
+	/*
+	 * Initialize the caches that provide memory for the array cache and the
+	 * kmem_list3 structures first.  Without this, further allocations will
+	 * bug.
 	 */
 
 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
-						      sizes[INDEX_AC].cs_size,
-						      ARCH_KMALLOC_MINALIGN,
-						      (ARCH_KMALLOC_FLAGS |
-						       SLAB_PANIC), NULL, NULL);
+					sizes[INDEX_AC].cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					NULL, NULL);
 
-	if (INDEX_AC != INDEX_L3)
+	if (INDEX_AC != INDEX_L3) {
 		sizes[INDEX_L3].cs_cachep =
-		    kmem_cache_create(names[INDEX_L3].name,
-				      sizes[INDEX_L3].cs_size,
-				      ARCH_KMALLOC_MINALIGN,
-				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
-				      NULL);
+			kmem_cache_create(names[INDEX_L3].name,
+				sizes[INDEX_L3].cs_size,
+				ARCH_KMALLOC_MINALIGN,
+				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+				NULL, NULL);
+	}
 
 	while (sizes->cs_size != ULONG_MAX) {
 		/*
@@ -1284,13 +1299,13 @@ void __init kmem_cache_init(void)
 		 * Note for systems short on memory removing the alignment will
 		 * allow tighter packing of the smaller caches.
 		 */
-		if (!sizes->cs_cachep)
+		if (!sizes->cs_cachep) {
 			sizes->cs_cachep = kmem_cache_create(names->name,
-							     sizes->cs_size,
-							     ARCH_KMALLOC_MINALIGN,
-							     (ARCH_KMALLOC_FLAGS
-							      | SLAB_PANIC),
-							     NULL, NULL);
+					sizes->cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					NULL, NULL);
+		}
 
 		/* Inc off-slab bufctl limit until the ceiling is hit. */
 		if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1299,13 +1314,11 @@ void __init kmem_cache_init(void)
 		}
 
 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
-							sizes->cs_size,
-							ARCH_KMALLOC_MINALIGN,
-							(ARCH_KMALLOC_FLAGS |
-							 SLAB_CACHE_DMA |
-							 SLAB_PANIC), NULL,
-							NULL);
-
+					sizes->cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
+						SLAB_PANIC,
+					NULL, NULL);
 		sizes++;
 		names++;
 	}
@@ -1357,20 +1370,22 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-		    enable_cpucache(cachep);
+			enable_cpucache(cachep);
 		mutex_unlock(&cache_chain_mutex);
 	}
 
 	/* Done! */
 	g_cpucache_up = FULL;
 
-	/* Register a cpu startup notifier callback
-	 * that initializes cpu_cache_get for all new cpus
+	/*
+	 * Register a cpu startup notifier callback that initializes
+	 * cpu_cache_get for all new cpus
 	 */
 	register_cpu_notifier(&cpucache_notifier);
 
-	/* The reap timers are started later, with a module init call:
-	 * That part of the kernel is not yet operational.
+	/*
+	 * The reap timers are started later, with a module init call: That part
+	 * of the kernel is not yet operational.
 	 */
 }
 
@@ -1378,16 +1393,13 @@ static int __init cpucache_init(void)
 {
 	int cpu;
 
-	/* 
-	 * Register the timers that return unneeded
-	 * pages to gfp.
+	/*
+	 * Register the timers that return unneeded pages to the page allocator
 	 */
 	for_each_online_cpu(cpu)
-	    start_cpu_timer(cpu);
-
+		start_cpu_timer(cpu);
 	return 0;
 }
-
 __initcall(cpucache_init);
 
 /*
@@ -1501,9 +1513,8 @@ static void dump_line(char *data, int offset, int limit)
 {
 	int i;
 	printk(KERN_ERR "%03x:", offset);
-	for (i = 0; i < limit; i++) {
+	for (i = 0; i < limit; i++)
 		printk(" %02x", (unsigned char)data[offset + i]);
-	}
 	printk("\n");
 }
 #endif
@@ -1517,15 +1528,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
 
 	if (cachep->flags & SLAB_RED_ZONE) {
 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
-		       *dbg_redzone1(cachep, objp),
-		       *dbg_redzone2(cachep, objp));
+			*dbg_redzone1(cachep, objp),
+			*dbg_redzone2(cachep, objp));
 	}
 
 	if (cachep->flags & SLAB_STORE_USER) {
 		printk(KERN_ERR "Last user: [<%p>]",
-		       *dbg_userword(cachep, objp));
+			*dbg_userword(cachep, objp));
 		print_symbol("(%s)",
-			     (unsigned long)*dbg_userword(cachep, objp));
+				(unsigned long)*dbg_userword(cachep, objp));
 		printk("\n");
 	}
 	realobj = (char *)objp + obj_offset(cachep);
@@ -1558,8 +1569,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 			/* Print header */
 			if (lines == 0) {
 				printk(KERN_ERR
-				       "Slab corruption: start=%p, len=%d\n",
-				       realobj, size);
+					"Slab corruption: start=%p, len=%d\n",
+					realobj, size);
 				print_objinfo(cachep, objp, 0);
 			}
 			/* Hexdump the affected line */
@@ -1614,11 +1625,10 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 
 		if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-			if ((cachep->buffer_size % PAGE_SIZE) == 0
-			    && OFF_SLAB(cachep))
+			if (cachep->buffer_size % PAGE_SIZE == 0 &&
+					OFF_SLAB(cachep))
 				kernel_map_pages(virt_to_page(objp),
-						 cachep->buffer_size / PAGE_SIZE,
-						 1);
+					cachep->buffer_size / PAGE_SIZE, 1);
 			else
 				check_poison_obj(cachep, objp);
 #else
@@ -1650,10 +1660,10 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 }
 #endif
 
-/**
+/*
  * Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache.
- * The cache-lock is not held/needed.
+ * Before calling the slab must have been unlinked from the cache.  The
+ * cache-lock is not held/needed.
  */
 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 {
@@ -1674,8 +1684,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 	}
 }
 
-/* For setting up all the kmem_list3s for cache whose buffer_size is same
-   as size of kmem_list3. */
+/*
+ * Set up all the kmem_list3s for a cache whose buffer_size is the same as
+ * the size of kmem_list3.
+ */
 static void set_up_list3s(struct kmem_cache *cachep, int index)
 {
 	int node;
@@ -1701,13 +1713,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
  * high order pages for slabs.  When the gfp() functions are more friendly
  * towards high-order requests, this should be changed.
  */
-static inline size_t calculate_slab_order(struct kmem_cache *cachep,
+static size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
 	int gfporder;
 
-	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
+	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
@@ -1742,7 +1754,7 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 		/*
 		 * Acceptable internal fragmentation?
 		 */
-		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
+		if (left_over * 8 <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
@@ -1817,9 +1829,8 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  * and the @dtor is run before the pages are handed back.
  *
  * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting 
- * unloaded.
- * 
+ * the module calling this has to destroy the cache before getting unloaded.
+ *
  * The flags are
  *
  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1837,7 +1848,8 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  */
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-	unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
+	unsigned long flags,
+	void (*ctor)(void*, struct kmem_cache *, unsigned long),
 	void (*dtor)(void*, struct kmem_cache *, unsigned long))
 {
 	size_t left_over, slab_size, ralign;
@@ -1847,12 +1859,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	/*
 	 * Sanity checks... these are all serious usage bugs.
 	 */
-	if ((!name) ||
-	    in_interrupt() ||
-	    (size < BYTES_PER_WORD) ||
+	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
 	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
-		printk(KERN_ERR "%s: Early error in slab %s\n",
-		       __FUNCTION__, name);
+		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
+				name);
 		BUG();
 	}
 
@@ -1906,8 +1916,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * above the next power of two: caches with object sizes just above a
 	 * power of two have a significant amount of internal fragmentation.
 	 */
-	if ((size < 4096
-	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
+	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
 	if (!(flags & SLAB_DESTROY_BY_RCU))
 		flags |= SLAB_POISON;
@@ -1919,13 +1928,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG_ON(dtor);
 
 	/*
-	 * Always checks flags, a caller might be expecting debug
-	 * support which isn't available.
+	 * Always check flags; a caller might be expecting debug support which
+	 * isn't available.
 	 */
 	if (flags & ~CREATE_MASK)
 		BUG();
 
-	/* Check that size is in terms of words.  This is needed to avoid
+	/*
+	 * Check that size is in terms of words.  This is needed to avoid
 	 * unaligned accesses for some archs when redzoning is used, and makes
 	 * sure any on-slab bufctl's are also correctly aligned.
 	 */
@@ -1934,12 +1944,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		size &= ~(BYTES_PER_WORD - 1);
 	}
 
-	/* calculate out the final buffer alignment: */
+	/* calculate the final buffer alignment: */
+
 	/* 1) arch recommendation: can be overridden for debug */
 	if (flags & SLAB_HWCACHE_ALIGN) {
-		/* Default alignment: as specified by the arch code.
-		 * Except if an object is really small, then squeeze multiple
-		 * objects into one cacheline.
+		/*
+		 * Default alignment: as specified by the arch code.  Except if
+		 * an object is really small, then squeeze multiple objects into
+		 * one cacheline.
 		 */
 		ralign = cache_line_size();
 		while (size <= ralign / 2)
@@ -1959,7 +1971,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		if (ralign > BYTES_PER_WORD)
 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
-	/* 4) Store it. Note that the debug code below can reduce
+	/*
+	 * 4) Store it. Note that the debug code below can reduce
 	 *    the alignment to BYTES_PER_WORD.
 	 */
 	align = ralign;
@@ -2058,7 +2071,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-      oops:
+oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
@@ -2109,7 +2122,6 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
 {
 	check_irq_on();
 	preempt_disable();
-
 	local_irq_disable();
 	func(arg);
 	local_irq_enable();
@@ -2120,12 +2132,12 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
 	preempt_enable();
 }
 
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
-				int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep,
+			struct array_cache *ac, int force, int node);
 
 static void do_drain(void *arg)
 {
-	struct kmem_cache *cachep = (struct kmem_cache *) arg;
+	struct kmem_cache *cachep = arg;
 	struct array_cache *ac;
 	int node = numa_node_id();
 
@@ -2273,16 +2285,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
 
 	/* NUMA: free the list3 structures */
 	for_each_online_node(i) {
-		if ((l3 = cachep->nodelists[i])) {
+		l3 = cachep->nodelists[i];
+		if (l3) {
 			kfree(l3->shared);
 			free_alien_cache(l3->alien);
 			kfree(l3);
 		}
 	}
 	kmem_cache_free(&cache_cache, cachep);
-
 	unlock_cpu_hotplug();
-
 	return 0;
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2305,7 +2316,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
 	slabp->s_mem = objp + colour_off;
-
 	return slabp;
 }
 
@@ -2333,9 +2343,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
 		}
 		/*
-		 * Constructors are not allowed to allocate memory from
-		 * the same cache which they are a constructor for.
-		 * Otherwise, deadlock. They must also be threaded.
+		 * Constructors are not allowed to allocate memory from the same
+		 * cache which they are a constructor for.  Otherwise, deadlock.
+		 * They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
 			cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2349,8 +2359,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
 				slab_error(cachep, "constructor overwrote the"
 					   " start of an object");
 		}
-		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
-		    && cachep->flags & SLAB_POISON)
+		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
+			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
 			kernel_map_pages(virt_to_page(objp),
 					 cachep->buffer_size / PAGE_SIZE, 0);
 #else
@@ -2365,16 +2375,14 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (flags & SLAB_DMA) {
-		if (!(cachep->gfpflags & GFP_DMA))
-			BUG();
-	} else {
-		if (cachep->gfpflags & GFP_DMA)
-			BUG();
-	}
+	if (flags & SLAB_DMA)
+		BUG_ON(!(cachep->gfpflags & GFP_DMA));
+	else
+		BUG_ON(cachep->gfpflags & GFP_DMA);
 }
 
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+				int nodeid)
 {
 	void *objp = index_to_obj(cachep, slabp, slabp->free);
 	kmem_bufctl_t next;
@@ -2390,8 +2398,8 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
 	return objp;
 }
 
-static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
-			  int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+				void *objp, int nodeid)
 {
 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
 
@@ -2401,7 +2409,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
 
 	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
 		printk(KERN_ERR "slab: double free detected in cache "
-		       "'%s', objp %p\n", cachep->name, objp);
+				"'%s', objp %p\n", cachep->name, objp);
 		BUG();
 	}
 #endif
@@ -2410,7 +2418,8 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
 	slabp->inuse--;
 }
 
-static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
+static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
+			void *objp)
 {
 	int i;
 	struct page *page;
@@ -2438,8 +2447,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	unsigned long ctor_flags;
 	struct kmem_list3 *l3;
 
-	/* Be lazy and only check for valid flags here,
-	 * keeping it out of the critical path in kmem_cache_alloc().
+	/*
+	 * Be lazy and only check for valid flags here, keeping it out of the
+	 * critical path in kmem_cache_alloc().
 	 */
 	if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
 		BUG();
@@ -2480,14 +2490,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 */
 	kmem_flagcheck(cachep, flags);
 
-	/* Get mem for the objs.
-	 * Attempt to allocate a physical page from 'nodeid',
+	/*
+	 * Get mem for the objs.  Attempt to allocate a physical page from
+	 * 'nodeid'.
 	 */
-	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
+	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
 		goto failed;
 
 	/* Get slab management. */
-	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
+	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
+	if (!slabp)
 		goto opps1;
 
 	slabp->nodeid = nodeid;
@@ -2506,9 +2519,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	l3->free_objects += cachep->num;
 	spin_unlock(&l3->list_lock);
 	return 1;
-      opps1:
+opps1:
 	kmem_freepages(cachep, objp);
-      failed:
+failed:
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	return 0;
@@ -2551,8 +2564,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	page = virt_to_page(objp);
 
 	if (page_get_cache(page) != cachep) {
-		printk(KERN_ERR
-		       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+		printk(KERN_ERR "mismatch in kmem_cache_free: expected "
+				"cache %p, got %p\n",
 		       page_get_cache(page), cachep);
 		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
 		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2562,13 +2575,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	slabp = page_get_slab(page);
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
-		    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
-			slab_error(cachep,
-				   "double free, or memory outside"
-				   " object was overwritten");
-			printk(KERN_ERR
-			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
+				*dbg_redzone2(cachep, objp) != RED_ACTIVE) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
+			printk(KERN_ERR "%p: redzone 1:0x%lx, "
+					"redzone 2:0x%lx.\n",
 			       objp, *dbg_redzone1(cachep, objp),
 			       *dbg_redzone2(cachep, objp));
 		}
@@ -2584,9 +2596,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
 
 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
-		/* Need to call the slab's constructor so the
-		 * caller can perform a verify of its state (debugging).
-		 * Called without the cache-lock held.
+		/*
+		 * Need to call the slab's constructor so the caller can
+		 * perform a verify of its state (debugging).  Called without
+		 * the cache-lock held.
 		 */
 		cachep->ctor(objp + obj_offset(cachep),
 			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2599,7 +2612,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	}
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
 			store_stackinfo(cachep, objp, (unsigned long)caller);
 			kernel_map_pages(virt_to_page(objp),
 					 cachep->buffer_size / PAGE_SIZE, 0);
@@ -2625,14 +2638,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 			goto bad;
 	}
 	if (entries != cachep->num - slabp->inuse) {
-	      bad:
-		printk(KERN_ERR
-		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-		       cachep->name, cachep->num, slabp, slabp->inuse);
+bad:
+		printk(KERN_ERR "slab: Internal list corruption detected in "
+				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+			cachep->name, cachep->num, slabp, slabp->inuse);
 		for (i = 0;
 		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 		     i++) {
-			if ((i % 16) == 0)
+			if (i % 16 == 0)
 				printk("\n%03x:", i);
 			printk(" %02x", ((unsigned char *)slabp)[i]);
 		}
@@ -2654,12 +2667,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
-      retry:
+retry:
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
-		/* if there was little recent activity on this
-		 * cache, then perform only a partial refill.
-		 * Otherwise we could generate refill bouncing.
+		/*
+		 * If there was little recent activity on this cache, then
+		 * perform only a partial refill.  Otherwise we could generate
+		 * refill bouncing.
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
@@ -2715,29 +2729,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 			list_add(&slabp->list, &l3->slabs_partial);
 	}
 
-      must_grow:
+must_grow:
 	l3->free_objects -= ac->avail;
-      alloc_done:
+alloc_done:
 	spin_unlock(&l3->list_lock);
 
 	if (unlikely(!ac->avail)) {
 		int x;
 		x = cache_grow(cachep, flags, numa_node_id());
 
-		// cache_grow can reenable interrupts, then ac could change.
+		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
-		if (!x && ac->avail == 0)	// no objects in sight? abort
+		if (!x && ac->avail == 0)	/* no objects in sight? abort */
 			return NULL;
 
-		if (!ac->avail)	// objects refilled by interrupt?
+		if (!ac->avail)		/* objects refilled by interrupt? */
 			goto retry;
 	}
 	ac->touched = 1;
 	return ac->entry[--ac->avail];
 }
 
-static inline void
-cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
+static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
+						gfp_t flags)
 {
 	might_sleep_if(flags & __GFP_WAIT);
 #if DEBUG
@@ -2746,8 +2760,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
 }
 
 #if DEBUG
-static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
-					void *objp, void *caller)
+static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
+				gfp_t flags, void *objp, void *caller)
 {
 	if (!objp)
 		return objp;
@@ -2767,15 +2781,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
 		*dbg_userword(cachep, objp) = caller;
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
-		    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
-			slab_error(cachep,
-				   "double free, or memory outside"
-				   " object was overwritten");
+		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
+				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
 			printk(KERN_ERR
-			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
-			       objp, *dbg_redzone1(cachep, objp),
-			       *dbg_redzone2(cachep, objp));
+				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
+				objp, *dbg_redzone1(cachep, objp),
+				*dbg_redzone2(cachep, objp));
 		}
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2822,8 +2835,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	return objp;
 }
 
-static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
+						gfp_t flags, void *caller)
 {
 	unsigned long save_flags;
 	void *objp;
@@ -2843,7 +2856,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 /*
  * A interface to enable slab creation on nodeid
  */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+				int nodeid)
 {
 	struct list_head *entry;
 	struct slab *slabp;
@@ -2854,7 +2868,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 	l3 = cachep->nodelists[nodeid];
 	BUG_ON(!l3);
 
-      retry:
+retry:
 	check_irq_off();
 	spin_lock(&l3->list_lock);
 	entry = l3->slabs_partial.next;
@@ -2881,16 +2895,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 	/* move slabp to correct slabp list: */
 	list_del(&slabp->list);
 
-	if (slabp->free == BUFCTL_END) {
+	if (slabp->free == BUFCTL_END)
 		list_add(&slabp->list, &l3->slabs_full);
-	} else {
+	else
 		list_add(&slabp->list, &l3->slabs_partial);
-	}
 
 	spin_unlock(&l3->list_lock);
 	goto done;
 
-      must_grow:
+must_grow:
 	spin_unlock(&l3->list_lock);
 	x = cache_grow(cachep, flags, nodeid);
 
@@ -2898,7 +2911,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 		return NULL;
 
 	goto retry;
-      done:
+done:
 	return obj;
 }
 #endif
@@ -2971,7 +2984,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 	}
 
 	free_block(cachep, ac->entry, batchcount, node);
-      free_done:
+free_done:
 #if STATS
 	{
 		int i = 0;
@@ -2992,16 +3005,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 #endif
 	spin_unlock(&l3->list_lock);
 	ac->avail -= batchcount;
-	memmove(ac->entry, &(ac->entry[batchcount]),
-		sizeof(void *) * ac->avail);
+	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
 }
 
 /*
- * __cache_free
- * Release an obj back to its cache. If the obj has a constructed
- * state, it must be in this state _before_ it is released.
- *
- * Called with disabled ints.
+ * Release an obj back to its cache. If the obj has a constructed state, it must
+ * be in this state _before_ it is released.  Called with disabled ints.
  */
 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 {
@@ -3020,9 +3029,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 		if (unlikely(slabp->nodeid != numa_node_id())) {
 			struct array_cache *alien = NULL;
 			int nodeid = slabp->nodeid;
-			struct kmem_list3 *l3 =
-			    cachep->nodelists[numa_node_id()];
+			struct kmem_list3 *l3;
 
+			l3 = cachep->nodelists[numa_node_id()];
 			STATS_INC_NODEFREES(cachep);
 			if (l3->alien && l3->alien[nodeid]) {
 				alien = l3->alien[nodeid];
@@ -3106,7 +3115,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
 	if (unlikely(page_get_cache(page) != cachep))
 		goto out;
 	return 1;
-      out:
+out:
 	return 0;
 }
 
@@ -3132,7 +3141,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	local_irq_save(save_flags);
 
 	if (nodeid == -1 || nodeid == numa_node_id() ||
-	    !cachep->nodelists[nodeid])
+			!cachep->nodelists[nodeid])
 		ptr = ____cache_alloc(cachep, flags);
 	else
 		ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3249,7 +3258,7 @@ void *__alloc_percpu(size_t size)
 	/* Catch derefs w/o wrappers */
 	return (void *)(~(unsigned long)pdata);
 
-      unwind_oom:
+unwind_oom:
 	while (--i >= 0) {
 		if (!cpu_possible(i))
 			continue;
@@ -3352,18 +3361,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		struct array_cache *nc = NULL, *new;
 		struct array_cache **new_alien = NULL;
 #ifdef CONFIG_NUMA
-		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
+		new_alien = alloc_alien_cache(node, cachep->limit);
+		if (!new_alien)
 			goto fail;
 #endif
-		if (!(new = alloc_arraycache(node, (cachep->shared *
-						    cachep->batchcount),
-					     0xbaadf00d)))
+		new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
+					0xbaadf00d);
+		if (!new)
 			goto fail;
-		if ((l3 = cachep->nodelists[node])) {
-
+		l3 = cachep->nodelists[node];
+		if (l3) {
 			spin_lock_irq(&l3->list_lock);
 
-			if ((nc = cachep->nodelists[node]->shared))
+			nc = cachep->nodelists[node]->shared;
+			if (nc)
 				free_block(cachep, nc->entry, nc->avail, node);
 
 			l3->shared = new;
@@ -3372,27 +3383,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 				new_alien = NULL;
 			}
 			l3->free_limit = (1 + nr_cpus_node(node)) *
-			    cachep->batchcount + cachep->num;
+					cachep->batchcount + cachep->num;
 			spin_unlock_irq(&l3->list_lock);
 			kfree(nc);
 			free_alien_cache(new_alien);
 			continue;
 		}
-		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
-					GFP_KERNEL, node)))
+		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+		if (!l3)
 			goto fail;
 
 		kmem_list3_init(l3);
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 		l3->shared = new;
 		l3->alien = new_alien;
 		l3->free_limit = (1 + nr_cpus_node(node)) *
-		    cachep->batchcount + cachep->num;
+					cachep->batchcount + cachep->num;
 		cachep->nodelists[node] = l3;
 	}
 	return err;
-      fail:
+fail:
 	err = -ENOMEM;
 	return err;
 }
@@ -3404,7 +3415,7 @@ struct ccupdate_struct {
 
 static void do_ccupdate_local(void *info)
 {
-	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
+	struct ccupdate_struct *new = info;
 	struct array_cache *old;
 
 	check_irq_off();
@@ -3414,16 +3425,16 @@ static void do_ccupdate_local(void *info)
 	new->new[smp_processor_id()] = old;
 }
 
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
-			    int shared)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+				int batchcount, int shared)
 {
 	struct ccupdate_struct new;
 	int i, err;
 
 	memset(&new.new, 0, sizeof(new.new));
 	for_each_online_cpu(i) {
-		new.new[i] =
-		    alloc_arraycache(cpu_to_node(i), limit, batchcount);
+		new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
+						batchcount);
 		if (!new.new[i]) {
 			for (i--; i >= 0; i--)
 				kfree(new.new[i]);
@@ -3465,10 +3476,11 @@ static void enable_cpucache(struct kmem_cache *cachep)
 	int err;
 	int limit, shared;
 
-	/* The head array serves three purposes:
+	/*
+	 * The head array serves three purposes:
 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
 	 * - reduce the number of spinlock operations.
-	 * - reduce the number of linked list operations on the slab and 
+	 * - reduce the number of linked list operations on the slab and
 	 *   bufctl chains: array operations are cheaper.
 	 * The numbers are guessed, we should auto-tune as described by
 	 * Bonwick.
@@ -3484,7 +3496,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
 	else
 		limit = 120;
 
-	/* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
+	/*
+	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
 	 * allocation behaviour: Most allocs on one cpu, most free operations
 	 * on another cpu. For these cases, an efficient object passing between
 	 * cpus is necessary. This is provided by a shared array. The array
@@ -3499,9 +3512,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
 #endif
 
 #if DEBUG
-	/* With debugging enabled, large batchcount lead to excessively
-	 * long periods with disabled local interrupts. Limit the 
-	 * batchcount
+	/*
+	 * With debugging enabled, large batchcount lead to excessively long
+	 * periods with disabled local interrupts. Limit the batchcount
 	 */
 	if (limit > 32)
 		limit = 32;
@@ -3512,8 +3525,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
 		       cachep->name, -err);
 }
 
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
-				int force, int node)
+static void drain_array_locked(struct kmem_cache *cachep,
+				struct array_cache *ac, int force, int node)
 {
 	int tofree;
 
@@ -3522,9 +3535,8 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
 		ac->touched = 0;
 	} else if (ac->avail) {
 		tofree = force ? ac->avail : (ac->limit + 4) / 5;
-		if (tofree > ac->avail) {
+		if (tofree > ac->avail)
 			tofree = (ac->avail + 1) / 2;
-		}
 		free_block(cachep, ac->entry, tofree, node);
 		ac->avail -= tofree;
 		memmove(ac->entry, &(ac->entry[tofree]),
@@ -3541,8 +3553,8 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
  * - clear the per-cpu caches for this CPU.
  * - return freeable pages to the main free memory pool.
  *
- * If we cannot acquire the cache chain mutex then just give up - we'll
- * try again on the next iteration.
+ * If we cannot acquire the cache chain mutex then just give up - we'll try
+ * again on the next iteration.
  */
 static void cache_reap(void *unused)
 {
@@ -3590,9 +3602,8 @@ static void cache_reap(void *unused)
 			goto next_unlock;
 		}
 
-		tofree =
-		    (l3->free_limit + 5 * searchp->num -
-		     1) / (5 * searchp->num);
+		tofree = (l3->free_limit + 5 * searchp->num - 1) /
+				(5 * searchp->num);
 		do {
 			p = l3->slabs_free.next;
 			if (p == &(l3->slabs_free))
@@ -3603,9 +3614,9 @@ static void cache_reap(void *unused)
 			list_del(&slabp->list);
 			STATS_INC_REAPED(searchp);
 
-			/* Safe to drop the lock. The slab is no longer
-			 * linked to the cache.
-			 * searchp cannot disappear, we hold
+			/*
+			 * Safe to drop the lock. The slab is no longer linked
+			 * to the cache. searchp cannot disappear, we hold
 			 * cache_chain_lock
 			 */
 			l3->free_objects -= searchp->num;
@@ -3613,15 +3624,15 @@ static void cache_reap(void *unused)
 			slab_destroy(searchp, slabp);
 			spin_lock_irq(&l3->list_lock);
 		} while (--tofree > 0);
-	      next_unlock:
+next_unlock:
 		spin_unlock_irq(&l3->list_lock);
-	      next:
+next:
 		cond_resched();
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
 	next_reap_node();
-	/* Setup the next iteration */
+	/* Set up the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
 
@@ -3671,8 +3682,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 {
 	struct kmem_cache *cachep = p;
 	++*pos;
-	return cachep->next.next == &cache_chain ? NULL
-	    : list_entry(cachep->next.next, struct kmem_cache, next);
+	return cachep->next.next == &cache_chain ?
+		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
 }
 
 static void s_stop(struct seq_file *m, void *p)
@@ -3761,7 +3772,9 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long node_frees = cachep->node_frees;
 
 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
+				%4lu %4lu %4lu %4lu", allocs, high, grown,
+				reaped, errors, max_freeable, node_allocs,
+				node_frees);
 	}
 	/* cpu stats */
 	{
@@ -3833,13 +3846,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 	mutex_lock(&cache_chain_mutex);
 	res = -EINVAL;
 	list_for_each(p, &cache_chain) {
-		struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
-						       next);
+		struct kmem_cache *cachep;
 
+		cachep = list_entry(p, struct kmem_cache, next);
 		if (!strcmp(cachep->name, kbuf)) {
-			if (limit < 1 ||
-			    batchcount < 1 ||
-			    batchcount > limit || shared < 0) {
+			if (limit < 1 || batchcount < 1 ||
+					batchcount > limit || shared < 0) {
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-- 
cgit v1.1


From b5d8ca7c50826c0b456b4a646875dc573adfde2b Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Wed, 22 Mar 2006 00:08:12 -0800
Subject: [PATCH] slab: remove cachep->spinlock

Remove cachep->spinlock.  Locking has moved to the kmem_list3 and most of
the structures protected earlier by cachep->spinlock are now protected by
the l3->list_lock.  Slab cache tunables like batchcount are always accessed
with the cache_chain_mutex held.

Patch tested on SMP and NUMA kernels with dbench processes running,
constant onlining/offlining, and constant cache tuning, all at the same
time.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <christoph@lameter.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 7b6f9f1..2cd8020 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -372,17 +372,19 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 struct kmem_cache {
 /* 1) per-cpu data, touched during every alloc/free */
 	struct array_cache *array[NR_CPUS];
+/* 2) Cache tunables. Protected by cache_chain_mutex */
 	unsigned int batchcount;
 	unsigned int limit;
 	unsigned int shared;
+
 	unsigned int buffer_size;
-/* 2) touched by every alloc & free from the backend */
+/* 3) touched by every alloc & free from the backend */
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
+
 	unsigned int flags;		/* constant flags */
 	unsigned int num;		/* # of objs per slab */
-	spinlock_t spinlock;
 
-/* 3) cache_grow/shrink */
+/* 4) cache_grow/shrink */
 	/* order of pgs per slab (2^n) */
 	unsigned int gfporder;
 
@@ -401,11 +403,11 @@ struct kmem_cache {
 	/* de-constructor func */
 	void (*dtor) (void *, struct kmem_cache *, unsigned long);
 
-/* 4) cache creation/removal */
+/* 5) cache creation/removal */
 	const char *name;
 	struct list_head next;
 
-/* 5) statistics */
+/* 6) statistics */
 #if STATS
 	unsigned long num_active;
 	unsigned long num_allocations;
@@ -661,7 +663,6 @@ static struct kmem_cache cache_cache = {
 	.shared = 1,
 	.buffer_size = sizeof(struct kmem_cache),
 	.flags = SLAB_NO_REAP,
-	.spinlock = SPIN_LOCK_UNLOCKED,
 	.name = "kmem_cache",
 #if DEBUG
 	.obj_size = sizeof(struct kmem_cache),
@@ -2057,7 +2058,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->gfpflags = 0;
 	if (flags & SLAB_CACHE_DMA)
 		cachep->gfpflags |= GFP_DMA;
-	spin_lock_init(&cachep->spinlock);
 	cachep->buffer_size = size;
 
 	if (flags & CFLGS_OFF_SLAB)
@@ -3425,6 +3425,7 @@ static void do_ccupdate_local(void *info)
 	new->new[smp_processor_id()] = old;
 }
 
+/* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared)
 {
@@ -3446,11 +3447,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
 
 	check_irq_on();
-	spin_lock(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
 	cachep->shared = shared;
-	spin_unlock(&cachep->spinlock);
 
 	for_each_online_cpu(i) {
 		struct array_cache *ccold = new.new[i];
@@ -3471,6 +3470,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	return 0;
 }
 
+/* Called with cache_chain_mutex held always */
 static void enable_cpucache(struct kmem_cache *cachep)
 {
 	int err;
@@ -3705,7 +3705,6 @@ static int s_show(struct seq_file *m, void *p)
 	int node;
 	struct kmem_list3 *l3;
 
-	spin_lock(&cachep->spinlock);
 	active_objs = 0;
 	num_slabs = 0;
 	for_each_online_node(node) {
@@ -3788,7 +3787,6 @@ static int s_show(struct seq_file *m, void *p)
 	}
 #endif
 	seq_putc(m, '\n');
-	spin_unlock(&cachep->spinlock);
 	return 0;
 }
 
-- 
cgit v1.1


From fcc234f888ba2365c44ba0507eb8a18eebf1f594 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 22 Mar 2006 00:08:13 -0800
Subject: [PATCH] mm: kill kmem_cache_t usage

We have struct kmem_cache now so use it instead of the old typedef.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 4 ++--
 mm/mempool.c   | 4 ++--
 mm/rmap.c      | 5 +++--
 mm/shmem.c     | 5 +++--
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a..96195dc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -98,8 +98,8 @@
 /* The number of pages to migrate per call to migrate_pages() */
 #define MIGRATE_CHUNK_SIZE 256
 
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
 
 #define PDprintk(fmt...)
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80..f71893e 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free);
  */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
-	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	struct kmem_cache *mem = pool_data;
 	return kmem_cache_alloc(mem, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_alloc_slab);
 
 void mempool_free_slab(void *element, void *pool_data)
 {
-	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	struct kmem_cache *mem = pool_data;
 	kmem_cache_free(mem, element);
 }
 EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20..134aef9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -58,7 +58,7 @@
 
 //#define RMAP_DEBUG /* can be enabled only for debugging */
 
-kmem_cache_t *anon_vma_cachep;
+struct kmem_cache *anon_vma_cachep;
 
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
 {
@@ -166,7 +166,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		anon_vma_free(anon_vma);
 }
 
-static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
+			  unsigned long flags)
 {
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 						SLAB_CTOR_CONSTRUCTOR) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fb..f523a15 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2119,7 +2119,7 @@ failed:
 	return err;
 }
 
-static kmem_cache_t *shmem_inode_cachep;
+static struct kmem_cache *shmem_inode_cachep;
 
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep,
+		      unsigned long flags)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 
-- 
cgit v1.1


From 911851e6ee6ac4e26f07be342a89632f78494fef Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 22 Mar 2006 00:08:14 -0800
Subject: [PATCH] slab: fix kernel-doc warnings

Fix kernel-doc warnings in mm/slab.c.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 2cd8020..5c25749 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1615,8 +1615,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 
 #if DEBUG
 /**
- * slab_destroy_objs - call the registered destructor for each object in
- *      a slab that is to be destroyed.
+ * slab_destroy_objs - destroy a slab and its objects
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
+ * Call the registered destructor for each object in a slab that is being
+ * destroyed.
  */
 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
@@ -1661,7 +1665,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 }
 #endif
 
-/*
+/**
+ * slab_destroy - destroy and release all objects in a slab
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
  * Destroy all the objs in a slab, and release the mem back to the system.
  * Before calling the slab must have been unlinked from the cache.  The
  * cache-lock is not held/needed.
@@ -3170,6 +3178,7 @@ EXPORT_SYMBOL(kmalloc_node);
  * kmalloc - allocate memory
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
+ * @caller: function caller for debug tracking of the caller
  *
  * kmalloc is the normal method of allocating memory
  * in the kernel.
-- 
cgit v1.1


From ac2b898ca6fb06196a26869c23b66afe7944e52e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:15 -0800
Subject: [PATCH] slab: Remove SLAB_NO_REAP option

SLAB_NO_REAP is documented as an option that will cause this slab not to be
reaped under memory pressure.  However, that is not what happens.  The only
thing that SLAB_NO_REAP controls at the moment is the reclaim of the unused
slab elements that were allocated in batch in cache_reap().  Cache_reap()
is run every few seconds independently of memory pressure.

Could we remove the whole thing?  It's only used by three slabs anyway and
I cannot find a reason for having this option.

There is an additional problem with SLAB_NO_REAP.  If it is set, the
recovery of objects from alien caches is switched off.  Objects not freed
on the same node where they were initially allocated will only be reused if
a certain number of objects accumulates from one alien node (not very
likely) or if the cache is explicitly shrunk.  (Strangely, __cache_shrink
does not check for SLAB_NO_REAP.)

Getting rid of SLAB_NO_REAP fixes the problems with alien cache freeing.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 5c25749..2423550 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -170,12 +170,12 @@
 #if DEBUG
 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
-			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
+			 SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU)
 #else
-# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU)
@@ -662,7 +662,6 @@ static struct kmem_cache cache_cache = {
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
 	.buffer_size = sizeof(struct kmem_cache),
-	.flags = SLAB_NO_REAP,
 	.name = "kmem_cache",
 #if DEBUG
 	.obj_size = sizeof(struct kmem_cache),
@@ -1848,9 +1847,6 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
  * for buffer overruns.
  *
- * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
- * memory pressure.
- *
  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
@@ -3584,10 +3580,6 @@ static void cache_reap(void *unused)
 		struct slab *slabp;
 
 		searchp = list_entry(walk, struct kmem_cache, next);
-
-		if (searchp->flags & SLAB_NO_REAP)
-			goto next;
-
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
@@ -3635,7 +3627,6 @@ static void cache_reap(void *unused)
 		} while (--tofree > 0);
 next_unlock:
 		spin_unlock_irq(&l3->list_lock);
-next:
 		cond_resched();
 	}
 	check_irq_on();
-- 
cgit v1.1


From a07fa3944bf924881450884224cbb2f1269cb9fa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:17 -0800
Subject: [PATCH] slab: use on_each_cpu()

Slab duplicates on_each_cpu().

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 2423550..f477acf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2119,23 +2119,6 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
-/*
- * Waits for all CPUs to execute func().
- */
-static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
-{
-	check_irq_on();
-	preempt_disable();
-	local_irq_disable();
-	func(arg);
-	local_irq_enable();
-
-	if (smp_call_function(func, arg, 1, 1))
-		BUG();
-
-	preempt_enable();
-}
-
 static void drain_array_locked(struct kmem_cache *cachep,
 			struct array_cache *ac, int force, int node);
 
@@ -2158,7 +2141,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 	struct kmem_list3 *l3;
 	int node;
 
-	smp_call_function_all_cpus(do_drain, cachep);
+	on_each_cpu(do_drain, cachep, 1, 1);
 	check_irq_on();
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
@@ -3449,7 +3432,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	}
 	new.cachep = cachep;
 
-	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+	on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
 
 	check_irq_on();
 	cachep->batchcount = batchcount;
-- 
cgit v1.1


From 8695949a1d7c99e039595db00af8e0fe4722307d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:18 -0800
Subject: [PATCH] Thin out scan_control: remove nr_to_scan and priority

Make nr_to_scan and priority parameters instead of putting them into
scan_control.  This allows various small optimizations and IMHO makes the
code easier to read.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 59 +++++++++++++++++++++++++----------------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e21bab4..f7c4f37 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,9 +52,6 @@ typedef enum {
 } pageout_t;
 
 struct scan_control {
-	/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
-	unsigned long nr_to_scan;
-
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
@@ -63,9 +60,6 @@ struct scan_control {
 
 	unsigned long nr_mapped;	/* From page_state */
 
-	/* Ask shrink_caches, or shrink_zone to scan at this priority */
-	unsigned int priority;
-
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1112,11 +1106,10 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
-	int max_scan = sc->nr_to_scan;
 
 	pagevec_init(&pvec, 1);
 
@@ -1192,12 +1185,11 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
 {
 	int pgmoved;
 	int pgdeactivate = 0;
 	int pgscanned;
-	int nr_pages = sc->nr_to_scan;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
@@ -1332,10 +1324,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static void
-shrink_zone(struct zone *zone, struct scan_control *sc)
+shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
+	unsigned long nr_to_scan;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -1343,14 +1336,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
 	 */
-	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
 	nr_active = zone->nr_scan_active;
 	if (nr_active >= sc->swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
 
-	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
 	if (nr_inactive >= sc->swap_cluster_max)
 		zone->nr_scan_inactive = 0;
@@ -1359,17 +1352,17 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
-			sc->nr_to_scan = min(nr_active,
+			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
-			nr_active -= sc->nr_to_scan;
-			refill_inactive_zone(zone, sc);
+			nr_active -= nr_to_scan;
+			refill_inactive_zone(nr_to_scan, zone, sc);
 		}
 
 		if (nr_inactive) {
-			sc->nr_to_scan = min(nr_inactive,
+			nr_to_scan = min(nr_inactive,
 					(unsigned long)sc->swap_cluster_max);
-			nr_inactive -= sc->nr_to_scan;
-			shrink_cache(zone, sc);
+			nr_inactive -= nr_to_scan;
+			shrink_cache(nr_to_scan, zone, sc);
 		}
 	}
 
@@ -1395,7 +1388,7 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
  * scan then give up on it.
  */
 static void
-shrink_caches(struct zone **zones, struct scan_control *sc)
+shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
 {
 	int i;
 
@@ -1408,14 +1401,14 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
-		zone->temp_priority = sc->priority;
-		if (zone->prev_priority > sc->priority)
-			zone->prev_priority = sc->priority;
+		zone->temp_priority = priority;
+		if (zone->prev_priority > priority)
+			zone->prev_priority = priority;
 
-		if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		shrink_zone(zone, sc);
+		shrink_zone(priority, zone, sc);
 	}
 }
  
@@ -1462,11 +1455,10 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		sc.nr_reclaimed = 0;
-		sc.priority = priority;
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 		if (!priority)
 			disable_swap_token();
-		shrink_caches(zones, &sc);
+		shrink_caches(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1629,9 +1621,8 @@ scan:
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
 			sc.nr_reclaimed = 0;
-			sc.priority = priority;
 			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-			shrink_zone(zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
@@ -1886,6 +1877,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc;
 	cpumask_t mask;
 	int node_id;
+	int priority;
 
 	if (time_before(jiffies,
 		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
@@ -1906,7 +1898,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
 	sc.nr_scanned = 0;
 	sc.nr_reclaimed = 0;
-	sc.priority = ZONE_RECLAIM_PRIORITY + 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 	sc.gfp_mask = gfp_mask;
 
@@ -1932,11 +1923,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * Free memory by calling shrink zone with increasing priorities
 	 * until we have enough memory freed.
 	 */
+	priority = ZONE_RECLAIM_PRIORITY;
 	do {
-		sc.priority--;
-		shrink_zone(zone, &sc);
-
-	} while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+		shrink_zone(priority, zone, &sc);
+		priority--;
+	} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 
 	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
-- 
cgit v1.1


From 179e96395b1f01e95ebe1ff5ef306b810dbbd147 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:18 -0800
Subject: [PATCH] vmscan: scan_control cleanup

Initialise as much of scan_control as possible at the declaration site.  This
tidies things up a bit and assures us that all unmentioned fields are zeroed
out.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 108 ++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 46 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7c4f37..5feef4d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1431,13 +1431,14 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int ret = 0;
 	int total_scanned = 0, total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
 	unsigned long lru_pages = 0;
 	int i;
-
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+	};
 
 	inc_page_state(allocstall);
 
@@ -1455,7 +1456,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		sc.nr_reclaimed = 0;
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 		if (!priority)
 			disable_swap_token();
 		shrink_caches(priority, zones, &sc);
@@ -1478,7 +1478,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+		if (total_scanned > sc.swap_cluster_max +
+					sc.swap_cluster_max / 2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 			sc.may_writepage = 1;
 		}
@@ -1532,14 +1533,16 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
 	int i;
 	int total_scanned, total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 1,
+		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+	};
 
 loop_again:
 	total_scanned = 0;
 	total_reclaimed = 0;
-	sc.gfp_mask = GFP_KERNEL;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
+	sc.may_writepage = !laptop_mode,
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1621,7 +1624,6 @@ scan:
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
 			sc.nr_reclaimed = 0;
-			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
 			shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1869,46 +1871,21 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
 /*
  * Try to free up some pages from this zone through reclaim.
  */
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	int nr_pages;
+	const int nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	struct scan_control sc;
-	cpumask_t mask;
-	int node_id;
 	int priority;
-
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
-			return 0;
-
-	if (!(gfp_mask & __GFP_WAIT) ||
-		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0 ||
-		(p->flags & PF_MEMALLOC))
-			return 0;
-
-	node_id = zone->zone_pgdat->node_id;
-	mask = node_to_cpumask(node_id);
-	if (!cpus_empty(mask) && node_id != numa_node_id())
-		return 0;
-
-	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
-	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.gfp_mask = gfp_mask;
+	struct scan_control sc = {
+		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.nr_mapped = read_page_state(nr_mapped),
+		.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = gfp_mask,
+	};
 
 	disable_swap_token();
-
-	nr_pages = 1 << order;
-	if (nr_pages > SWAP_CLUSTER_MAX)
-		sc.swap_cluster_max = nr_pages;
-	else
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1949,5 +1926,44 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	return sc.nr_reclaimed >= nr_pages;
 }
+
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	cpumask_t mask;
+	int node_id;
+
+	/*
+	 * Do not reclaim if there was a recent unsuccessful attempt at zone
+	 * reclaim.  In that case we let allocations go off node for the
+	 * zone_reclaim_interval.  Otherwise we would scan for each off-node
+	 * page allocation.
+	 */
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+			return 0;
+
+	/*
+	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
+	 * not have reclaimable pages and if we should not delay the allocation
+	 * then do not scan.
+	 */
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(current->flags & PF_MEMALLOC))
+			return 0;
+
+	/*
+	 * Only run zone reclaim on the local zone or on zones that do not
+	 * have associated processors. This will favor the local processor
+	 * over remote processors and spread off node memory allocations
+	 * as wide as possible.
+	 */
+	node_id = zone->zone_pgdat->node_id;
+	mask = node_to_cpumask(node_id);
+	if (!cpus_empty(mask) && node_id != numa_node_id())
+		return 0;
+	return __zone_reclaim(zone, gfp_mask, order);
+}
 #endif
 
-- 
cgit v1.1


From 69e05944af39fc6c97b09380c8721e38433bd828 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:19 -0800
Subject: [PATCH] vmscan: use unsigned longs

Turn basically everything in vmscan.c into `unsigned long'.  This is to avoid
the possibility that some piece of code in there might decide to operate upon
more than 4G (or even 2G) of pages in one hit.

This might be silly, but we'll need it one day.

Cc: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 104 ++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 59 insertions(+), 45 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5feef4d..62cd7cd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -177,10 +177,11 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+			unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	int ret = 0;
+	unsigned long ret = 0;
 
 	if (scanned == 0)
 		scanned = SWAP_CLUSTER_MAX;
@@ -410,12 +411,13 @@ cannot_free:
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_list(struct list_head *page_list,
+				struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
-	int reclaimed = 0;
+	unsigned long reclaimed = 0;
 
 	cond_resched();
 
@@ -599,11 +601,11 @@ static inline void move_to_lru(struct page *page)
  *
  * returns the number of pages put back.
  */
-int putback_lru_pages(struct list_head *l)
+unsigned long putback_lru_pages(struct list_head *l)
 {
 	struct page *page;
 	struct page *page2;
-	int count = 0;
+	unsigned long count = 0;
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		move_to_lru(page);
@@ -848,11 +850,11 @@ EXPORT_SYMBOL(migrate_page);
  *
  * Return: Number of pages not migrated when "to" ran empty.
  */
-int migrate_pages(struct list_head *from, struct list_head *to,
+unsigned long migrate_pages(struct list_head *from, struct list_head *to,
 		  struct list_head *moved, struct list_head *failed)
 {
-	int retry;
-	int nr_failed = 0;
+	unsigned long retry;
+	unsigned long nr_failed = 0;
 	int pass = 0;
 	struct page *page;
 	struct page *page2;
@@ -1069,12 +1071,13 @@ int isolate_lru_page(struct page *page)
  *
  * returns how many pages were moved onto *@dst.
  */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
-			     struct list_head *dst, int *scanned)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+		struct list_head *src, struct list_head *dst,
+		unsigned long *scanned)
 {
-	int nr_taken = 0;
+	unsigned long nr_taken = 0;
 	struct page *page;
-	int scan = 0;
+	unsigned long scan = 0;
 
 	while (scan++ < nr_to_scan && !list_empty(src)) {
 		struct list_head *target;
@@ -1106,20 +1109,22 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
-static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
+static void shrink_cache(unsigned long max_scan, struct zone *zone,
+			struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
+	unsigned long nr_scanned = 0;
 
 	pagevec_init(&pvec, 1);
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	while (max_scan > 0) {
+	do {
 		struct page *page;
-		int nr_taken;
-		int nr_scan;
-		int nr_freed;
+		unsigned long nr_taken;
+		unsigned long nr_scan;
+		unsigned long nr_freed;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 					     &zone->inactive_list,
@@ -1131,7 +1136,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s
 		if (nr_taken == 0)
 			goto done;
 
-		max_scan -= nr_scan;
+		nr_scanned += nr_scan;
 		nr_freed = shrink_list(&page_list, sc);
 
 		local_irq_disable();
@@ -1161,7 +1166,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s
 				spin_lock_irq(&zone->lru_lock);
 			}
 		}
-  	}
+  	} while (nr_scanned < max_scan);
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
@@ -1185,11 +1190,12 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
+			struct scan_control *sc)
 {
-	int pgmoved;
+	unsigned long pgmoved;
 	int pgdeactivate = 0;
-	int pgscanned;
+	unsigned long pgscanned;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
@@ -1323,8 +1329,8 @@ refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void
-shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
+static void shrink_zone(int priority, struct zone *zone,
+			struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
@@ -1387,8 +1393,8 @@ shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void
-shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
+static void shrink_caches(int priority, struct zone **zones,
+				struct scan_control *sc)
 {
 	int i;
 
@@ -1425,11 +1431,12 @@ shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 {
 	int priority;
 	int ret = 0;
-	int total_scanned = 0, total_reclaimed = 0;
+	unsigned long total_scanned = 0;
+	unsigned long total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1525,13 +1532,15 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+				int order)
 {
-	int to_free = nr_pages;
+	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
-	int total_scanned, total_reclaimed;
+	unsigned long total_scanned;
+	unsigned long total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1776,22 +1785,23 @@ void wakeup_kswapd(struct zone *zone, int order)
  * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
  * pages.
  */
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
 {
 	pg_data_t *pgdat;
-	int nr_to_free = nr_pages;
-	int ret = 0;
+	unsigned long nr_to_free = nr_pages;
+	unsigned long ret = 0;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
 	for_each_pgdat(pgdat) {
-		int freed;
+		unsigned long freed;
+
 		freed = balance_pgdat(pgdat, nr_to_free, 0);
 		ret += freed;
 		nr_to_free -= freed;
-		if (nr_to_free <= 0)
+		if ((long)nr_to_free <= 0)
 			break;
 	}
 	current->reclaim_state = NULL;
@@ -1805,8 +1815,7 @@ int shrink_all_memory(int nr_pages)
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
 static int __devinit cpu_callback(struct notifier_block *nfb,
-				  unsigned long action,
-				  void *hcpu)
+				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
 	cpumask_t mask;
@@ -1826,10 +1835,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
+
 	swap_setup();
-	for_each_pgdat(pgdat)
-		pgdat->kswapd
-		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+	for_each_pgdat(pgdat) {
+		pid_t pid;
+
+		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+		BUG_ON(pid < 0);
+		pgdat->kswapd = find_task_by_pid(pid);
+	}
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
@@ -1873,7 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	const int nr_pages = 1 << order;
+	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
@@ -1881,7 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.nr_mapped = read_page_state(nr_mapped),
-		.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX),
+		.swap_cluster_max = max_t(unsigned long, nr_pages,
+					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 	};
 
@@ -1966,4 +1981,3 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return __zone_reclaim(zone, gfp_mask, order);
 }
 #endif
-
-- 
cgit v1.1


From 05ff51376f01fd8837946a4f8144a84f6cc71c19 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:20 -0800
Subject: [PATCH] vmscan return nr_reclaimed

Change all the vmscan functions to return the number-of-reclaimed pages and
remove scan_control.nr_reclaimed.

Saves ten-odd bytes of text and makes things clearer and more consistent.

The patch also changes the behaviour of zone_reclaim() when it falls back to slab shrinking.  Christoph says

  "Setting this to one means that we will rescan and shrink the slab for
  each allocation if we are out of zone memory and RECLAIM_SLAB is set.  Plus
  if we do an order 0 allocation we do not go off node as intended.

  "We better set this to zero.  This means the allocation will go offnode
  despite us having potentially freed lots of memory on the zone.  Future
  allocations can then again be done from this zone."

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 77 ++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62cd7cd..8f6ad13 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,9 +55,6 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
-	/* Incremented by the number of pages reclaimed */
-	unsigned long nr_reclaimed;
-
 	unsigned long nr_mapped;	/* From page_state */
 
 	/* This context's GFP mask */
@@ -409,7 +406,7 @@ cannot_free:
 }
 
 /*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
+ * shrink_list return the number of reclaimed pages
  */
 static unsigned long shrink_list(struct list_head *page_list,
 				struct scan_control *sc)
@@ -417,7 +414,7 @@ static unsigned long shrink_list(struct list_head *page_list,
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
-	unsigned long reclaimed = 0;
+	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
 
@@ -557,7 +554,7 @@ static unsigned long shrink_list(struct list_head *page_list,
 
 free_it:
 		unlock_page(page);
-		reclaimed++;
+		nr_reclaimed++;
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
@@ -575,8 +572,7 @@ keep:
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
 	mod_page_state(pgactivate, pgactivate);
-	sc->nr_reclaimed += reclaimed;
-	return reclaimed;
+	return nr_reclaimed;
 }
 
 #ifdef CONFIG_MIGRATION
@@ -1107,14 +1103,15 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 }
 
 /*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * shrink_cache() return the number of reclaimed pages
  */
-static void shrink_cache(unsigned long max_scan, struct zone *zone,
-			struct scan_control *sc)
+static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone,
+				struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	unsigned long nr_scanned = 0;
+	unsigned long nr_reclaimed = 0;
 
 	pagevec_init(&pvec, 1);
 
@@ -1138,7 +1135,7 @@ static void shrink_cache(unsigned long max_scan, struct zone *zone,
 
 		nr_scanned += nr_scan;
 		nr_freed = shrink_list(&page_list, sc);
-
+		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
 			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1170,6 +1167,7 @@ static void shrink_cache(unsigned long max_scan, struct zone *zone,
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
+	return nr_reclaimed;
 }
 
 /*
@@ -1329,12 +1327,13 @@ refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_zone(int priority, struct zone *zone,
-			struct scan_control *sc)
+static unsigned long shrink_zone(int priority, struct zone *zone,
+				struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
+	unsigned long nr_reclaimed = 0;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -1368,13 +1367,14 @@ static void shrink_zone(int priority, struct zone *zone,
 			nr_to_scan = min(nr_inactive,
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= nr_to_scan;
-			shrink_cache(nr_to_scan, zone, sc);
+			nr_reclaimed += shrink_cache(nr_to_scan, zone, sc);
 		}
 	}
 
 	throttle_vm_writeout();
 
 	atomic_dec(&zone->reclaim_in_progress);
+	return nr_reclaimed;
 }
 
 /*
@@ -1393,9 +1393,10 @@ static void shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void shrink_caches(int priority, struct zone **zones,
-				struct scan_control *sc)
+static unsigned long shrink_caches(int priority, struct zone **zones,
+					struct scan_control *sc)
 {
+	unsigned long nr_reclaimed = 0;
 	int i;
 
 	for (i = 0; zones[i] != NULL; i++) {
@@ -1414,8 +1415,9 @@ static void shrink_caches(int priority, struct zone **zones,
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		shrink_zone(priority, zone, sc);
+		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
+	return nr_reclaimed;
 }
  
 /*
@@ -1436,7 +1438,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int priority;
 	int ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long total_reclaimed = 0;
+	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1462,18 +1464,16 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
-		sc.nr_reclaimed = 0;
 		if (!priority)
 			disable_swap_token();
-		shrink_caches(priority, zones, &sc);
+		nr_reclaimed += shrink_caches(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
 		total_scanned += sc.nr_scanned;
-		total_reclaimed += sc.nr_reclaimed;
-		if (total_reclaimed >= sc.swap_cluster_max) {
+		if (nr_reclaimed >= sc.swap_cluster_max) {
 			ret = 1;
 			goto out;
 		}
@@ -1540,7 +1540,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	int priority;
 	int i;
 	unsigned long total_scanned;
-	unsigned long total_reclaimed;
+	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1550,7 +1550,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 
 loop_again:
 	total_scanned = 0;
-	total_reclaimed = 0;
+	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode,
 	sc.nr_mapped = read_page_state(nr_mapped);
 
@@ -1632,13 +1632,11 @@ scan:
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
-			sc.nr_reclaimed = 0;
-			shrink_zone(priority, zone, &sc);
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-			total_reclaimed += sc.nr_reclaimed;
+			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
@@ -1651,10 +1649,10 @@ scan:
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > total_reclaimed+total_reclaimed/2)
+			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > total_reclaimed)
+		if (nr_pages && to_free > nr_reclaimed)
 			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
@@ -1671,7 +1669,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
+		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
 			break;
 	}
 out:
@@ -1685,7 +1683,7 @@ out:
 		goto loop_again;
 	}
 
-	return total_reclaimed;
+	return nr_reclaimed;
 }
 
 /*
@@ -1891,6 +1889,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
+	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -1917,11 +1916,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	priority = ZONE_RECLAIM_PRIORITY;
 	do {
-		shrink_zone(priority, zone, &sc);
+		nr_reclaimed += shrink_zone(priority, zone, &sc);
 		priority--;
-	} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+	} while (priority >= 0 && nr_reclaimed < nr_pages);
 
-	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
 		 * shrink_slab does not currently allow us to determine
 		 * how many pages were freed in the zone. So we just
@@ -1936,10 +1935,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
-	if (sc.nr_reclaimed == 0)
+	if (nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	return sc.nr_reclaimed >= nr_pages;
+	return nr_reclaimed >= nr_pages;
 }
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
-- 
cgit v1.1


From 1742f19fa920cdd6905f0db5898524dde22ab2a4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:21 -0800
Subject: [PATCH] vmscan: rename functions

We have:

	try_to_free_pages
	->shrink_caches(struct zone **zones, ..)
	  ->shrink_zone(struct zone *, ...)
	    ->shrink_cache(struct zone *, ...)
	      ->shrink_list(struct list_head *, ...)
	    ->refill_inactive_zone(struct zone *, ...)

which is fairly irrational.

Rename things so that we have

 	try_to_free_pages
 	->shrink_zones(struct zone **zones, ..)
 	  ->shrink_zone(struct zone *, ...)
 	    ->shrink_inactive_list(struct zone *, ...)
 	      ->shrink_page_list(struct list_head *, ...)
	    ->shrink_active_list(struct zone *, ...)

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8f6ad13..2d5d486 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -298,7 +298,8 @@ static void handle_write_error(struct address_space *mapping,
 }
 
 /*
- * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
  */
 static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
@@ -406,10 +407,10 @@ cannot_free:
 }
 
 /*
- * shrink_list return the number of reclaimed pages
+ * shrink_page_list() returns the number of reclaimed pages
  */
-static unsigned long shrink_list(struct list_head *page_list,
-				struct scan_control *sc)
+static unsigned long shrink_page_list(struct list_head *page_list,
+					struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
@@ -1103,10 +1104,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 }
 
 /*
- * shrink_cache() return the number of reclaimed pages
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
  */
-static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone,
-				struct scan_control *sc)
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+				struct zone *zone, struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -1134,7 +1136,7 @@ static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone,
 			goto done;
 
 		nr_scanned += nr_scan;
-		nr_freed = shrink_list(&page_list, sc);
+		nr_freed = shrink_page_list(&page_list, sc);
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
@@ -1187,9 +1189,8 @@ done:
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
-static void
-refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
-			struct scan_control *sc)
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+				struct scan_control *sc)
 {
 	unsigned long pgmoved;
 	int pgdeactivate = 0;
@@ -1360,14 +1361,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
 			nr_active -= nr_to_scan;
-			refill_inactive_zone(nr_to_scan, zone, sc);
+			shrink_active_list(nr_to_scan, zone, sc);
 		}
 
 		if (nr_inactive) {
 			nr_to_scan = min(nr_inactive,
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= nr_to_scan;
-			nr_reclaimed += shrink_cache(nr_to_scan, zone, sc);
+			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+								sc);
 		}
 	}
 
@@ -1393,7 +1395,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_caches(int priority, struct zone **zones,
+static unsigned long shrink_zones(int priority, struct zone **zones,
 					struct scan_control *sc)
 {
 	unsigned long nr_reclaimed = 0;
@@ -1466,7 +1468,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_caches(priority, zones, &sc);
+		nr_reclaimed += shrink_zones(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
-- 
cgit v1.1


From 7fb2d46d396b2491818f8e43b01049b3234e6c07 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:22 -0800
Subject: [PATCH] zone_reclaim: additional comments and cleanup

Add some comments to explain how zone reclaim works.  And it fixes the
following issues:

- PF_SWAPWRITE needs to be set for RECLAIM_SWAP to be able to write
  out pages to swap. Currently RECLAIM_SWAP may not do that.

- remove setting nr_reclaimed pages after slab reclaim since the slab shrinking
  code does not use that and the nr_reclaimed pages is just right for the
  intended follow up action.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2d5d486..c712b94 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1887,6 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
+	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
@@ -1924,9 +1925,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
-		 * shrink_slab does not currently allow us to determine
-		 * how many pages were freed in the zone. So we just
-		 * shake the slab and then go offnode for a single allocation.
+		 * shrink_slab() does not currently allow us to determine how
+		 * many pages were freed in this zone. So we just shake the slab
+		 * a bit and then go off node for this particular allocation
+		 * despite possibly having freed enough memory to allocate in
+		 * this zone.  If we freed local memory then the next
+		 * allocations will be local again.
 		 *
 		 * shrink_slab will free memory on all zones and may take
 		 * a long time.
@@ -1937,8 +1941,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
-	if (nr_reclaimed == 0)
+	if (nr_reclaimed == 0) {
+		/*
+		 * We were unable to reclaim enough pages to stay on node.  We
+		 * now allow off node accesses for a certain time period before
+		 * trying again to reclaim pages from the local zone.
+		 */
 		zone->last_unsuccessful_zone_reclaim = jiffies;
+	}
 
 	return nr_reclaimed >= nr_pages;
 }
-- 
cgit v1.1


From c9b02d970c385a253edb36c87643b0df706b50b4 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <wfg@mail.ustc.edu.cn>
Date: Wed, 22 Mar 2006 00:08:23 -0800
Subject: [PATCH] mm: isolate_lru_pages() scan count fix

In isolate_lru_pages(), *scanned reports one more scan because the scan
counter is increased one more time on exit of the while-loop.

Change the while-loop to for-loop to fix it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c712b94..85e95f4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1074,9 +1074,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
-	unsigned long scan = 0;
+	unsigned long scan;
 
-	while (scan++ < nr_to_scan && !list_empty(src)) {
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct list_head *target;
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
-- 
cgit v1.1


From fb8d14e172a29ba5ac69a73b61196be86fdfc3e1 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <wfg@mail.ustc.edu.cn>
Date: Wed, 22 Mar 2006 00:08:28 -0800
Subject: [PATCH] mm: shrink_inactive_list() nr_scan accounting fix

In shrink_inactive_list(), nr_scan is not accounted when nr_taken is 0.
But 0 pages taken does not mean 0 pages scanned.

Move the goto statement below the accounting code to fix it.

Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 85e95f4..486184d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1132,9 +1132,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
-		if (nr_taken == 0)
-			goto done;
-
 		nr_scanned += nr_scan;
 		nr_freed = shrink_page_list(&page_list, sc);
 		nr_reclaimed += nr_freed;
@@ -1146,6 +1143,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
 		__mod_page_state_zone(zone, pgsteal, nr_freed);
 
+		if (nr_taken == 0)
+			goto done;
+
 		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
@@ -1166,8 +1166,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			}
 		}
   	} while (nr_scanned < max_scan);
-	spin_unlock_irq(&zone->lru_lock);
+	spin_unlock(&zone->lru_lock);
 done:
+	local_irq_enable();
 	pagevec_release(&pvec);
 	return nr_reclaimed;
 }
-- 
cgit v1.1


From a6f563db09c54c80d80e9013182dc512a5e53d0f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 22 Mar 2006 00:08:29 -0800
Subject: [PATCH] remove VM_DONTCOPY bogosities

Now that it's madvisable, remove two pieces of VM_DONTCOPY bogosity:

1. There was and is no logical reason why VM_DONTCOPY should be in the
   list of flags which forbid vma merging (and those drivers which set
   it are also setting VM_IO, which itself forbids the merge).

2. It's hard to understand the purpose of the VM_HUGETLB, VM_DONTCOPY
   block in vm_stat_account: but never mind, it's under CONFIG_HUGETLB,
   which (unlike CONFIG_HUGETLB_PAGE or CONFIG_HUGETLBFS) has never been
   defined.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mmap.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2..0eb9894 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again:			remove_next = 1 + (end > next->vm_end);
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
  */
-#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
 
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
 	const unsigned long stack_flags
 		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
 
-#ifdef CONFIG_HUGETLB
-	if (flags & VM_HUGETLB) {
-		if (!(flags & VM_DONTCOPY))
-			mm->shared_vm += pages;
-		return;
-	}
-#endif /* CONFIG_HUGETLB */
-
 	if (file) {
 		mm->shared_vm += pages;
 		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
-- 
cgit v1.1


From 0f8053a509ceba4a077a50ea7b77039b5559b428 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:33 -0800
Subject: [PATCH] mm: make __put_page internal

Remove __put_page from outside the core mm/.  It is dangerous because it does
not handle compound pages nicely, and misses 1->0 transitions.  If a user
later appears that really needs the extra speed we can reevaluate.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c  |  2 ++
 mm/internal.h | 11 +++++++++++
 mm/vmscan.c   |  2 ++
 3 files changed, 15 insertions(+)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d4..e8f58f7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,8 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include "filemap.h"
+#include "internal.h"
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb..e3042db 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,6 +8,10 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
+
+#include <linux/mm.h>
 
 static inline void set_page_refs(struct page *page, int order)
 {
@@ -26,5 +30,12 @@ static inline void set_page_refs(struct page *page, int order)
 #endif /* CONFIG_MMU */
 }
 
+static inline void __put_page(struct page *page)
+{
+	atomic_dec(&page->_count);
+}
+
 extern void fastcall __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
+
+#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 486184d..3914a94 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,8 @@
 
 #include <linux/swapops.h>
 
+#include "internal.h"
+
 /* possible outcome of pageout() */
 typedef enum {
 	/* failed to write page out, page is locked */
-- 
cgit v1.1


From 84097518d1ecd2330f9488e4c2d09953a3340e74 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:34 -0800
Subject: [PATCH] mm: nommu use compound pages

Now that compound page handling is properly fixed in the VM, move nommu
over to using compound pages rather than rolling their own refcounting.

nommu vm page refcounting is broken anyway, but there is no need to have
divergent code in the core VM now, nor when it gets fixed.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: David Howells <dhowells@redhat.com>

(Needs testing, please).
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/internal.h   | 12 ------------
 mm/nommu.c      |  4 ++--
 mm/page_alloc.c |  7 -------
 mm/slab.c       |  9 ++++++++-
 4 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index e3042db..7bb3397 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -15,19 +15,7 @@
 
 static inline void set_page_refs(struct page *page, int order)
 {
-#ifdef CONFIG_MMU
 	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
 }
 
 static inline void __put_page(struct page *page)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f47..db45efa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 	/*
 	 * kmalloc doesn't like __GFP_HIGHMEM for some reason
 	 */
-	return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
+	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 }
 
 struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
 	 * - note that this may not return a page-aligned address if the object
 	 *   we're allocating is smaller than a page
 	 */
-	base = kmalloc(len, GFP_KERNEL);
+	base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
 	if (!base)
 		goto enomem;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7aa0181..e197818 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -422,11 +422,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		mutex_debug_check_no_locks_freed(page_address(page),
 						 PAGE_SIZE<<order);
 
-#ifndef CONFIG_MMU
-	for (i = 1 ; i < (1 << order) ; ++i)
-		__put_page(page + i);
-#endif
-
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
 	if (reserved)
@@ -746,7 +741,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
-#ifdef CONFIG_MMU
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -766,7 +760,6 @@ void split_page(struct page *page, unsigned int order)
 		set_page_count(page + i, 1);
 	}
 }
-#endif
 
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
diff --git a/mm/slab.c b/mm/slab.c
index f477acf..ff0ab77 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -590,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 
 static inline struct kmem_cache *page_get_cache(struct page *page)
 {
+	if (unlikely(PageCompound(page)))
+		page = (struct page *)page_private(page);
 	return (struct kmem_cache *)page->lru.next;
 }
 
@@ -600,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
 
 static inline struct slab *page_get_slab(struct page *page)
 {
+	if (unlikely(PageCompound(page)))
+		page = (struct page *)page_private(page);
 	return (struct slab *)page->lru.prev;
 }
 
@@ -2412,8 +2416,11 @@ static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
 	struct page *page;
 
 	/* Nasty!!!!!! I hope this is OK. */
-	i = 1 << cachep->gfporder;
 	page = virt_to_page(objp);
+
+	i = 1;
+	if (likely(!PageCompound(page)))
+		i <<= cachep->gfporder;
 	do {
 		page_set_cache(page, cachep);
 		page_set_slab(page, slabp);
-- 
cgit v1.1


From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:40 -0800
Subject: [PATCH] remove set_page_count() outside mm/

set_page_count usage outside mm/ is limited to setting the refcount to 1.
Remove set_page_count from outside mm/, and replace those users with
init_page_count() and set_page_refcounted().

This allows more debug checking, and tighter control on how code is allowed
to play around with page->_count.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c    |  5 +++--
 mm/internal.h   | 13 ++++++++++++-
 mm/page_alloc.c | 14 ++++++--------
 3 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39d49ec..20117a4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 
 #include <linux/hugetlb.h>
+#include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages;
@@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 		return NULL;
 	}
 	spin_unlock(&hugetlb_lock);
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page)
 				1 << PG_private | 1<< PG_writeback);
 	}
 	page[1].lru.next = NULL;
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 7bb3397..d20e3cc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,8 +13,19 @@
 
 #include <linux/mm.h>
 
-static inline void set_page_refs(struct page *page, int order)
+static inline void set_page_count(struct page *page, int v)
 {
+	atomic_set(&page->_count, v);
+}
+
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
+{
+	BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+	BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e197818..7f65b5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -442,7 +442,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 	if (order == 0) {
 		__ClearPageReserved(page);
 		set_page_count(page, 0);
-		set_page_refs(page, 0);
+		set_page_refcounted(page);
 		__free_page(page);
 	} else {
 		int loop;
@@ -457,7 +457,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 			set_page_count(p, 0);
 		}
 
-		set_page_refs(page, order);
+		set_page_refcounted(page);
 		__free_pages(page, order);
 	}
 }
@@ -525,7 +525,7 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
-	set_page_refs(page, order);
+	set_page_refcounted(page);
 	kernel_map_pages(page, 1 << order, 1);
 	return 0;
 }
@@ -755,10 +755,8 @@ void split_page(struct page *page, unsigned int order)
 
 	BUG_ON(PageCompound(page));
 	BUG_ON(!page_count(page));
-	for (i = 1; i < (1 << order); i++) {
-		BUG_ON(page_count(page + i));
-		set_page_count(page + i, 1);
-	}
+	for (i = 1; i < (1 << order); i++)
+		set_page_refcounted(page + i);
 }
 
 /*
@@ -1771,7 +1769,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 1);
+		init_page_count(page);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
-- 
cgit v1.1


From 17cf44064ae744f081309108fa67f0e942b10167 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:41 -0800
Subject: [PATCH] mm: cleanup prep_ stuff

Move the prep_ stuff into prep_new_page.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f65b5a..bdff858 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -212,6 +212,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	}
 }
 
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	int i;
+
+	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	for (i = 0; i < (1 << order); i++)
+		clear_highpage(page + i);
+}
+
 /*
  * function for dealing with page's order in buddy system.
  * zone->lock is already acquired when we use these.
@@ -496,7 +505,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -527,6 +536,13 @@ static int prep_new_page(struct page *page, int order)
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 	kernel_map_pages(page, 1 << order, 1);
+
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
+
 	return 0;
 }
 
@@ -732,15 +748,6 @@ void fastcall free_cold_page(struct page *page)
 	free_hot_cold_page(page, 1);
 }
 
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
-{
-	int i;
-
-	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
-	for(i = 0; i < (1 << order); i++)
-		clear_highpage(page + i);
-}
-
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -802,14 +809,8 @@ again:
 	put_cpu();
 
 	BUG_ON(bad_range(zone, page));
-	if (prep_new_page(page, order))
+	if (prep_new_page(page, order, gfp_flags))
 		goto again;
-
-	if (gfp_flags & __GFP_ZERO)
-		prep_zero_page(page, order, gfp_flags);
-
-	if (order && (gfp_flags & __GFP_COMP))
-		prep_compound_page(page, order);
 	return page;
 
 failed:
-- 
cgit v1.1


From 6626c5d53bc45c59798628677ba5606f02e371f3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:42 -0800
Subject: [PATCH] mm: prep_zero_page() in irq is a bug

prep_zero_page() uses KM_USER0 and hence may not be used from IRQ context, at
least for highmem pages.

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdff858..ed91684 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -217,6 +217,11 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 	int i;
 
 	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	/*
+	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
+	 * and __GFP_HIGHMEM from hard or soft interrupt context.
+	 */
+	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
 	for (i = 0; i < (1 << order); i++)
 		clear_highpage(page + i);
 }
-- 
cgit v1.1


From b7ab795b7bec9997d4fde39f249d52823d36d98d Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:42 -0800
Subject: [PATCH] mm: more CONFIG_DEBUG_VM

Put a few more checks under CONFIG_DEBUG_VM

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 12 +++---------
 mm/rmap.c   |  9 ++++-----
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 6af555c..71bc664 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -388,7 +388,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 {
 	unsigned long pfn = pte_pfn(pte);
 
-	if (vma->vm_flags & VM_PFNMAP) {
+	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
 		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
 		if (pfn == vma->vm_pgoff + off)
 			return NULL;
@@ -396,18 +396,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 			return NULL;
 	}
 
-	/*
-	 * Add some anal sanity checks for now. Eventually,
-	 * we should just do "return pfn_to_page(pfn)", but
-	 * in the meantime we check that we get a valid pfn,
-	 * and that the resulting page looks ok.
-	 *
-	 * Remove this test eventually!
-	 */
+#ifdef CONFIG_DEBUG_VM
 	if (unlikely(!pfn_valid(pfn))) {
 		print_bad_pte(vma, pte, addr);
 		return NULL;
 	}
+#endif
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page 
diff --git a/mm/rmap.c b/mm/rmap.c
index 134aef9..1963e26 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
 
 #include <asm/tlbflush.h>
 
-//#define RMAP_DEBUG /* can be enabled only for debugging */
-
 struct kmem_cache *anon_vma_cachep;
 
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
 {
-#ifdef RMAP_DEBUG
+#ifdef CONFIG_DEBUG_VM
 	struct anon_vma *anon_vma = find_vma->anon_vma;
 	struct vm_area_struct *vma;
 	unsigned int mapcount = 0;
@@ -551,13 +549,14 @@ void page_add_file_rmap(struct page *page)
 void page_remove_rmap(struct page *page)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
-		if (page_mapcount(page) < 0) {
+#ifdef CONFIG_DEBUG_VM
+		if (unlikely(page_mapcount(page) < 0)) {
 			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
 			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
 		}
-
+#endif
 		BUG_ON(page_mapcount(page) < 0);
 		/*
 		 * It would be tidy to reset the PageAnon mapping here,
-- 
cgit v1.1


From 6e5ef1a96e6e3b123da56292bc35017c8c401491 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:45 -0800
Subject: [PATCH] vmscan: remove obsolete checks from shrink_list() and fix
 unlikely in refill_inactive_zone()

As suggested by Marcelo:

1. The optimization introduced recently for not calling
   page_referenced() during zone reclaim makes two additional checks in
   shrink_list unnecessary.

2. The if (unlikely(sc->may_swap)) in refill_inactive_zone is optimized
   for the zone_reclaim case.  However, most people's systems only do swap.
   Undo that.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Marcelo Tosatti <marcelo.tosatti@cyclades.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3914a94..f713e9f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -460,12 +460,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!sc->may_swap)
-				goto keep_locked;
+		if (PageAnon(page) && !PageSwapCache(page))
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
-		}
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
@@ -477,12 +474,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			/*
-			 * No unmapping if we do not swap
-			 */
-			if (!sc->may_swap)
-				goto keep_locked;
-
 			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
 				goto activate_locked;
@@ -1205,7 +1196,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
 
-	if (unlikely(sc->may_swap)) {
+	if (sc->may_swap) {
 		long mapped_ratio;
 		long distress;
 		long swap_tendency;
-- 
cgit v1.1


From d15c023b44e5d323f1f4130b85d29f08e43433b1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 22 Mar 2006 00:08:46 -0800
Subject: [PATCH] shmem: inline to avoid warning

shmem.c was named and shamed in Jesper's "Building 100 kernels" warnings:
shmem_parse_mpol is only used when CONFIG_TMPFS parses mount options; and
only called from that one site, so mark it inline like its non-NUMA stub.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index f523a15..37eaf42e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
 }
 
 #ifdef CONFIG_NUMA
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
 {
 	char *nodelist = strchr(value, ':');
 	int err = 1;
-- 
cgit v1.1


From a564da3964db3256069190c2ae95069143ac37fb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 22 Mar 2006 00:08:47 -0800
Subject: [PATCH] readahead: ->prev_page can overrun the ahead window

If get_next_ra_size() does not grow fast enough, ->prev_page can overrun
the ahead window.  This means the caller will read the pages from
->ahead_start + ->ahead_size to ->prev_page synchronously.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/readahead.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 8d6eeaa..57557e2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
 	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
 
+static inline void reset_ahead_window(struct file_ra_state *ra)
+{
+	/*
+	 * ... but preserve ahead_start + ahead_size value,
+	 * see 'recheck:' label in page_cache_readahead().
+	 * Note: We never use ->ahead_size as rvalue without
+	 * checking ->ahead_start != 0 first.
+	 */
+	ra->ahead_size += ra->ahead_start;
+	ra->ahead_start = 0;
+}
+
 static inline void ra_off(struct file_ra_state *ra)
 {
 	ra->start = 0;
 	ra->flags = 0;
 	ra->size = 0;
-	ra->ahead_start = 0;
-	ra->ahead_size = 0;
+	reset_ahead_window(ra);
 	return;
 }
 
@@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
 		 * congestion.  The ahead window will any way be closed
 		 * in case we failed due to excessive page cache hits.
 		 */
-		ra->ahead_start = 0;
-		ra->ahead_size = 0;
+		reset_ahead_window(ra);
 	}
 
 	return ret;
@@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
 	 * If we get here we are doing sequential IO and this was not the first
 	 * occurence (ie we have an existing window)
 	 */
-
 	if (ra->ahead_start == 0) {	 /* no ahead window yet */
 		if (!make_ahead_window(mapping, filp, ra, 0))
-			goto out;
+			goto recheck;
 	}
+
 	/*
 	 * Already have an ahead window, check if we crossed into it.
 	 * If so, shift windows and issue a new ahead window.
@@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
 		ra->start = ra->ahead_start;
 		ra->size = ra->ahead_size;
 		make_ahead_window(mapping, filp, ra, 0);
+recheck:
+		/* prev_page shouldn't overrun the ahead window */
+		ra->prev_page = min(ra->prev_page,
+			ra->ahead_start + ra->ahead_size - 1);
 	}
 
 out:
-- 
cgit v1.1


From aed75ff3caafce404d9be7f0c088716375be5279 Mon Sep 17 00:00:00 2001
From: Steven Pratt <slpratt@austin.ibm.com>
Date: Wed, 22 Mar 2006 00:08:48 -0800
Subject: [PATCH] readahead: fix initial window size calculation

The current get_init_ra_size is not optimal across different IO
sizes and max_readahead values.  Here is a quick summary of sizes computed
under current design and under the attached patch.  All of these assume 1st
IO at offset 0, or 1st detected sequential IO.

	32k max, 4k request

	old         new
	-----------------
	 8k        8k
	16k       16k
	32k       32k

	128k max, 4k request
	old         new
	-----------------
	32k         16k
	64k         32k
	128k        64k
	128k       128k

	128k max, 32k request
	old         new
	-----------------
	32k         64k    <-----
	64k        128k
	128k       128k

	512k max, 4k request
	old         new
	-----------------
	4k         32k     <----
	16k        64k
	64k       128k
	128k      256k
	512k      512k

Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/readahead.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 57557e2..301b36c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -83,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
 {
 	unsigned long newsize = roundup_pow_of_two(size);
 
-	if (newsize <= max / 64)
-		newsize = newsize * newsize;
+	if (newsize <= max / 32)
+		newsize = newsize * 4;
 	else if (newsize <= max / 4)
-		newsize = max / 4;
+		newsize = newsize * 2;
 	else
 		newsize = max;
 	return newsize;
-- 
cgit v1.1


From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Wed, 22 Mar 2006 00:08:50 -0800
Subject: [PATCH] Enable mprotect on huge pages

2.6.16-rc3 uses hugetlb on-demand paging, but it doesn't support hugetlb
mprotect.

From: David Gibson <david@gibson.dropbear.id.au>

  Remove a test from the mprotect() path which checks that the mprotect()ed
  range on a hugepage VMA is hugepage aligned (yes, really, the sense of
  is_aligned_hugepage_range() is the opposite of what you'd guess :-/).

  In fact, we don't need this test.  If the given addresses match the
  beginning/end of a hugepage VMA they must already be suitably aligned.  If
  they don't, then mprotect_fixup() will attempt to split the VMA.  The very
  first test in split_vma() will check for a badly aligned address on a
  hugepage VMA and return -EINVAL if necessary.

From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>

  On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE.  The
  identity of hugetlb pte is lost when changing page protection via mprotect.
  A page fault that occurs later will trigger a bug check in huge_pte_alloc().

  The fix is to always make new pte a hugetlb pte and also to clean up
  legacy code where _PAGE_PRESENT is forced on in the pre-faulting days.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c  | 29 +++++++++++++++++++++++++++++
 mm/mprotect.c | 12 +++++-------
 2 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 20117a4..783098f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return i;
 }
+
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = address;
+	pte_t *ptep;
+	pte_t pte;
+
+	BUG_ON(address >= end);
+	flush_cache_range(vma, address, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (; address < end; address += HPAGE_SIZE) {
+		ptep = huge_pte_offset(mm, address);
+		if (!ptep)
+			continue;
+		if (!pte_none(*ptep)) {
+			pte = huge_ptep_get_and_clear(mm, address, ptep);
+			pte = pte_mkhuge(pte_modify(pte, newprot));
+			set_huge_pte_at(mm, address, ptep, pte);
+			lazy_mmu_prot_update(pte);
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	flush_tlb_range(vma, start, end);
+}
+
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b857..4c14d42 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
 				return -ENOMEM;
@@ -166,7 +166,10 @@ success:
 	 */
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
-	change_protection(vma, start, end, newprot);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_change_protection(vma, start, end, newprot);
+	else
+		change_protection(vma, start, end, newprot);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
-		if (is_vm_hugetlb_page(vma)) {
-			error = -EACCES;
-			goto out;
-		}
-
 		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
-- 
cgit v1.1


From 79ac6ba40eb8d70f0d204e98ae9b63280ad1018c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:51 -0800
Subject: [PATCH] hugepage: Small fixes to hugepage clear/copy path

Move the loops used in mm/hugetlb.c to clear and copy hugepages to their
own functions for clarity.  As we do so, we add some checks of need_resched
- we are, after all, copying megabytes of memory here.  We also add
might_sleep() accordingly.  We generally dropped locks around the clear and
copy already, but not everyone has PREEMPT enabled, so we should still be
checking explicitly.

For this to work, we need to remove the clear_huge_page() from
alloc_huge_page(), which is called with the page_table_lock held in the COW
path.  We move the clear_huge_page() to just after the alloc_huge_page() in
the hugepage no-page path.  In the COW path, the new page is about to be
copied over, so clearing it was just a waste of time anyway.  So as a side
effect we also fix the fact that we held the page_table_lock for far too
long in this path by calling alloc_huge_page() under it.

It causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 783098f..41b1038 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,29 @@ static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr);
+	}
+}
+
+static void copy_huge_page(struct page *dst, struct page *src,
+			   unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+	}
+}
+
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
@@ -98,7 +121,6 @@ void free_huge_page(struct page *page)
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
-	int i;
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page(vma, addr);
@@ -108,8 +130,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_user_highpage(&page[i], addr);
 	return page;
 }
 
@@ -367,7 +387,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte)
 {
 	struct page *old_page, *new_page;
-	int i, avoidcopy;
+	int avoidcopy;
 
 	old_page = pte_page(pte);
 
@@ -388,9 +408,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
-		copy_user_highpage(new_page + i, old_page + i,
-				   address + i*PAGE_SIZE);
+	copy_huge_page(new_page, old_page, address);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -435,6 +453,7 @@ retry:
 			ret = VM_FAULT_OOM;
 			goto out;
 		}
+		clear_huge_page(page, address);
 
 		if (vma->vm_flags & VM_SHARED) {
 			int err;
-- 
cgit v1.1


From 3935baa9bcda3ccaee4f7849f5157d316e34412e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:53 -0800
Subject: [PATCH] hugepage: serialize hugepage allocation and instantiation

Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache.  When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage.  However, since the number of hugepages in the system
is strictly limited, and it's usual to want to use all of them, this can
still lead to spurious allocation failures.

For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage.  One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.

The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache.  It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us.  Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.

This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.

Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather than processes getting OOM killed by the VM.  The
dodgy heuristic tests in fs/hugetlbfs/inode.c for whether there's enough
hugepage space aren't protected by the new mutex, and it would be ugly to
make them so, so there's still a race there.  Another patch, replacing those
tests with something saner for this reason as well as others, is coming...

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 41b1038..d5987a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
+#include <linux/mutex.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -26,6 +27,10 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
+static DEFINE_SPINLOCK(hugetlb_lock);
 
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
@@ -50,11 +55,6 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
-/*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
- */
-static DEFINE_SPINLOCK(hugetlb_lock);
-
 static void enqueue_huge_page(struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -508,14 +508,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	/*
+	 * Serialize hugepage allocation and instantiation, so that we don't
+	 * get spurious allocation failures if two CPUs race to instantiate
+	 * the same page in the page cache.
+	 */
+	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = *ptep;
-	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+	if (pte_none(entry)) {
+		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+		mutex_unlock(&hugetlb_instantiation_mutex);
+		return ret;
+	}
 
 	ret = VM_FAULT_MINOR;
 
@@ -525,6 +535,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (write_access && !pte_write(entry))
 			ret = hugetlb_cow(mm, vma, address, ptep, entry);
 	spin_unlock(&mm->page_table_lock);
+	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
 }
-- 
cgit v1.1


From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:55 -0800
Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes

These days, hugepages are demand-allocated at first fault time.  There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.

A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations.  In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available.  In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.

The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instantiated.  MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL.  MAP_PRIVATE mappings can still
trigger an OOM.  (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)

This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.

This patch causes no regressions on the libhugetlbfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 126 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a8..27fad5d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
 
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
+	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
 	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
 	return page;
+
+ fail:
+	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
+
+	return ret;
+}
+
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
+	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
 }
 
 static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+		        "HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+		        reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-- 
cgit v1.1


From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:56 -0800
Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local

Originally, mm/hugetlb.c just handled the hugepage physical allocation path
and its {alloc,free}_huge_page() functions were used from the arch specific
hugepage code.  These days those functions are only used within mm/hugetlb.c
itself.  Therefore, this patch makes them static and removes their
prototypes from hugetlb.h.  This requires a small rearrangement of code in
mm/hugetlb.c to avoid a forward declaration.

This patch causes no regressions on the libhugetlbfs testsuite (ppc64,
POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27fad5d..075877b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
 static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
@@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void)
 	return 0;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
-	spin_unlock(&hugetlb_lock);
-}
-
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
-- 
cgit v1.1


From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:57 -0800
Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables()

free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs.  However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma.  is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned.  So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.

At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range().  On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.

Nonetheless this should be fixed.  We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().

This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64).  We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64.  Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range().  Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.

This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 71bc664..f6e3be9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
 
-		if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
 		} else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 			 * Optimization: gather nearby vmas into one call down
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			  && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
-							HPAGE_SIZE)) {
+			       && !is_vm_hugetlb_page(vma)) {
 				vma = next;
 				next = vma->vm_next;
 				anon_vma_unlink(vma);
-- 
cgit v1.1


From 4866920b93fd7d5b520278c3c76e6f4d5a352d81 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:58 -0800
Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() harder

Turns out the hugepage logic in free_pgtables() was doubly broken.  The
loop coalescing multiple normal page VMAs into one call to free_pgd_range()
had an off by one error, which could mean it would coalesce one hugepage
VMA into the same bundle (checking 'vma' not 'next' in the loop).  I
transferred this bug into the new is_vm_hugetlb_page() based version.
Here's the fix.

This one didn't bite on powerpc previously for the same reason the
is_hugepage_only_range() problem didn't: powerpc's hugetlb_free_pgd_range()
is identical to free_pgd_range().  It didn't bite on ia64 because the
hugepage region is distant enough from any other region that the separated
PMD_SIZE distance test would always prevent coalescing the two together.

No libhugetlbfs testsuite regressions (ppc64, POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index f6e3be9..80c3fb3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -285,7 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 			 * Optimization: gather nearby vmas into one call down
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			       && !is_vm_hugetlb_page(vma)) {
+			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = vma->vm_next;
 				anon_vma_unlink(vma);
-- 
cgit v1.1


From d5d4b0aa4e1430d73050babba999365593bdb9d2 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 22 Mar 2006 00:09:03 -0800
Subject: [PATCH] optimize follow_hugetlb_page

follow_hugetlb_page() walks a range of user virtual addresses and fills in
an array of struct page * that is passed in through the argument list.  It
also takes a reference count via get_page().  For a compound page,
get_page() actually traverses back to the head page via the page_private()
macro and then adds a reference count to the head page.  Since we are doing
a virt to pte lookup, the kernel already has a struct page pointer to the
head page.  So instead of traversing into the small unit page struct and
then following a link back to the head page, optimize that by incrementing
the reference count directly on the head page.

The benefit is that we don't take a cache miss on accessing the page struct
for the corresponding user address and, more importantly, we don't pollute
the cache with a "not very useful" round trip of pointer chasing.  This adds
a moderate performance gain on an I/O intensive database transaction
workload.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 075877b..06699d8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -661,10 +661,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)
 {
-	unsigned long vpfn, vaddr = *position;
+	unsigned long pfn_offset;
+	unsigned long vaddr = *position;
 	int remainder = *length;
 
-	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -692,19 +692,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		if (pages) {
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-			get_page(page);
-			pages[i] = page;
-		}
+		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(*pte);
+same_page:
+		get_page(page);
+		if (pages)
+			pages[i] = page + pfn_offset;
 
 		if (vmas)
 			vmas[i] = vma;
 
 		vaddr += PAGE_SIZE;
-		++vpfn;
+		++pfn_offset;
 		--remainder;
 		++i;
+		if (vaddr < vma->vm_end && remainder &&
+				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+			/*
+			 * We use pfn_offset to avoid touching the pageframes
+			 * of this compound page.
+			 */
+			goto same_page;
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
-- 
cgit v1.1


From 248a0301e703cbf781aa02a91bcfc6da75870dd7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 22 Mar 2006 00:09:04 -0800
Subject: [PATCH] mm: make shrink_all_memory try harder

Make shrink_all_memory() repeat the attempts to free more memory if there
seem to be no pages to free.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f713e9f..548e023 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
+#include <linux/delay.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1784,11 +1785,13 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	pg_data_t *pgdat;
 	unsigned long nr_to_free = nr_pages;
 	unsigned long ret = 0;
+	unsigned retry = 2;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
+repeat:
 	for_each_pgdat(pgdat) {
 		unsigned long freed;
 
@@ -1798,6 +1801,10 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		if ((long)nr_to_free <= 0)
 			break;
 	}
+	if (retry-- && ret < nr_pages) {
+		blk_congestion_wait(WRITE, HZ/5);
+		goto repeat;
+	}
 	current->reclaim_state = NULL;
 	return ret;
 }
-- 
cgit v1.1


From 35386e3b0f876bf194982f48f027af0c216499ce Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:05 -0800
Subject: [PATCH] slab: cache_reap(): further reduction in interrupt holdoff

cache_reap takes the l3->list_lock (disabling interrupts) unconditionally
and then does a few checks and maybe does some cleanup.  This patch makes
cache_reap() only take the lock if there is work to do and then the lock is
taken and released for each cleaning action.

The checking of when to do the next reaping is done without any locking and
becomes racy.  Should not matter since reaping can also be skipped if the
slab mutex cannot be acquired.

The same is true for the touched processing.  If we get this wrong once in
a while then we will mistakenly clean or not clean the shared cache.  This
will impact performance slightly.

Note that the additional drain_array() function introduced here will fall
out in a subsequent patch since array cleaning will now be very similar
from all callers.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index ff0ab77..1845c01 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -292,13 +292,13 @@ struct kmem_list3 {
 	struct list_head slabs_full;
 	struct list_head slabs_free;
 	unsigned long free_objects;
-	unsigned long next_reap;
-	int free_touched;
 	unsigned int free_limit;
 	unsigned int colour_next;	/* Per-node cache coloring */
 	spinlock_t list_lock;
 	struct array_cache *shared;	/* shared per node */
 	struct array_cache **alien;	/* on other nodes */
+	unsigned long next_reap;	/* updated without locking */
+	int free_touched;		/* updated without locking */
 };
 
 /*
@@ -3539,6 +3539,22 @@ static void drain_array_locked(struct kmem_cache *cachep,
 	}
 }
 
+
+/*
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary.
+ */
+static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
+					 struct array_cache *ac)
+{
+	if (ac && ac->avail) {
+		spin_lock_irq(&l3->list_lock);
+		drain_array_locked(searchp, ac, 0,
+				   numa_node_id());
+		spin_unlock_irq(&l3->list_lock);
+	}
+}
+
 /**
  * cache_reap - Reclaim memory from caches.
  * @unused: unused parameter
@@ -3572,33 +3588,48 @@ static void cache_reap(void *unused)
 		searchp = list_entry(walk, struct kmem_cache, next);
 		check_irq_on();
 
+		/*
+		 * We only take the l3 lock if absolutely necessary and we
+		 * have established with reasonable certainty that
+		 * we can do some work if the lock was obtained.
+		 */
 		l3 = searchp->nodelists[numa_node_id()];
+
 		reap_alien(searchp, l3);
-		spin_lock_irq(&l3->list_lock);
 
-		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
-				   numa_node_id());
+		drain_array(searchp, l3, cpu_cache_get(searchp));
 
+		/*
+		 * These are racy checks but it does not matter
+		 * if we skip one check or scan twice.
+		 */
 		if (time_after(l3->next_reap, jiffies))
-			goto next_unlock;
+			goto next;
 
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		if (l3->shared)
-			drain_array_locked(searchp, l3->shared, 0,
-					   numa_node_id());
+		drain_array(searchp, l3, l3->shared);
 
 		if (l3->free_touched) {
 			l3->free_touched = 0;
-			goto next_unlock;
+			goto next;
 		}
 
 		tofree = (l3->free_limit + 5 * searchp->num - 1) /
 				(5 * searchp->num);
 		do {
+			/*
+			 * Do not lock if there are no free blocks.
+			 */
+			if (list_empty(&l3->slabs_free))
+				break;
+
+			spin_lock_irq(&l3->list_lock);
 			p = l3->slabs_free.next;
-			if (p == &(l3->slabs_free))
+			if (p == &(l3->slabs_free)) {
+				spin_unlock_irq(&l3->list_lock);
 				break;
+			}
 
 			slabp = list_entry(p, struct slab, list);
 			BUG_ON(slabp->inuse);
@@ -3613,10 +3644,8 @@ static void cache_reap(void *unused)
 			l3->free_objects -= searchp->num;
 			spin_unlock_irq(&l3->list_lock);
 			slab_destroy(searchp, slabp);
-			spin_lock_irq(&l3->list_lock);
 		} while (--tofree > 0);
-next_unlock:
-		spin_unlock_irq(&l3->list_lock);
+next:
 		cond_resched();
 	}
 	check_irq_on();
-- 
cgit v1.1


From aab2207cf8d9c343b6b5f0e4d27e1732f8618d14 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:06 -0800
Subject: [PATCH] slab: make drain_array more universal by adding more
 parameters

Add a parameter to drain_array to control the freeing of all objects and
then use drain_array() to replace instances of drain_array_locked with
drain_array.  Doing so will avoid taking locks in those locations if the
arrays are empty.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 1845c01..d73b38e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2126,6 +2126,10 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 static void drain_array_locked(struct kmem_cache *cachep,
 			struct array_cache *ac, int force, int node);
 
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+			struct array_cache *ac,
+			int force, int node);
+
 static void do_drain(void *arg)
 {
 	struct kmem_cache *cachep = arg;
@@ -2150,9 +2154,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
 		if (l3) {
-			spin_lock_irq(&l3->list_lock);
-			drain_array_locked(cachep, l3->shared, 1, node);
-			spin_unlock_irq(&l3->list_lock);
+			drain_array(cachep, l3, l3->shared, 1, node);
 			if (l3->alien)
 				drain_alien_cache(cachep, l3->alien);
 		}
@@ -3545,12 +3547,11 @@ static void drain_array_locked(struct kmem_cache *cachep,
  * necessary.
  */
 static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
-					 struct array_cache *ac)
+			 struct array_cache *ac, int force, int node)
 {
 	if (ac && ac->avail) {
 		spin_lock_irq(&l3->list_lock);
-		drain_array_locked(searchp, ac, 0,
-				   numa_node_id());
+		drain_array_locked(searchp, ac, force, node);
 		spin_unlock_irq(&l3->list_lock);
 	}
 }
@@ -3571,6 +3572,7 @@ static void cache_reap(void *unused)
 {
 	struct list_head *walk;
 	struct kmem_list3 *l3;
+	int node = numa_node_id();
 
 	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
@@ -3593,11 +3595,11 @@ static void cache_reap(void *unused)
 		 * have established with reasonable certainty that
 		 * we can do some work if the lock was obtained.
 		 */
-		l3 = searchp->nodelists[numa_node_id()];
+		l3 = searchp->nodelists[node];
 
 		reap_alien(searchp, l3);
 
-		drain_array(searchp, l3, cpu_cache_get(searchp));
+		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
 
 		/*
 		 * These are racy checks but it does not matter
@@ -3608,7 +3610,7 @@ static void cache_reap(void *unused)
 
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		drain_array(searchp, l3, l3->shared);
+		drain_array(searchp, l3, l3->shared, 0, node);
 
 		if (l3->free_touched) {
 			l3->free_touched = 0;
-- 
cgit v1.1


From 1b55253a7f95adc82eb20937b57b3e3e32ba65df Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:07 -0800
Subject: [PATCH] slab: remove drain_array_locked

Remove drain_array_locked and use that opportunity to limit the time the l3
lock is taken further.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index d73b38e..3274144 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2123,9 +2123,6 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
-static void drain_array_locked(struct kmem_cache *cachep,
-			struct array_cache *ac, int force, int node);
-
 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 			struct array_cache *ac,
 			int force, int node);
@@ -3522,40 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
 		       cachep->name, -err);
 }
 
-static void drain_array_locked(struct kmem_cache *cachep,
-				struct array_cache *ac, int force, int node)
+/*
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary.
+ */
+void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+			 struct array_cache *ac, int force, int node)
 {
 	int tofree;
 
-	check_spinlock_acquired_node(cachep, node);
+	if (!ac || !ac->avail)
+		return;
 	if (ac->touched && !force) {
 		ac->touched = 0;
 	} else if (ac->avail) {
 		tofree = force ? ac->avail : (ac->limit + 4) / 5;
 		if (tofree > ac->avail)
 			tofree = (ac->avail + 1) / 2;
+		spin_lock_irq(&l3->list_lock);
 		free_block(cachep, ac->entry, tofree, node);
+		spin_unlock_irq(&l3->list_lock);
 		ac->avail -= tofree;
 		memmove(ac->entry, &(ac->entry[tofree]),
 			sizeof(void *) * ac->avail);
 	}
 }
 
-
-/*
- * Drain an array if it contains any elements taking the l3 lock only if
- * necessary.
- */
-static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
-			 struct array_cache *ac, int force, int node)
-{
-	if (ac && ac->avail) {
-		spin_lock_irq(&l3->list_lock);
-		drain_array_locked(searchp, ac, force, node);
-		spin_unlock_irq(&l3->list_lock);
-	}
-}
-
 /**
  * cache_reap - Reclaim memory from caches.
  * @unused: unused parameter
-- 
cgit v1.1


From b18e7e654d7af741d2bf34a90dc34128d0217fea Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:07 -0800
Subject: [PATCH] slab: fix drain_array() so that it works correctly with the
 shared_array

The list_lock also protects the shared array and we call drain_array() with
the shared array.  Therefore we cannot go as far as I wanted to but have to
take the lock in a way so that it also protects the array_cache in
drain_array().

(Note: maybe we should make the array_cache locking more consistent?  I.e.
always take the array cache lock for shared arrays and disable interrupts
for the per cpu arrays?)

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3274144..6b691ecb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3521,7 +3521,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
 
 /*
  * Drain an array if it contains any elements taking the l3 lock only if
- * necessary.
+ * necessary. Note that the l3 listlock also protects the array_cache
+ * if drain_array() is used on the shared array.
  */
 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 			 struct array_cache *ac, int force, int node)
@@ -3532,16 +3533,18 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 		return;
 	if (ac->touched && !force) {
 		ac->touched = 0;
-	} else if (ac->avail) {
-		tofree = force ? ac->avail : (ac->limit + 4) / 5;
-		if (tofree > ac->avail)
-			tofree = (ac->avail + 1) / 2;
+	} else {
 		spin_lock_irq(&l3->list_lock);
-		free_block(cachep, ac->entry, tofree, node);
+		if (ac->avail) {
+			tofree = force ? ac->avail : (ac->limit + 4) / 5;
+			if (tofree > ac->avail)
+				tofree = (ac->avail + 1) / 2;
+			free_block(cachep, ac->entry, tofree, node);
+			ac->avail -= tofree;
+			memmove(ac->entry, &(ac->entry[tofree]),
+				sizeof(void *) * ac->avail);
+		}
 		spin_unlock_irq(&l3->list_lock);
-		ac->avail -= tofree;
-		memmove(ac->entry, &(ac->entry[tofree]),
-			sizeof(void *) * ac->avail);
 	}
 }
 
-- 
cgit v1.1


From 879336c3930ae9273ea1c45214cb8adae0ce494a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 22 Mar 2006 00:09:08 -0800
Subject: [PATCH] drain_node_pages: interrupt latency reduction / optimization

1. Only disable interrupts if there is actually something to free

2. Only dirty the pcp cacheline if we actually freed something.

3. Disable interrupts for each single pcp and not for cleaning
  all the pcps in all zones of a node.

drain_node_pages is called every 2 seconds from cache_reap. This
fix should avoid most disabling of interrupts.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed91684..b7f14a4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -603,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 /*
  * Called from the slab reaper to drain pagesets on a particular node that
  * belong to the currently executing processor.
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
  */
 void drain_node_pages(int nodeid)
 {
 	int i, z;
 	unsigned long flags;
 
-	local_irq_save(flags);
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
@@ -619,11 +620,14 @@ void drain_node_pages(int nodeid)
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
+			if (pcp->count) {
+				local_irq_save(flags);
+				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+				pcp->count = 0;
+				local_irq_restore(flags);
+			}
 		}
 	}
-	local_irq_restore(flags);
 }
 #endif
 
-- 
cgit v1.1


From 9b65ef59d42a56fa1358958ede77aaa5bac385a8 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Wed, 22 Mar 2006 00:09:09 -0800
Subject: [PATCH] fix swap cluster offset

When we've allocated SWAPFILE_CLUSTER pages, ->cluster_next should be the
first index of swap cluster.  But current code probably sets it to the wrong offset.

Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swapfile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d..365ed6f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 				last_in_cluster = offset + SWAPFILE_CLUSTER;
 			else if (offset == last_in_cluster) {
 				spin_lock(&swap_lock);
-				si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+				si->cluster_next = offset-SWAPFILE_CLUSTER+1;
 				goto cluster;
 			}
 			if (unlikely(--latency_ration < 0)) {
-- 
cgit v1.1


From fdb7cc59084ba7eef935e4e40aaaf538ee34c625 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 22 Mar 2006 00:09:10 -0800
Subject: [PATCH] mm: hugetlb alloc_fresh_huge_page bogus node loop fix

Fix bogus node loop in hugetlb.c alloc_fresh_huge_page(), which was
assuming that nodes are numbered contiguously from 0 to num_online_nodes().
Once the hotplug folks get this far, that will be false.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 06699d8..ebad6bb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -105,7 +105,9 @@ static int alloc_fresh_huge_page(void)
 	struct page *page;
 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % num_online_nodes();
+	nid = next_node(nid, node_online_map);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
 	if (page) {
 		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
-- 
cgit v1.1


From 442295c94bf650221af3ef20fc68fa3e93876818 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 22 Mar 2006 00:09:11 -0800
Subject: [PATCH] mm: slab cache interleave rotor fix

The alien cache rotor in mm/slab.c assumes that the first online node is
node 0.  Eventually for some archs, especially with hotplug, this will no
longer be true.

Fix the interleave rotor to handle the general case of node numbering.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 6b691ecb..1c8f5ee 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -830,7 +830,7 @@ static void init_reap_node(int cpu)
 
 	node = next_node(cpu_to_node(cpu), node_online_map);
 	if (node == MAX_NUMNODES)
-		node = 0;
+		node = first_node(node_online_map);
 
 	__get_cpu_var(reap_node) = node;
 }
-- 
cgit v1.1


From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 22 Mar 2006 00:09:12 -0800
Subject: [PATCH] page migration reorg

Centralize the page migration functions in anticipation of additional
tinkering.  Creates a new file mm/migrate.c

1. Extract buffer_migrate_page() from fs/buffer.c

2. Extract central migration code from vmscan.c

3. Extract some components from mempolicy.c

4. Export pageout() and remove_from_swap() from vmscan.c

5. Make it possible to configure NUMA systems without page migration
   and non-NUMA systems with page migration.

I had to do some #ifdeffing in mempolicy.c that may need a cleanup.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/Kconfig      |   6 +
 mm/Makefile     |   2 +
 mm/mempolicy.c  | 113 ++--------
 mm/migrate.c    | 655 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c |   1 +
 mm/vmscan.c     | 491 +-----------------------------------------
 6 files changed, 689 insertions(+), 579 deletions(-)
 create mode 100644 mm/migrate.c

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80a..bd80460 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
 # support for page migration
 #
 config MIGRATION
+	bool "Page migration"
 	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
 	depends on SWAP
+	help
+	  Allows the migration of the physical location of pages of processes
+	  while the virtual addresses are not changed. This is useful for
+	  example on NUMA systems to put pages nearer to the processors accessing
+	  the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa..f10c753 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 96195dc..e93cc74 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
 #include <linux/swap.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/migrate.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -95,9 +96,6 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
 
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	struct vm_area_struct *first, *vma, *prev;
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-		/* Must have swap device for migration */
-		if (nr_swap_pages <= 0)
-			return ERR_PTR(-ENODEV);
 
-		/*
-		 * Clear the LRU lists so pages can be isolated.
-		 * Note that pages may be moved off the LRU after we have
-		 * drained them. Those pages will fail to migrate like other
-		 * pages that may be busy.
-		 */
-		lru_add_drain_all();
+		err = migrate_prep();
+		if (err)
+			return ERR_PTR(err);
 	}
 
 	first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+#ifdef CONFIG_MIGRATION
 /*
  * page migration
  */
-
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
-		if (isolate_lru_page(page))
-			list_add_tail(&page->lru, pagelist);
-	}
-}
-
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest)
-{
-	LIST_HEAD(newlist);
-	LIST_HEAD(moved);
-	LIST_HEAD(failed);
-	int err = 0;
-	unsigned long offset = 0;
-	int nr_pages;
-	struct page *page;
-	struct list_head *p;
-
-redo:
-	nr_pages = 0;
-	list_for_each(p, pagelist) {
-		if (vma) {
-			/*
-			 * The address passed to alloc_page_vma is used to
-			 * generate the proper interleave behavior. We fake
-			 * the address here by an increasing offset in order
-			 * to get the proper distribution of pages.
-			 *
-			 * No decision has been made as to which page
-			 * a certain old page is moved to so we cannot
-			 * specify the correct address.
-			 */
-			page = alloc_page_vma(GFP_HIGHUSER, vma,
-					offset + vma->vm_start);
-			offset += PAGE_SIZE;
-		}
-		else
-			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-		if (!page) {
-			err = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&page->lru, &newlist);
-		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE)
-			break;
-	}
-	err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
-	putback_lru_pages(&moved);	/* Call release pages instead ?? */
-
-	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-		goto redo;
-out:
-	/* Return leftover allocated pages */
-	while (!list_empty(&newlist)) {
-		page = list_entry(newlist.next, struct page, lru);
-		list_del(&page->lru);
-		__free_page(page);
-	}
-	list_splice(&failed, pagelist);
-	if (err < 0)
-		return err;
-
-	/* Calculate number of leftover pages */
-	nr_pages = 0;
-	list_for_each(p, pagelist)
-		nr_pages++;
-	return nr_pages;
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+		isolate_lru_page(page, pagelist);
 }
 
 /*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
 	if (err < 0)
 		return err;
 	return busy;
+
 }
 
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	return -ENOSYS;
+}
+#endif
+
 long do_mbind(unsigned long start, unsigned long len,
 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
+
 	if (!list_empty(&pagelist))
 		putback_lru_pages(&pagelist);
 
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 0000000..09f6e4a
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
+/*
+ * Memory Migration functionality - linux/mm/migrate.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page(),
+					buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+#include "internal.h"
+
+/* The maximum number of pages to take off the LRU for migration */
+#define MIGRATE_CHUNK_SIZE 256
+
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * Isolate one page from the LRU lists. If successful put it onto
+ * the indicated list with elevated page count.
+ *
+ * Result:
+ *  -EBUSY: page not on LRU list
+ *  0: page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page, struct list_head *pagelist)
+{
+	int ret = -EBUSY;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page)) {
+			ret = 0;
+			get_page(page);
+			ClearPageLRU(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+			list_add_tail(&page->lru, pagelist);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+	return ret;
+}
+
+/*
+ * migrate_prep() needs to be called before we start compiling the list of
+ * pages to be migrated using isolate_lru_page(): it clears the LRU lists so
+ * that pages can be isolated for the following calls to migrate_pages().
+ */
+int migrate_prep(void)
+{
+	/* Must have swap device for migration */
+	if (nr_swap_pages <= 0)
+		return -ENODEV;
+
+	/*
+	 * Clear the LRU lists so pages can be isolated.
+	 * Note that pages may be moved off the LRU after we have
+	 * drained them. Those pages will fail to migrate like other
+	 * pages that may be busy.
+	 */
+	lru_add_drain_all();
+
+	return 0;
+}
+
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return -EAGAIN;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+  	 * processes from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> permanent failure */
+		return -EPERM;
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return -EAGAIN;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 1;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+ 	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	int rc;
+
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because to has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
+{
+	int retry;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
+		cond_resched();
+
+		rc = 0;
+		if (page_count(page) == 1)
+			/* page was freed from under us. So we are done. */
+			goto next;
+
+		if (to && list_empty(to))
+			break;
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
+		 */
+		rc = -EAGAIN;
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto next;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0) {
+			wait_on_page_writeback(page);
+		} else {
+			if (PageWriteback(page))
+				goto unlock_page;
+		}
+
+		/*
+		 * Anonymous pages must have swap cache references otherwise
+		 * the information contained in the page maps cannot be
+		 * preserved.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
+				rc = -ENOMEM;
+				goto unlock_page;
+			}
+		}
+
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
+		/*
+		 * Pages are properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+                }
+
+		/*
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+                }
+
+		/*
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
+		 */
+		if (!page_has_buffers(page) ||
+		    try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes with mapped pages simply
+		 * retry. There may be a lock held for some
+		 * buffers that may go away. Later
+		 * swap them out.
+		 */
+		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
+
+unlock_page:
+		unlock_page(page);
+
+next:
+		if (rc == -EAGAIN) {
+			retry++;
+		} else if (rc) {
+			/* Permanent failure */
+			list_move(&page->lru, failed);
+			nr_failed++;
+		} else {
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
+			list_move(&page->lru, moved);
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return nr_failed + retry;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist. Returns 0 on success; -EAGAIN or a callee's error code otherwise.
+ */
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct buffer_head *bh, *head;
+	int rc;
+
+	if (!mapping)
+		return -EAGAIN;
+
+	if (!page_has_buffers(page))
+		return migrate_page(newpage, page);
+
+	head = page_buffers(page);
+	/* 3 = nr_refs, the page reference count the callee checks for. */
+	rc = migrate_page_remove_references(newpage, page, 3);
+
+	if (rc)
+		return rc;
+	/* Take a reference on and lock every buffer on the b_this_page list. */
+	bh = head;
+	do {
+		get_bh(bh);
+		lock_buffer(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+	/* Move the private data and one page reference over to newpage. */
+	ClearPagePrivate(page);
+	set_page_private(newpage, page_private(page));
+	set_page_private(page, 0);
+	put_page(page);
+	get_page(newpage);
+	/* Re-point each buffer at newpage, passing its current bh_offset(). */
+	bh = head;
+	do {
+		set_bh_page(bh, newpage, bh_offset(bh));
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	SetPagePrivate(newpage);
+
+	migrate_page_copy(newpage, page);
+	/* Undo the earlier lock_buffer()/get_bh() on every buffer. */
+	bh = head;
+	do {
+		unlock_buffer(bh);
+ 		put_bh(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * New pages come from alloc_page_vma() if 'vma' is non-NULL, otherwise from
+ * node 'dest'. Returns the count of pages left on 'pagelist' or -errno.
+ */
+int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
+{
+	LIST_HEAD(newlist);
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int err = 0;
+	unsigned long offset = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
+	/* Allocate one target page per source page, one chunk per pass. */
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;	/* chunk now holds MIGRATE_CHUNK_SIZE + 1 pages */
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+	/* Start another chunk if every new page was used and pages remain. */
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);	/* failed pages rejoin caller's list */
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3..d7af296 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/migrate.h>
 
 #include <asm/pgtable.h>
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 548e023..fd572bb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,18 +42,6 @@
 
 #include "internal.h"
 
-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping,
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
-static int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page)
 {
 	if (!mapping)
 		return 0;		/* truncate got there first */
@@ -570,481 +558,6 @@ keep:
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_MIGRATION
-static inline void move_to_lru(struct page *page)
-{
-	list_del(&page->lru);
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
-	put_page(page);
-}
-
-/*
- * Add isolated pages on the list back to the LRU.
- *
- * returns the number of pages put back.
- */
-unsigned long putback_lru_pages(struct list_head *l)
-{
-	struct page *page;
-	struct page *page2;
-	unsigned long count = 0;
-
-	list_for_each_entry_safe(page, page2, l, lru) {
-		move_to_lru(page);
-		count++;
-	}
-	return count;
-}
-
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
-{
-	return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-
-	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
-			goto unlock_retry;
-
-	if (PageDirty(page)) {
-		/* Page is dirty, try to write it out here */
-		switch(pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			goto unlock_retry;
-
-		case PAGE_SUCCESS:
-			goto retry;
-
-		case PAGE_CLEAN:
-			; /* try to free the page below */
-		}
-	}
-
-	if (PagePrivate(page)) {
-		if (!try_to_release_page(page, GFP_KERNEL) ||
-		    (!mapping && page_count(page) == 1))
-			goto unlock_retry;
-	}
-
-	if (remove_mapping(mapping, page)) {
-		/* Success */
-		unlock_page(page);
-		return 0;
-	}
-
-unlock_retry:
-	unlock_page(page);
-
-retry:
-	return -EAGAIN;
-}
-EXPORT_SYMBOL(swap_page);
-
-/*
- * Page migration was first developed in the context of the memory hotplug
- * project. The main authors of the migration code are:
- *
- * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
- * Hirokazu Takahashi <taka@valinux.co.jp>
- * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
- */
-
-/*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
- */
-int migrate_page_remove_references(struct page *newpage,
-				struct page *page, int nr_refs)
-{
-	struct address_space *mapping = page_mapping(page);
-	struct page **radix_pointer;
-
-	/*
-	 * Avoid doing any of the following work if the page count
-	 * indicates that the page is in use or truncate has removed
-	 * the page.
-	 */
-	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return -EAGAIN;
-
-	/*
-	 * Establish swap ptes for anonymous pages or destroy pte
-	 * maps for files.
-	 *
-	 * In order to reestablish file backed mappings the fault handlers
-	 * will take the radix tree_lock which may then be used to stop
-  	 * processses from accessing this page until the new page is ready.
-	 *
-	 * A process accessing via a swap pte (an anonymous page) will take a
-	 * page_lock on the old page which will block the process until the
-	 * migration attempt is complete. At that time the PageSwapCache bit
-	 * will be examined. If the page was migrated then the PageSwapCache
-	 * bit will be clear and the operation to retrieve the page will be
-	 * retried which will find the new page in the radix tree. Then a new
-	 * direct mapping may be generated based on the radix tree contents.
-	 *
-	 * If the page was not migrated then the PageSwapCache bit
-	 * is still set and the operation may continue.
-	 */
-	if (try_to_unmap(page, 1) == SWAP_FAIL)
-		/* A vma has VM_LOCKED set -> Permanent failure */
-		return -EPERM;
-
-	/*
-	 * Give up if we were unable to remove all mappings.
-	 */
-	if (page_mapcount(page))
-		return -EAGAIN;
-
-	write_lock_irq(&mapping->tree_lock);
-
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
-
-	if (!page_mapping(page) || page_count(page) != nr_refs ||
-			*radix_pointer != page) {
-		write_unlock_irq(&mapping->tree_lock);
-		return -EAGAIN;
-	}
-
-	/*
-	 * Now we know that no one else is looking at the page.
-	 *
-	 * Certain minimal information about a page must be available
-	 * in order for other subsystems to properly handle the page if they
-	 * find it through the radix tree update before we are finished
-	 * copying the page.
-	 */
-	get_page(newpage);
-	newpage->index = page->index;
-	newpage->mapping = page->mapping;
-	if (PageSwapCache(page)) {
-		SetPageSwapCache(newpage);
-		set_page_private(newpage, page_private(page));
-	}
-
-	*radix_pointer = newpage;
-	__put_page(page);
-	write_unlock_irq(&mapping->tree_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(migrate_page_remove_references);
-
-/*
- * Copy the page to its new location
- */
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
-	copy_highpage(newpage, page);
-
-	if (PageError(page))
-		SetPageError(newpage);
-	if (PageReferenced(page))
-		SetPageReferenced(newpage);
-	if (PageUptodate(page))
-		SetPageUptodate(newpage);
-	if (PageActive(page))
-		SetPageActive(newpage);
-	if (PageChecked(page))
-		SetPageChecked(newpage);
-	if (PageMappedToDisk(page))
-		SetPageMappedToDisk(newpage);
-
-	if (PageDirty(page)) {
-		clear_page_dirty_for_io(page);
-		set_page_dirty(newpage);
- 	}
-
-	ClearPageSwapCache(page);
-	ClearPageActive(page);
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	page->mapping = NULL;
-
-	/*
-	 * If any waiters have accumulated on the new page then
-	 * wake them up.
-	 */
-	if (PageWriteback(newpage))
-		end_page_writeback(newpage);
-}
-EXPORT_SYMBOL(migrate_page_copy);
-
-/*
- * Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct page *newpage, struct page *page)
-{
-	int rc;
-
-	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
-
-	rc = migrate_page_remove_references(newpage, page, 2);
-
-	if (rc)
-		return rc;
-
-	migrate_page_copy(newpage, page);
-
-	/*
-	 * Remove auxiliary swap entries and replace
-	 * them with real ptes.
-	 *
-	 * Note that a real pte entry will allow processes that are not
-	 * waiting on the page lock to use the new page via the page tables
-	 * before the new page is unlocked.
-	 */
-	remove_from_swap(newpage);
-	return 0;
-}
-EXPORT_SYMBOL(migrate_page);
-
-/*
- * migrate_pages
- *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
- *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- *
- * Return: Number of pages not migrated when "to" ran empty.
- */
-unsigned long migrate_pages(struct list_head *from, struct list_head *to,
-		  struct list_head *moved, struct list_head *failed)
-{
-	unsigned long retry;
-	unsigned long nr_failed = 0;
-	int pass = 0;
-	struct page *page;
-	struct page *page2;
-	int swapwrite = current->flags & PF_SWAPWRITE;
-	int rc;
-
-	if (!swapwrite)
-		current->flags |= PF_SWAPWRITE;
-
-redo:
-	retry = 0;
-
-	list_for_each_entry_safe(page, page2, from, lru) {
-		struct page *newpage = NULL;
-		struct address_space *mapping;
-
-		cond_resched();
-
-		rc = 0;
-		if (page_count(page) == 1)
-			/* page was freed from under us. So we are done. */
-			goto next;
-
-		if (to && list_empty(to))
-			break;
-
-		/*
-		 * Skip locked pages during the first two passes to give the
-		 * functions holding the lock time to release the page. Later we
-		 * use lock_page() to have a higher chance of acquiring the
-		 * lock.
-		 */
-		rc = -EAGAIN;
-		if (pass > 2)
-			lock_page(page);
-		else
-			if (TestSetPageLocked(page))
-				goto next;
-
-		/*
-		 * Only wait on writeback if we have already done a pass where
-		 * we we may have triggered writeouts for lots of pages.
-		 */
-		if (pass > 0) {
-			wait_on_page_writeback(page);
-		} else {
-			if (PageWriteback(page))
-				goto unlock_page;
-		}
-
-		/*
-		 * Anonymous pages must have swap cache references otherwise
-		 * the information contained in the page maps cannot be
-		 * preserved.
-		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page, GFP_KERNEL)) {
-				rc = -ENOMEM;
-				goto unlock_page;
-			}
-		}
-
-		if (!to) {
-			rc = swap_page(page);
-			goto next;
-		}
-
-		newpage = lru_to_page(to);
-		lock_page(newpage);
-
-		/*
-		 * Pages are properly locked and writeback is complete.
-		 * Try to migrate the page.
-		 */
-		mapping = page_mapping(page);
-		if (!mapping)
-			goto unlock_both;
-
-		if (mapping->a_ops->migratepage) {
-			/*
-			 * Most pages have a mapping and most filesystems
-			 * should provide a migration function. Anonymous
-			 * pages are part of swap space which also has its
-			 * own migration function. This is the most common
-			 * path for page migration.
-			 */
-			rc = mapping->a_ops->migratepage(newpage, page);
-			goto unlock_both;
-                }
-
-		/*
-		 * Default handling if a filesystem does not provide
-		 * a migration function. We can only migrate clean
-		 * pages so try to write out any dirty pages first.
-		 */
-		if (PageDirty(page)) {
-			switch (pageout(page, mapping)) {
-			case PAGE_KEEP:
-			case PAGE_ACTIVATE:
-				goto unlock_both;
-
-			case PAGE_SUCCESS:
-				unlock_page(newpage);
-				goto next;
-
-			case PAGE_CLEAN:
-				; /* try to migrate the page below */
-			}
-                }
-
-		/*
-		 * Buffers are managed in a filesystem specific way.
-		 * We must have no buffers or drop them.
-		 */
-		if (!page_has_buffers(page) ||
-		    try_to_release_page(page, GFP_KERNEL)) {
-			rc = migrate_page(newpage, page);
-			goto unlock_both;
-		}
-
-		/*
-		 * On early passes with mapped pages simply
-		 * retry. There may be a lock held for some
-		 * buffers that may go away. Later
-		 * swap them out.
-		 */
-		if (pass > 4) {
-			/*
-			 * Persistently unable to drop buffers..... As a
-			 * measure of last resort we fall back to
-			 * swap_page().
-			 */
-			unlock_page(newpage);
-			newpage = NULL;
-			rc = swap_page(page);
-			goto next;
-		}
-
-unlock_both:
-		unlock_page(newpage);
-
-unlock_page:
-		unlock_page(page);
-
-next:
-		if (rc == -EAGAIN) {
-			retry++;
-		} else if (rc) {
-			/* Permanent failure */
-			list_move(&page->lru, failed);
-			nr_failed++;
-		} else {
-			if (newpage) {
-				/* Successful migration. Return page to LRU */
-				move_to_lru(newpage);
-			}
-			list_move(&page->lru, moved);
-		}
-	}
-	if (retry && pass++ < 10)
-		goto redo;
-
-	if (!swapwrite)
-		current->flags &= ~PF_SWAPWRITE;
-
-	return nr_failed + retry;
-}
-
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list with elevated refcount.
- *
- * Result:
- *  0 = page not on LRU list
- *  1 = page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page)
-{
-	int ret = 0;
-
-	if (PageLRU(page)) {
-		struct zone *zone = page_zone(page);
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page)) {
-			ret = 1;
-			get_page(page);
-			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-
-	return ret;
-}
-#endif
-
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
-- 
cgit v1.1